diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png index 0ebb2d1ae0..302b50124a 100755 Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt index dd24f8465a..444e901a40 100644 --- a/doc/src/Manual.txt +++ b/doc/src/Manual.txt @@ -1,7 +1,7 @@ LAMMPS Users Manual - + @@ -21,7 +21,7 @@

LAMMPS Documentation :c,h3 -19 May 2017 version :c,h4 +23 Jun 2017 version :c,h4 Version info: :h4 diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt index 8aa5bdec2e..0fbab732c8 100644 --- a/doc/src/Section_commands.txt +++ b/doc/src/Section_commands.txt @@ -964,7 +964,7 @@ KOKKOS, o = USER-OMP, t = OPT. "lj/expand (gko)"_pair_lj_expand.html, "lj/gromacs (gko)"_pair_gromacs.html, "lj/gromacs/coul/gromacs (ko)"_pair_gromacs.html, -"lj/long/coul/long (o)"_pair_lj_long.html, +"lj/long/coul/long (io)"_pair_lj_long.html, "lj/long/dipole/long"_pair_dipole.html, "lj/long/tip4p/long"_pair_lj_long.html, "lj/smooth (o)"_pair_lj_smooth.html, @@ -1073,7 +1073,7 @@ package"_Section_start.html#start_3. "table/rx"_pair_table_rx.html, "tersoff/table (o)"_pair_tersoff.html, "thole"_pair_thole.html, -"tip4p/long/soft (o)"_pair_lj_soft.html :tb(c=4,ea=c) +"tip4p/long/soft (o)"_pair_lj_soft.html :tb(c=4,ea=c) :line @@ -1225,7 +1225,7 @@ USER-OMP, t = OPT. "msm/cg (o)"_kspace_style.html, "pppm (go)"_kspace_style.html, "pppm/cg (o)"_kspace_style.html, -"pppm/disp"_kspace_style.html, +"pppm/disp (i)"_kspace_style.html, "pppm/disp/tip4p"_kspace_style.html, "pppm/stagger"_kspace_style.html, "pppm/tip4p (o)"_kspace_style.html :tb(c=4,ea=c) diff --git a/doc/src/Section_howto.txt b/doc/src/Section_howto.txt index 579cb68474..f2f2561af8 100644 --- a/doc/src/Section_howto.txt +++ b/doc/src/Section_howto.txt @@ -1938,7 +1938,7 @@ documentation in the src/library.cpp file for details, including which quantities can be queried by name: void *lammps_extract_global(void *, char *) -void lammps_extract_box(void *, double *, double *, +void lammps_extract_box(void *, double *, double *, double *, double *, double *, int *, int *) void *lammps_extract_atom(void *, char *) void *lammps_extract_compute(void *, char *, int, int) @@ -2682,14 +2682,14 @@ bond_coeff 2 25.724 0.0 :pre When running dynamics with the adiabatic core/shell model, the following issues should be considered. 
The relative motion of -the core and shell particles corresponds to the polarization, -hereby an instantaneous relaxation of the shells is approximated +the core and shell particles corresponds to the polarization, +hereby an instantaneous relaxation of the shells is approximated and a fast core/shell spring frequency ensures a nearly constant -internal kinetic energy during the simulation. +internal kinetic energy during the simulation. Thermostats can alter this polarization behaviour, by scaling the -internal kinetic energy, meaning the shell will not react freely to -its electrostatic environment. -Therefore it is typically desirable to decouple the relative motion of +internal kinetic energy, meaning the shell will not react freely to +its electrostatic environment. +Therefore it is typically desirable to decouple the relative motion of the core/shell pair, which is an imaginary degree of freedom, from the real physical system. To do that, the "compute temp/cs"_compute_temp_cs.html command can be used, in conjunction with @@ -2721,13 +2721,13 @@ fix thermostatequ all nve # integrator as needed f fix_modify thermoberendsen temp CSequ thermo_modify temp CSequ # output of center-of-mass derived temperature :pre -The pressure for the core/shell system is computed via the regular -LAMMPS convention by "treating the cores and shells as individual -particles"_#MitchellFincham2. For the thermo output of the pressure -as well as for the application of a barostat, it is necessary to -use an additional "pressure"_compute_pressure compute based on the -default "temperature"_compute_temp and specifying it as a second -argument in "fix modify"_fix_modify.html and +The pressure for the core/shell system is computed via the regular +LAMMPS convention by "treating the cores and shells as individual +particles"_#MitchellFincham2. 
For the thermo output of the pressure +as well as for the application of a barostat, it is necessary to +use an additional "pressure"_compute_pressure compute based on the +default "temperature"_compute_temp and specifying it as a second +argument in "fix modify"_fix_modify.html and "thermo_modify"_thermo_modify.html resulting in: (...) @@ -2757,18 +2757,18 @@ temp/cs"_compute_temp_cs.html command to the {temp} keyword of the velocity all create 1427 134 bias yes temp CSequ velocity all scale 1427 temp CSequ :pre -To maintain the correct polarizability of the core/shell pairs, the -kinetic energy of the internal motion shall remain nearly constant. -Therefore the choice of spring force and mass ratio need to ensure -much faster relative motion of the 2 atoms within the core/shell pair -than their center-of-mass velocity. This allows the shells to -effectively react instantaneously to the electrostatic environment and +To maintain the correct polarizability of the core/shell pairs, the +kinetic energy of the internal motion shall remain nearly constant. +Therefore the choice of spring force and mass ratio need to ensure +much faster relative motion of the 2 atoms within the core/shell pair +than their center-of-mass velocity. This allows the shells to +effectively react instantaneously to the electrostatic environment and limits energy transfer to or from the core/shell oscillators. This fast movement also dictates the timestep that can be used. The primary literature of the adiabatic core/shell model suggests that the fast relative motion of the core/shell pairs only allows negligible -energy transfer to the environment. +energy transfer to the environment. The mentioned energy transfer will typically lead to a small drift in total energy over time. This internal energy can be monitored using the "compute chunk/atom"_compute_chunk_atom.html and "compute @@ -2790,7 +2790,7 @@ pairs as chunks. 
For example if core/shell pairs are the only molecules: -read_data NaCl_CS_x0.1_prop.data +read_data NaCl_CS_x0.1_prop.data compute prop all property/atom molecule compute cs_chunk all chunk/atom c_prop compute cstherm all temp/chunk cs_chunk temp internal com yes cdof 3.0 # note the chosen degrees of freedom for the core/shell pairs diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt index 14b2c0baa3..24506379c3 100644 --- a/doc/src/Section_packages.txt +++ b/doc/src/Section_packages.txt @@ -585,7 +585,7 @@ do not recommend building with other acceleration packages installed make yes-kokkos make machine :pre - + make no-kokkos make machine :pre @@ -839,13 +839,13 @@ written and read in parallel. Note that MPIIO is part of the standard message-passing interface (MPI) library, so you should not need any additional compiler or link settings, beyond what LAMMPS normally uses for MPI on your system. - + make yes-mpiio make machine :pre - + make no-mpiio make machine :pre - + [Supporting info:] src/MPIIO: filenames -> commands @@ -855,7 +855,7 @@ src/MPIIO: filenames -> commands "read_restart"_read_restart.html :ul :line - + MSCG package :link(mscg),h4 [Contents:] @@ -914,7 +914,7 @@ lib/mscg/README examples/mscg :ul :line - + OPT package :link(OPT),h4 [Contents:] @@ -1387,7 +1387,7 @@ atomic information to continuum fields. [Authors:] Reese Jones, Jeremy Templeton, Jon Zimmerman (Sandia). [Install or un-install:] - + Before building LAMMPS with this package, you must first build the ATC library in lib/atc. You can do this manually if you prefer; follow the instructions in lib/atc/README. You can also do it in one step @@ -1420,10 +1420,10 @@ usual manner: make yes-user-atc make machine :pre - + make no-user-atc make machine :pre - + [Supporting info:] src/USER-ATC: filenames -> commands @@ -1446,7 +1446,7 @@ model. [Author:] Ilya Valuev (JIHT, Russia). 
[Install or un-install:] - + Before building LAMMPS with this package, you must first build the AWPMD library in lib/awpmd. You can do this manually if you prefer; follow the instructions in lib/awpmd/README. You can also do it in @@ -1479,10 +1479,10 @@ usual manner: make yes-user-awpmd make machine :pre - + make no-user-awpmd make machine :pre - + [Supporting info:] src/USER-AWPMD: filenames -> commands @@ -1505,13 +1505,13 @@ stability. [Author:] Oliver Henrich (University of Strathclyde, Glasgow). [Install or un-install:] - + make yes-user-cgdna make machine :pre - + make no-user-cgdna make machine :pre - + [Supporting info:] src/USER-CGDNA: filenames -> commands @@ -1536,13 +1536,13 @@ acids. [Author:] Axel Kohlmeyer (Temple U). [Install or un-install:] - + make yes-user-cgsdk make machine :pre - + make no-user-cgsdk make machine :pre - + [Supporting info:] src/USER-CGSDK: filenames -> commands @@ -1570,7 +1570,7 @@ by Giacomo Fiorin (ICMS, Temple University, Philadelphia, PA, USA) and Jerome Henin (LISM, CNRS, Marseille, France). [Install or un-install:] - + Before building LAMMPS with this package, you must first build the COLVARS library in lib/colvars. You can do this manually if you prefer; follow the instructions in lib/colvars/README. You can also @@ -1594,10 +1594,10 @@ usual manner: make yes-user-colvars make machine :pre - + make no-user-colvars make machine :pre - + [Supporting info:] src/USER-COLVARS: filenames -> commands @@ -1619,13 +1619,13 @@ intensities based on kinematic diffraction theory. [Author:] Shawn Coleman while at the U Arkansas. [Install or un-install:] - + make yes-user-diffraction make machine :pre - + make no-user-diffraction make machine :pre - + [Supporting info:] src/USER-DIFFRACTION: filenames -> commands @@ -1654,13 +1654,13 @@ algorithm. Brennan (ARL). 
[Install or un-install:] - + make yes-user-dpd make machine :pre - + make no-user-dpd make machine :pre - + [Supporting info:] src/USER-DPD: filenames -> commands @@ -1696,13 +1696,13 @@ tools/drude. Devemy (CNRS), and Agilio Padua (U Blaise Pascal). [Install or un-install:] - + make yes-user-drude make machine :pre - + make no-user-drude make machine :pre - + [Supporting info:] src/USER-DRUDE: filenames -> commands @@ -1734,13 +1734,13 @@ tools/eff; see its README file. [Author:] Andres Jaramillo-Botero (CalTech). [Install or un-install:] - + make yes-user-eff make machine :pre - + make no-user-eff make machine :pre - + [Supporting info:] src/USER-EFF: filenames -> commands @@ -1773,13 +1773,13 @@ for using this package in tools/fep; see its README file. [Author:] Agilio Padua (Universite Blaise Pascal Clermont-Ferrand) [Install or un-install:] - + make yes-user-fep make machine :pre - + make no-user-fep make machine :pre - + [Supporting info:] src/USER-FEP: filenames -> commands @@ -1836,13 +1836,13 @@ file. You can then install/un-install the package and build LAMMPS in the usual manner: - + make yes-user-h5md make machine :pre - + make no-user-h5md make machine :pre - + [Supporting info:] src/USER-H5MD: filenames -> commands @@ -1908,7 +1908,7 @@ explained in "Section 5.3.2"_accelerate_intel.html. make yes-user-intel yes-user-omp make machine :pre - + make no-user-intel no-user-omp make machine :pre @@ -1938,13 +1938,13 @@ can be used to model MD particles influenced by hydrodynamic forces. Ontario). [Install or un-install:] - + make yes-user-lb make machine :pre - + make no-user-lb make machine :pre - + [Supporting info:] src/USER-LB: filenames -> commands @@ -1972,13 +1972,13 @@ matrix-MGPT algorithm due to Tomas Oppelstrup at LLNL. [Authors:] Tomas Oppelstrup and John Moriarty (LLNL). 
[Install or un-install:] - + make yes-user-mgpt make machine :pre - + make no-user-mgpt make machine :pre - + [Supporting info:] src/USER-MGPT: filenames -> commands @@ -2000,13 +2000,13 @@ dihedral, improper, or command style. src/USER-MISC/README file. [Install or un-install:] - + make yes-user-misc make machine :pre - + make no-user-misc make machine :pre - + [Supporting info:] src/USER-MISC: filenames -> commands @@ -2031,13 +2031,13 @@ n = grad(g). Netherlands; since 2017: Brandeis University, Waltham, MA, USA) [Install or un-install:] - + make yes-user-manifold make machine :pre - + make no-user-manifold make machine :pre - + [Supporting info:] src/USER-MANIFOLD: filenames -> commands @@ -2080,7 +2080,7 @@ at [Author:] Axel Kohlmeyer (Temple U). [Install or un-install:] - + Note that the lib/molfile/Makefile.lammps file has a setting for a dynamic loading library libdl.a that should is typically present on all systems, which is required for LAMMPS to link with this package. @@ -2090,10 +2090,10 @@ lib/molfile/Makefile.lammps for details. make yes-user-molfile make machine :pre - + make no-user-molfile make machine :pre - + [Supporting info:] src/USER-MOLFILE: filenames -> commands @@ -2128,7 +2128,7 @@ tools: [Author:] Lars Pastewka (Karlsruhe Institute of Technology). [Install or un-install:] - + Note that to follow these steps, you need the standard NetCDF software package installed on your system. The lib/netcdf/Makefile.lammps file has settings for NetCDF include and library files that LAMMPS needs to @@ -2138,7 +2138,7 @@ lib/netcdf/README for details. 
make yes-user-netcdf make machine :pre - + make no-user-netcdf make machine :pre @@ -2178,10 +2178,10 @@ Once you have an appropriate Makefile.machine, you can install/un-install the package and build LAMMPS in the usual manner: [Install or un-install:] - + make yes-user-omp make machine :pre - + make no-user-omp make machine :pre @@ -2213,13 +2213,13 @@ relations, directly from molecular dynamics simulations. [Author:] Ling-Ti Kong (Shanghai Jiao Tong University). [Install or un-install:] - + make yes-user-phonon make machine :pre - + make no-user-phonon make machine :pre - + [Supporting info:] src/USER-PHONON: filenames -> commands @@ -2235,7 +2235,7 @@ USER-QMMM package :link(USER-QMMM),h4 A "fix qmmm"_fix_qmmm.html command which allows LAMMPS to be used in a QM/MM simulation, currently only in combination with the "Quantum -ESPRESSO"_espresso package. +ESPRESSO"_espresso package. :link(espresso,http://www.quantum-espresso.org) @@ -2275,7 +2275,7 @@ usual manner: make yes-user-qmmm make machine :pre - + make no-user-qmmm make machine :pre @@ -2284,7 +2284,7 @@ for a QM/MM simulation. You must also build Quantum ESPRESSO and create a new executable which links LAMMPS and Quanutm ESPRESSO together. These are steps 3 and 4 described in the lib/qmmm/README file. - + [Supporting info:] src/USER-QMMM: filenames -> commands @@ -2312,13 +2312,13 @@ simulation. [Author:] Yuan Shen (Stanford U). [Install or un-install:] - + make yes-user-qtb make machine :pre - + make no-user-qtb make machine :pre - + [Supporting info:] src/USER-QTB: filenames -> commands @@ -2362,10 +2362,10 @@ usual manner: make yes-user-quip make machine :pre - + make no-user-quip make machine :pre - + [Supporting info:] src/USER-QUIP: filenames -> commands @@ -2388,13 +2388,13 @@ for monitoring molecules as bonds are created and destroyed. [Author:] Hasan Metin Aktulga (MSU) while at Purdue University. 
[Install or un-install:] - + make yes-user-reaxc make machine :pre - + make no-user-reaxc make machine :pre - + [Supporting info:] src/USER-REAXC: filenames -> commands @@ -2451,10 +2451,10 @@ usual manner: make yes-user-smd make machine :pre - + make no-user-smd make machine :pre - + [Supporting info:] src/USER-SMD: filenames -> commands @@ -2477,13 +2477,13 @@ ionocovalent bonds in oxides. Tetot (LAAS-CNRS, France). [Install or un-install:] - + make yes-user-smtbq make machine :pre - + make no-user-smtbq make machine :pre - + [Supporting info:] src/USER-SMTBQ: filenames -> commands @@ -2516,13 +2516,13 @@ property/atom"_compute_property_atom.html command. Dynamics, Ernst Mach Institute, Germany). [Install or un-install:] - + make yes-user-sph make machine :pre - + make no-user-sph make machine :pre - + [Supporting info:] src/USER-SPH: filenames -> commands @@ -2544,13 +2544,13 @@ stress, etc) about individual interactions. [Author:] Axel Kohlmeyer (Temple U). [Install or un-install:] - + make yes-user-tally make machine :pre - + make no-user-tally make machine :pre - + [Supporting info:] src/USER-TALLY: filenames -> commands @@ -2577,7 +2577,7 @@ system. [Authors:] Richard Berger (JKU) and Daniel Queteschiner (DCS Computing). [Install or un-install:] - + The lib/vtk/Makefile.lammps file has settings for accessing VTK files and its library, which are required for LAMMPS to build and link with this package. If the settings are not valid for your system, check if @@ -2590,10 +2590,10 @@ usual manner: make yes-user-vtk make machine :pre - + make no-user-vtk make machine :pre - + [Supporting info:] src/USER-VTK: filenames -> commands diff --git a/doc/src/Section_python.txt b/doc/src/Section_python.txt index 718e9e229c..1e67fca321 100644 --- a/doc/src/Section_python.txt +++ b/doc/src/Section_python.txt @@ -714,7 +714,7 @@ stored in the "image" property. 
All three image flags are stored in a packed format in a single integer, so count would be 1 to retrieve that integer, however also a count value of 3 can be used and then the image flags will be unpacked into 3 individual integers, ordered -in a similar fashion as coordinates. +in a similar fashion as coordinates. Note that the data structure gather_atoms("x") returns is different from the data structure returned by extract_atom("x") in four ways. diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index d629828f12..f5bd66aeba 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -30,8 +30,8 @@ Dihedral Styles: charmm, harmonic, opls :l Fixes: nve, npt, nvt, nvt/sllod :l Improper Styles: cvff, harmonic :l Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne, -charmm/coul/long, lj/cut, lj/cut/coul/long, sw, tersoff :l -K-Space Styles: pppm :l +charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l +K-Space Styles: pppm, pppm/disp :l :ule [Speed-ups to expect:] @@ -42,62 +42,88 @@ precision mode. Performance improvements are shown compared to LAMMPS {without using other acceleration packages} as these are under active development (and subject to performance changes). The measurements were performed using the input files available in -the src/USER-INTEL/TEST directory. These are scalable in size; the -results given are with 512K particles (524K for Liquid Crystal). -Most of the simulations are standard LAMMPS benchmarks (indicated -by the filename extension in parenthesis) with modifications to the -run length and to add a warmup run (for use with offload -benchmarks). +the src/USER-INTEL/TEST directory with the provided run script. +These are scalable in size; the results given are with 512K +particles (524K for Liquid Crystal). 
Most of the simulations are +standard LAMMPS benchmarks (indicated by the filename extension in +parenthesis) with modifications to the run length and to add a +warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors (code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "18 Jun 2016" LAMMPS built with -Intel Parallel Studio 2016 update 3. Results are with 1 MPI task +(code-named Knights Landing) with "June 2017" LAMMPS built with +Intel Parallel Studio 2017 update 2. Results are with 1 MPI task per physical core. See {src/USER-INTEL/TEST/README} for the raw simulation rates and instructions to reproduce. :line +[Accuracy and order of operations:] + +In most molecular dynamics software, parallelization parameters +(# of MPI, OpenMP, and vectorization) can change the results due +to changing the order of operations with finite-precision +calculations. The USER-INTEL package is deterministic. This means +that the results should be reproducible from run to run with the +{same} parallel configurations and when using determinstic +libraries or library settings (MPI, OpenMP, FFT). However, there +are differences in the USER-INTEL package that can change the +order of operations compared to LAMMPS without acceleration: + +Neighbor lists can be created in a different order :ulb,l +Bins used for sorting atoms can be oriented differently :l +The default stencil order for PPPM is 7. By default, LAMMPS will +calculate other PPPM parameters to fit the desired acuracy with +this order :l +The {newton} setting applies to all atoms, not just atoms shared +between MPI tasks :l +Vectorization can change the order for adding pairwise forces :l +:ule + +The precision mode (described below) used with the USER-INTEL +package can change the {accuracy} of the calculations. 
For the +default {mixed} precision option, calculations between pairs or +triplets of atoms are performed in single precision, intended to +be within the inherent error of MD simulations. All accumulation +is performed in double precision to prevent the error from growing +with the number of atoms in the simulation. {Single} precision +mode should not be used without appropriate validation. + +:line + [Quick Start for Experienced Users:] LAMMPS should be built with the USER-INTEL package installed. Simulations should be run with 1 MPI task per physical {core}, not {hardware thread}. -For Intel Xeon CPUs: - Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l -If using {kspace_style pppm} in the input script, add "neigh_modify binsize cutoff" and "kspace_modify diff ad" to the input script for better -performance. Cutoff should be roughly the neighbor list cutoff. By -default the binsize is half the neighbor list cutoff. :l -"-pk intel 0 omp 2 -sf intel" added to LAMMPS command-line :l +Set the environment variable KMP_BLOCKTIME=0 :l +"-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l +$t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l +For some of the simple 2-body potentials without long-range +electrostatics, performance and scalability can be better with +the "newton off" setting added to the input script :l +If using {kspace_style pppm} in the input script, add +"kspace_modify diff ad" for better performance :l :ule -For Intel Xeon Phi CPUs for simulations without {kspace_style -pppm} in the input script : +For Intel Xeon Phi CPUs: -Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l -Runs should be performed using MCDRAM. :l -"-pk intel 0 omp 2 -sf intel" {or} "-pk intel 0 omp 4 -sf intel" -should be added to the LAMMPS command-line. Choice for best -performance will depend on the simulation. :l +Runs should be performed using MCDRAM. 
:ulb,l :ule -For Intel Xeon Phi CPUs for simulations with {kspace_style -pppm} in the input script: +For simulations using {kspace_style pppm} on Intel CPUs +supporting AVX-512: -Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l -Runs should be performed using MCDRAM. :l -Add "neigh_modify binsize 3" to the input script for better -performance. :l -Add "kspace_modify diff ad" to the input script for better -performance. :l -export KMP_AFFINITY=none :l -"-pk intel 0 omp 3 lrt yes -sf intel" or "-pk intel 0 omp 1 lrt yes --sf intel" added to LAMMPS command-line. Choice for best performance -will depend on the simulation. :l +Add "kspace_modify diff ad" to the input script :ulb,l +The command-line option should be changed to +"-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of +threads minus 1. :l +Do not use thread affinity (set KMP_AFFINITY=none) :l +The "newton off" setting may provide better scalability :l :ule For Intel Xeon Phi coprocessors (Offload): @@ -169,6 +195,10 @@ cat /proc/cpuinfo :pre [Building LAMMPS with the USER-INTEL package:] +NOTE: See the src/USER-INTEL/README file for additional flags that +might be needed for best performance on Intel server processors +code-named "Skylake". + The USER-INTEL package must be installed into the source directory: make yes-user-intel :pre @@ -322,8 +352,8 @@ follow in the input script. NOTE: The USER-INTEL package will perform better with modifications to the input script when "PPPM"_kspace_style.html is used: -"kspace_modify diff ad"_kspace_modify.html and "neigh_modify binsize -3"_neigh_modify.html should be added to the input script. +"kspace_modify diff ad"_kspace_modify.html should be added to the +input script. Long-Range Thread (LRT) mode is an option to the "package intel"_package.html command that can improve performance when using @@ -342,6 +372,10 @@ would normally perform best with "-pk intel 0 omp 4", instead use environment variable "KMP_AFFINITY=none". 
LRT mode is not supported when using offload. +NOTE: Changing the "newton"_newton.html setting to off can improve +performance and/or scalability for simple 2-body potentials such as +lj/cut or when using LRT mode on processors supporting AVX-512. + Not all styles are supported in the USER-INTEL package. You can mix the USER-INTEL package with styles from the "OPT"_accelerate_opt.html package or the "USER-OMP package"_accelerate_omp.html. Of course, @@ -467,7 +501,7 @@ supported. Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakker, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l -Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency. 2016 International Conference for High Performance Computing. In press. :l +Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. 
:l :ule diff --git a/doc/src/bond_oxdna.txt b/doc/src/bond_oxdna.txt index f9b35a167c..2add6f4c2f 100644 --- a/doc/src/bond_oxdna.txt +++ b/doc/src/bond_oxdna.txt @@ -30,7 +30,7 @@ The {oxdna/fene} and {oxdna2/fene} bond styles use the potential to define a modified finite extensible nonlinear elastic (FENE) potential "(Ouldridge)"_#oxdna_fene to model the connectivity of the phosphate backbone -in the oxDNA force field for coarse-grained modelling of DNA. +in the oxDNA force field for coarse-grained modelling of DNA. The following coefficients must be defined for the bond type via the "bond_coeff"_bond_coeff.html command as given in the above example, or in @@ -43,8 +43,8 @@ r0 (distance) :ul NOTE: The oxDNA bond style has to be used together with the corresponding oxDNA pair styles for excluded volume interaction {oxdna/excv}, stacking {oxdna/stk}, cross-stacking {oxdna/xstk} -and coaxial stacking interaction {oxdna/coaxstk} as well as hydrogen-bonding interaction {oxdna/hbond} (see also documentation of -"pair_style oxdna/excv"_pair_oxdna.html). For the oxDNA2 "(Snodin)"_#oxdna2 bond style the analogous pair styles and an additional Debye-Hueckel pair +and coaxial stacking interaction {oxdna/coaxstk} as well as hydrogen-bonding interaction {oxdna/hbond} (see also documentation of +"pair_style oxdna/excv"_pair_oxdna.html). For the oxDNA2 "(Snodin)"_#oxdna2 bond style the analogous pair styles and an additional Debye-Hueckel pair style {oxdna2/dh} have to be defined. The coefficients in the above example have to be kept fixed and cannot be changed without reparametrizing the entire model. @@ -66,7 +66,7 @@ LAMMPS"_Section_start.html#start_3 section for more info on packages. 
[Related commands:] -"pair_style oxdna/excv"_pair_oxdna.html, "pair_style oxdna2/excv"_pair_oxdna2.html, "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html, "bond_coeff"_bond_coeff.html +"pair_style oxdna/excv"_pair_oxdna.html, "pair_style oxdna2/excv"_pair_oxdna2.html, "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html, "bond_coeff"_bond_coeff.html [Default:] none diff --git a/doc/src/compute_cnp_atom.txt b/doc/src/compute_cnp_atom.txt index 9aa63c84de..16a51f5241 100644 --- a/doc/src/compute_cnp_atom.txt +++ b/doc/src/compute_cnp_atom.txt @@ -42,7 +42,7 @@ where the index {j} goes over the {n}i nearest neighbors of atom {i}, and the index {k} goes over the {n}ij common nearest neighbors between atom {i} and atom {j}. Rik and Rjk are the vectors connecting atom {k} to atoms {i} and {j}. The quantity in the double sum is computed -for each atom. +for each atom. The CNP calculation is sensitive to the specified cutoff value. You should ensure that the appropriate nearest neighbors of an atom are diff --git a/doc/src/compute_pair_local.txt b/doc/src/compute_pair_local.txt index 0121210994..16aaba4667 100644 --- a/doc/src/compute_pair_local.txt +++ b/doc/src/compute_pair_local.txt @@ -76,7 +76,9 @@ command for the types of the two atoms is used. For the {radius} setting, the sum of the radii of the two particles is used as a cutoff. For example, this is appropriate for granular particles which only interact when they are overlapping, as computed by "granular pair -styles"_pair_gran.txt. +styles"_pair_gran.txt. Note that if a granular model defines atom +types such that all particles of a specific type are monodisperse +(same diameter), then the two settings are effectively identical. 
Note that as atoms migrate from processor to processor, there will be no consistent ordering of the entries within the local vector or array diff --git a/doc/src/compute_property_local.txt b/doc/src/compute_property_local.txt index f7851e864b..39106a39c8 100644 --- a/doc/src/compute_property_local.txt +++ b/doc/src/compute_property_local.txt @@ -79,6 +79,9 @@ the two atoms is used. For the {radius} setting, the sum of the radii of the two particles is used as a cutoff. For example, this is appropriate for granular particles which only interact when they are overlapping, as computed by "granular pair styles"_pair_gran.html. +Note that if a granular model defines atom types such that all +particles of a specific type are monodisperse (same diameter), then +the two settings are effectively identical. If the inputs are bond, angle, etc attributes, the local data is generated by looping over all the atoms owned on a processor and diff --git a/doc/src/dihedral_charmm.txt b/doc/src/dihedral_charmm.txt index 918755ec38..73dc67cdef 100644 --- a/doc/src/dihedral_charmm.txt +++ b/doc/src/dihedral_charmm.txt @@ -138,7 +138,15 @@ more instructions on how to use the accelerated styles effectively. [Restrictions:] -This dihedral style can only be used if LAMMPS was built with the +When using run_style "respa"_run_style.html, these dihedral styles +must be assigned to the same r-RESPA level as {pair} or {outer}. + +When used in combination with CHARMM pair styles, the 1-4 +"special_bonds"_special_bonds.html scaling factors must be set to 0.0. +Otherwise non-bonded contributions for these 1-4 pairs will be +computed multiple times. + +These dihedral styles can only be used if LAMMPS was built with the MOLECULE package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info on packages. 
diff --git a/doc/src/dump_vtk.txt b/doc/src/dump_vtk.txt index 21502e7f49..d4d28c81fc 100644 --- a/doc/src/dump_vtk.txt +++ b/doc/src/dump_vtk.txt @@ -16,7 +16,7 @@ ID = user-assigned name for the dump group-ID = ID of the group of atoms to be dumped vtk = style of dump command (other styles {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {local} or {custom} are discussed on the "dump"_dump.html doc page) N = dump every this many timesteps -file = name of file to write dump info to +file = name of file to write dump info to args = same as arguments for "dump_style custom"_dump.html :ul [Examples:] @@ -83,7 +83,7 @@ Triclinic simulation boxes (non-orthogonal) are saved as hexahedrons in either legacy .vtk or .vtu XML format. Style {vtk} allows you to specify a list of atom attributes to be -written to the dump file for each atom. The list of possible attributes +written to the dump file for each atom. The list of possible attributes is the same as for the "dump_style custom"_dump.html command; see its doc page for a listing and an explanation of each attribute. diff --git a/doc/src/fix_box_relax.txt b/doc/src/fix_box_relax.txt index 54decd6282..e3d75ee858 100644 --- a/doc/src/fix_box_relax.txt +++ b/doc/src/fix_box_relax.txt @@ -245,7 +245,7 @@ appear the system is converging to your specified pressure. The solution for this is to either (a) zero the velocities of all atoms before performing the minimization, or (b) make sure you are monitoring the pressure without its kinetic component. The latter can -be done by outputting the pressure from the pressure compute this +be done by outputting the pressure from the pressure compute this command creates (see below) or a pressure compute you define yourself. 
NOTE: Because pressure is often a very sensitive function of volume, diff --git a/doc/src/fix_eos_table_rx.txt b/doc/src/fix_eos_table_rx.txt index e8d515e1f3..e5e4f772f6 100644 --- a/doc/src/fix_eos_table_rx.txt +++ b/doc/src/fix_eos_table_rx.txt @@ -45,14 +45,14 @@ species {j} in particle {i}, {u_j} is the internal energy of species j, {DeltaH_f,j} is the heat of formation of species {j}, N is the number of molecules represented by the coarse-grained particle, kb is the Boltzmann constant, and T is the temperature of the system. Additionally, -it is possible to modify the concentration-dependent particle internal -energy relation by adding an energy correction, temperature-dependent +it is possible to modify the concentration-dependent particle internal +energy relation by adding an energy correction, temperature-dependent correction, and/or a molecule-dependent correction. An energy correction can -be specified as a constant (in energy units). A temperature correction can be -specified by multiplying a temperature correction coefficient by the -internal temperature. A molecular correction can be specified by -by multiplying a molecule correction coefficient by the average number of -product gas particles in the coarse-grain particle. +be specified as a constant (in energy units). A temperature correction can be +specified by multiplying a temperature correction coefficient by the +internal temperature. A molecular correction can be specified by +multiplying a molecule correction coefficient by the average number of +product gas particles in the coarse-grain particle. Fix {eos/table/rx} creates interpolation tables of length {N} from {m} internal energy values of each species {u_j} listed in a file as a @@ -72,12 +72,12 @@ The second filename specifies a file containing heat of formation {DeltaH_f,j} for each species. 
In cases where the coarse-grain particle represents a single molecular -species (i.e., no reactions occur and fix {rx} is not present in the input file), -fix {eos/table/rx} can be applied in a similar manner to fix {eos/table} -within a non-reactive DPD simulation. In this case, the heat of formation +species (i.e., no reactions occur and fix {rx} is not present in the input file), +fix {eos/table/rx} can be applied in a similar manner to fix {eos/table} +within a non-reactive DPD simulation. In this case, the heat of formation filename is replaced with the heat of formation value for the single species. -Additionally, the energy correction and temperature correction coefficients may -also be specified as fix arguments. +Additionally, the energy correction and temperature correction coefficients may +also be specified as fix arguments. :line @@ -138,8 +138,8 @@ used as the species name must correspond with the tags used to define the reactions with the "fix rx"_fix_rx.html command. Alternatively, corrections to the EOS can be included by specifying -three additional columns that correspond to the energy correction, -the temperature correction coefficient and molecule correction +three additional columns that correspond to the energy correction, +the temperature correction coefficient and molecule correction coefficient. In this case, the format of the file is as follows: # HEAT OF FORMATION TABLE (one or more comment or blank lines) :pre diff --git a/doc/src/fix_filter_corotate.txt b/doc/src/fix_filter_corotate.txt index a3339648fa..b782d285c7 100644 --- a/doc/src/fix_filter_corotate.txt +++ b/doc/src/fix_filter_corotate.txt @@ -70,8 +70,8 @@ minimization"_minimize.html. [Restrictions:] -This fix is part of the USER-MISC package. It is only enabled if -LAMMPS was built with that package. See the "Making +This fix is part of the USER-MISC package. It is only enabled if +LAMMPS was built with that package. 
See the "Making LAMMPS"_Section_start.html#start_3 section for more info. Currently, it does not support "molecule templates"_molecule.html. diff --git a/doc/src/fix_gcmc.txt b/doc/src/fix_gcmc.txt index 7ac607a2f1..41ec38cffb 100644 --- a/doc/src/fix_gcmc.txt +++ b/doc/src/fix_gcmc.txt @@ -406,7 +406,7 @@ the user for each subsequent fix gcmc command. [Default:] The option defaults are mol = no, maxangle = 10, overlap_cutoff = 0.0, -fugacity_coeff = 1, and full_energy = no, +fugacity_coeff = 1, and full_energy = no, except for the situations where full_energy is required, as listed above. diff --git a/doc/src/fix_grem.txt b/doc/src/fix_grem.txt index 3fc5c1a10e..661f68ed99 100644 --- a/doc/src/fix_grem.txt +++ b/doc/src/fix_grem.txt @@ -85,13 +85,13 @@ No information about this fix is written to "binary restart files"_restart.html. The "thermo_modify"_thermo_modify.html {press} option is supported -by this fix to add the rescaled kinetic pressure as part of +by this fix to add the rescaled kinetic pressure as part of "thermodynamic output"_thermo_style.html. [Restrictions:] -This fix is part of the USER-MISC package. It is only enabled if -LAMMPS was built with that package. See the "Making +This fix is part of the USER-MISC package. It is only enabled if +LAMMPS was built with that package. See the "Making LAMMPS"_Section_start.html#start_3 section for more info. [Related commands:] diff --git a/doc/src/fix_ipi.txt b/doc/src/fix_ipi.txt index b1533830bc..07e8025d77 100644 --- a/doc/src/fix_ipi.txt +++ b/doc/src/fix_ipi.txt @@ -58,14 +58,14 @@ input are listed in the same order as in the data file of LAMMPS. The initial configuration is ignored, as it will be substituted with the coordinates received from i-PI before forces are ever evaluated. 
-A note of caution when using potentials that contain long-range +A note of caution when using potentials that contain long-range electrostatics, or that contain parameters that depend on box size: all of these options will be initialized based on the cell size in the -LAMMPS-side initial configuration and kept constant during the run. -This is required to e.g. obtain reproducible and conserved forces. -If the cell varies too wildly, it may be advisable to reinitialize -these interactions at each call. This behavior can be requested by -setting the {reset} switch. +LAMMPS-side initial configuration and kept constant during the run. +This is required to e.g. obtain reproducible and conserved forces. +If the cell varies too wildly, it may be advisable to reinitialize +these interactions at each call. This behavior can be requested by +setting the {reset} switch. [Restart, fix_modify, output, run start/stop, minimize info:] diff --git a/doc/src/fix_mscg.txt b/doc/src/fix_mscg.txt index 0e09f8a9c5..7d16967955 100644 --- a/doc/src/fix_mscg.txt +++ b/doc/src/fix_mscg.txt @@ -57,7 +57,7 @@ simulations is as follows: Perform all-atom simulations on the system to be coarse grained. Generate a trajectory mapped to the coarse-grained model. Create input files for the MS-CG library. -Run the range finder functionality of the MS-CG library. +Run the range finder functionality of the MS-CG library. Run the force matching functionality of the MS-CG library. Check the results of the force matching. Run coarse-grained simulations using the new coarse-grained potentials. :ol @@ -70,7 +70,7 @@ Step 2 can be performed using a Python script (what is the name?) provided with the MS-CG library which defines the coarse-grained model and converts a standard LAMMPS dump file for an all-atom simulation (step 1) into a LAMMPS dump file which has the positions of and forces -on the coarse-grained beads. +on the coarse-grained beads. 
In step 3, an input file named "control.in" is needed by the MS-CG library which sets parameters for the range finding and force matching diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt index 94c6ee84fd..52d8a7df84 100644 --- a/doc/src/fix_neb.txt +++ b/doc/src/fix_neb.txt @@ -14,152 +14,179 @@ fix ID group-ID neb Kspring keyword value :pre ID, group-ID are documented in "fix"_fix.html command :ulb,l neb = style name of this fix command :l -Kspring = parallel spring constant (force/distance units or force units) :l +Kspring = spring constant for parallel nudging force (force/distance units or force units, see parallel keyword) :l zero or more keyword/value pairs may be appended :l -keyword = {nudg_style} or {perp} or {freend} or {freend_k_spring} :l - {nudg_style} value = {neigh} or {idealpos} - {neigh} = the parallel nudging force is calculated from the distances to neighbouring replicas (in this case, Kspring is in force/distance units) - {idealpos} = the parallel nudging force is proportional to the distance between the replica and its interpolated ideal position (in this case Kspring is in force units) - {perp} value {none} or kspring2 - {none} = no perpendicular spring force is applied - {kspring2} = spring constant for the perpendicular nudging force (in force/distance units) - {freeend} value = {none} or {ini} or {final} or {finaleini} or {final2eini} - {none} = no nudging force is applied to the first and last replicas - {ini} = set the first replica to be a free end - {final} = set the last replica to be a free end - {finaleini} = set the last replica to be a free end and set its target energy as that of the first replica - {final2eini} = same as {finaleini} plus prevent intermediate replicas to have a lower energy than the first replica - {freeend_kspring} value = kspring3 - kspring3 = spring constant of the perpendicular spring force (per distance units) - :pre +keyword = {parallel} or {perp} or {end} :l + {parallel} value = {neigh} or {ideal} + 
{neigh} = parallel nudging force based on distance to neighbor replicas (Kspring = force/distance units) + {ideal} = parallel nudging force based on interpolated ideal position (Kspring = force units) + {perp} value = {Kspring2} + {Kspring2} = spring constant for perpendicular nudging force (force/distance units) + {end} values = estyle Kspring3 + {estyle} = {first} or {last} or {last/efirst} or {last/efirst/middle} + {first} = apply force to first replica + {last} = apply force to last replica + {last/efirst} = apply force to last replica and set its target energy to that of first replica + {last/efirst/middle} = same as {last/efirst} plus prevent middle replicas having lower energy than first replica + {Kspring3} = spring constant for target energy term (1/distance units) :pre,ule [Examples:] fix 1 active neb 10.0 -fix 2 all neb 1.0 perp 1.0 freeend final -fix 1 all neb 1.0 nudg_style idealpos freeend final2eini freend_kspring 1:pre +fix 2 all neb 1.0 perp 1.0 end last +fix 2 all neb 1.0 perp 1.0 end first 1.0 end last 1.0 +fix 1 all neb 1.0 nudge ideal end last/efirst 1 :pre [Description:] -Add a nudging force to atoms in the group for a multi-replica +Add nudging forces to atoms in the group for a multi-replica simulation run via the "neb"_neb.html command to perform a nudged elastic band (NEB) calculation for finding the transition state. Hi-level explanations of NEB are given with the "neb"_neb.html command and in "Section_howto 5"_Section_howto.html#howto_5 of the manual. The fix neb command must be used with the "neb" command and defines -how nudging inter-replica forces are computed. A NEB calculation is +how inter-replica nudging forces are computed. A NEB calculation is divided in two stages. In the first stage n replicas are relaxed -toward a MEP and in a second stage, the climbing image scheme (see -"(Henkelman2)"_#Henkelman2) is turned on so that the replica having -the highest energy relaxes toward the saddle point (i.e. 
the point of -highest energy along the MEP). +toward a MEP until convergence. In the second stage, the climbing +image scheme (see "(Henkelman2)"_#Henkelman2) is enabled, so that the +replica having the highest energy relaxes toward the saddle point +(i.e. the point of highest energy along the MEP), and a second +relaxation is performed. -One purpose of the nudging forces is to keep the replicas equally -spaced. During the NEB, the 3N-length vector of interatomic force Fi -= -Grad(V) of replicas i is altered. For all intermediate replicas -(i.e. for 1 0 +The interatomic force Fi for the specified replica becomes: + +Fi = -Grad(V) + (Grad(V) dot T' + (E-ETarget)*Kspring3) T', {when} Grad(V) dot T' < 0 +Fi = -Grad(V) + (Grad(V) dot T' + (ETarget- E)*Kspring3) T', {when} Grad(V) dot T' > 0 :pre -where E is the energy of the free end replica and ETarget is the -target energy. +where E is the current energy of the replica and ETarget is the target +energy. The "spring" constant on the difference in energies is the +specified {Kspring3} value. -When the value {ini} ({final}) is used after the keyword {freeend}, -the first (last) replica is considered as a free end. The target -energy is set to the energy of the replica at starting of the NEB -calculation. When the value {finaleini} or {final2eini} is used the -last image is considered as a free end and the target energy is equal -to the energy of the first replica (which can evolve during the NEB -relaxation). With the value {finaleini}, when the initial path is too -far from the MEP, an intermediate repilica might relax "faster" and -get a lower energy than the last replica. The benefit of the free end -is then lost since this intermediate replica will relax toward a local -minima. This behavior can be prevented by using the value {final2eini} -which remove entirely the contribution of the gradient for all -intermediate replica which have a lower energy than the initial one -thus preventing these replicae to over-relax. 
After converging a NEB -with the {final2eini} value it is recommended to check that all -intermediate replica have a larger energy than the initial -replica. Finally note that if the last replica converges toward a -local minimum with a larger energy than the energy of the first -replica, a free end neb calculation with the value {finaleini} or -{final2eini} cannot reach the convergence criteria. +When {estyle} is specified as {first}, the force is applied to the +first replica. When {estyle} is specified as {last}, the force is +applied to the last replica. Note that the {end} keyword can be used +twice to add forces to both the first and last replicas. -:line +For both these {estyle} settings, the target energy {ETarget} is set +to the initial energy of the replica (at the start of the NEB +calculation). +If the {estyle} is specified as {last/efirst} or {last/efirst/middle}, +force is applied to the last replica, but the target energy {ETarget} +is continuously set to the energy of the first replica, as it evolves +during the NEB relaxation. +The difference between these two {estyle} options is as follows. When +{estyle} is specified as {last/efirst}, no change is made to the +inter-replica force applied to the intermediate replicas (neither +first nor last). If the initial path is too far from the MEP, an +intermediate replica may relax "faster" and reach a lower energy than +the last replica. In this case the intermediate replica will be +relaxing toward its own local minimum. This behavior can be prevented +by specifying {estyle} as {last/efirst/middle} which will alter the +inter-replica force applied to intermediate replicas by removing the +contribution of the gradient to the inter-replica force. This will +only be done if a particular intermediate replica has a lower energy +than the first replica. This should effectively prevent the +intermediate replicas from over-relaxing. 
-In the second stage of the NEB, the interatomic force Fi for the -climbing replica (which is the replica of highest energy) becomes: - -Fi = -Grad(V) + 2 (Grad(V) dot That) That :pre - +After converging a NEB calculation using an {estyle} of +{last/efirst/middle}, you should check that all intermediate replicas +have a larger energy than the first replica. If this is not the case, +the path is probably not a MEP. +Finally, note that if the last replica converges toward a local +minimum which has a larger energy than the energy of the first +replica, a NEB calculation using an {estyle} of {last/efirst} or +{last/efirst/middle} cannot reach final convergence. [Restart, fix_modify, output, run start/stop, minimize info:] @@ -186,7 +213,8 @@ for more info on packages. [Default:] -The option defaults are nudg_style = neigh, perp = none, freeend = none and freend_kspring = 1. +The option defaults are nudge = neigh, perp = 0.0, ends is not +specified (no inter-replica force on the end replicas). :line @@ -197,14 +225,14 @@ The option defaults are nudg_style = neigh, perp = none, freeend = none and free [(Henkelman2)] Henkelman, Uberuaga, Jonsson, J Chem Phys, 113, 9901-9904 (2000). -:link(E) -[(E)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002) +:link(WeinenE) +[(WeinenE)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002). :link(Jonsson) [(Jonsson)] Jonsson, Mills and Jacobsen, in Classical and Quantum -Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, and Coker -World Scientific, Singapore, 1998, p. 385 +Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, +and Coker World Scientific, Singapore, 1998, p 385. :link(Maras1) [(Maras)] Maras, Trushin, Stukowski, Ala-Nissila, Jonsson, -Comp Phys Comm, 205, 13-21 (2016) +Comp Phys Comm, 205, 13-21 (2016). 
diff --git a/doc/src/fix_nve_dot.txt b/doc/src/fix_nve_dot.txt index b1c00cd25c..7ad51f3768 100644 --- a/doc/src/fix_nve_dot.txt +++ b/doc/src/fix_nve_dot.txt @@ -23,13 +23,13 @@ fix 1 all nve/dot :pre [Description:] Apply a rigid-body integrator as described in "(Davidchack)"_#Davidchack1 -to a group of atoms, but without Langevin dynamics. +to a group of atoms, but without Langevin dynamics. This command performs Molecular dynamics (MD) -via a velocity-Verlet algorithm and an evolution operator that rotates -the quaternion degrees of freedom, similar to the scheme outlined in "(Miller)"_#Miller1. +via a velocity-Verlet algorithm and an evolution operator that rotates +the quaternion degrees of freedom, similar to the scheme outlined in "(Miller)"_#Miller1. This command is the equivalent of the "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html -without damping and noise and can be used to determine the stability range +without damping and noise and can be used to determine the stability range in a NVE ensemble prior to using the Langevin-type DOTC-integrator (see also "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html). The command is equivalent to the "fix nve"_fix_nve.html. diff --git a/doc/src/fix_nve_dotc_langevin.txt b/doc/src/fix_nve_dotc_langevin.txt index 19d5b233ce..5de8e663c4 100644 --- a/doc/src/fix_nve_dotc_langevin.txt +++ b/doc/src/fix_nve_dotc_langevin.txt @@ -28,20 +28,20 @@ fix 1 all nve/dotc/langevin 1.0 1.0 0.03 457145 angmom 10 :pre [Description:] -Apply a rigid-body Langevin-type integrator of the kind "Langevin C" +Apply a rigid-body Langevin-type integrator of the kind "Langevin C" as described in "(Davidchack)"_#Davidchack2 to a group of atoms, which models an interaction with an implicit background solvent. This command performs Brownian dynamics (BD) -via a technique that splits the integration into a deterministic Hamiltonian -part and the Ornstein-Uhlenbeck process for noise and damping. 
+via a technique that splits the integration into a deterministic Hamiltonian +part and the Ornstein-Uhlenbeck process for noise and damping. The quaternion degrees of freedom are updated though an evolution operator which performs a rotation in quaternion space, preserves the quaternion norm and is akin to "(Miller)"_#Miller2. -In terms of syntax this command has been closely modelled on the -"fix langevin"_fix_langevin.html and its {angmom} option. But it combines -the "fix nve"_fix_nve.html and the "fix langevin"_fix_langevin.html in -one single command. The main feature is improved stability +In terms of syntax this command has been closely modelled on the +"fix langevin"_fix_langevin.html and its {angmom} option. But it combines +the "fix nve"_fix_nve.html and the "fix langevin"_fix_langevin.html in +one single command. The main feature is improved stability over the standard integrator, permitting slightly larger timestep sizes. NOTE: Unlike the "fix langevin"_fix_langevin.html this command performs @@ -57,7 +57,7 @@ Fc is the conservative force computed via the usual inter-particle interactions ("pair_style"_pair_style.html, "bond_style"_bond_style.html, etc). -The Ff and Fr terms are implicitly taken into account by this fix +The Ff and Fr terms are implicitly taken into account by this fix on a per-particle basis. Ff is a frictional drag or viscous damping term proportional to the @@ -77,7 +77,7 @@ a Gaussian random number) for speed. :line -{Tstart} and {Tstop} have to be constant values, i.e. they cannot +{Tstart} and {Tstop} have to be constant values, i.e. they cannot be variables. The {damp} parameter is specified in time units and determines how @@ -98,16 +98,16 @@ different numbers of processors. The keyword/value option has to be used in the following way: -This fix has to be used together with the {angmom} keyword. The -particles are always considered to have a finite size. 
-The keyword {angmom} enables thermostatting of the rotational degrees of -freedom in addition to the usual translational degrees of freedom. +This fix has to be used together with the {angmom} keyword. The +particles are always considered to have a finite size. +The keyword {angmom} enables thermostatting of the rotational degrees of +freedom in addition to the usual translational degrees of freedom. -The scale factor after the {angmom} keyword gives the ratio of the rotational to +The scale factor after the {angmom} keyword gives the ratio of the rotational to the translational friction coefficient. An example input file can be found in /examples/USER/cgdna/examples/duplex2/. -A technical report with more information on this integrator can be found +A technical report with more information on this integrator can be found "here"_PDF/USER-CGDNA-overview.pdf. :line @@ -120,7 +120,7 @@ LAMMPS"_Section_start.html#start_3 section for more info on packages. [Related commands:] -"fix nve"_fix_nve.html, "fix langevin"_fix_langevin.html, "fix nve/dot"_fix_nve_dot.html, +"fix nve"_fix_nve.html, "fix langevin"_fix_langevin.html, "fix nve/dot"_fix_nve_dot.html, [Default:] none diff --git a/doc/src/fix_nvk.txt b/doc/src/fix_nvk.txt index 271483b441..49fd8217ab 100644 --- a/doc/src/fix_nvk.txt +++ b/doc/src/fix_nvk.txt @@ -27,7 +27,7 @@ timestep. V is volume; K is kinetic energy. This creates a system trajectory consistent with the isokinetic ensemble. The equations of motion used are those of Minary et al in -"(Minary)"_#nvk-Minary, a variant of those initially given by Zhang in +"(Minary)"_#nvk-Minary, a variant of those initially given by Zhang in "(Zhang)"_#nvk-Zhang. 
The kinetic energy will be held constant at its value given when fix diff --git a/doc/src/fix_spring.txt b/doc/src/fix_spring.txt index 5f94f4cdae..014a43aacc 100644 --- a/doc/src/fix_spring.txt +++ b/doc/src/fix_spring.txt @@ -89,7 +89,7 @@ NOTE: The center of mass of a group of atoms is calculated in group can straddle a periodic boundary. See the "dump"_dump.html doc page for a discussion of unwrapped coordinates. It also means that a spring connecting two groups or a group and the tether point can cross -a periodic boundary and its length be calculated correctly. +a periodic boundary and its length be calculated correctly. [Restart, fix_modify, output, run start/stop, minimize info:] diff --git a/doc/src/fix_ti_spring.txt b/doc/src/fix_ti_spring.txt index 40e595e21e..afb1dcf8ff 100644 --- a/doc/src/fix_ti_spring.txt +++ b/doc/src/fix_ti_spring.txt @@ -144,7 +144,11 @@ this fix. "fix spring"_fix_spring.html, "fix adapt"_fix_adapt.html -[Restrictions:] none +[Restrictions:] + +This fix is part of the USER-MISC package. It is only enabled if +LAMMPS was built with that package. See the "Making +LAMMPS"_Section_start.html#start_3 section for more info. [Default:] diff --git a/doc/src/kspace_modify.txt b/doc/src/kspace_modify.txt index b488df9627..6d27bb7076 100644 --- a/doc/src/kspace_modify.txt +++ b/doc/src/kspace_modify.txt @@ -219,10 +219,10 @@ instead of using the virial equation. This option cannot be used to access individual components of the pressure tensor, to compute per-atom virial, or with suffix kspace/pair styles of MSM, like OMP or GPU. -The {fftbench} keyword applies only to PPPM. It is on by default. If -this option is turned off, LAMMPS will not take the time at the end -of a run to give FFT benchmark timings, and will finish a few seconds -faster than it would if this option were on. +The {fftbench} keyword applies only to PPPM. It is off by default. 
If +this option is turned on, LAMMPS will perform a short FFT benchmark +computation and report its timings, and will thus finish a some seconds +later than it would if this option were off. The {collective} keyword applies only to PPPM. It is set to {no} by default, except on IBM BlueGene machines. If this option is set to @@ -306,9 +306,10 @@ parameters, see the "How-To"_Section_howto.html#howto_24 discussion. The option defaults are mesh = mesh/disp = 0 0 0, order = order/disp = 5 (PPPM), order = 10 (MSM), minorder = 2, overlap = yes, force = -1.0, gewald = gewald/disp = 0.0, slab = 1.0, compute = yes, cutoff/adjust = -yes (MSM), pressure/scalar = yes (MSM), fftbench = yes (PPPM), diff = ik +yes (MSM), pressure/scalar = yes (MSM), fftbench = no (PPPM), diff = ik (PPPM), mix/disp = pair, force/disp/real = -1.0, force/disp/kspace = -1.0, -split = 0, tol = 1.0e-6, and disp/auto = no. +split = 0, tol = 1.0e-6, and disp/auto = no. For pppm/intel, order = +order/disp = 7. :line diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt index 371540bd68..4f27c9aa78 100644 --- a/doc/src/kspace_style.txt +++ b/doc/src/kspace_style.txt @@ -33,12 +33,16 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg} accuracy = desired relative error in forces {pppm/gpu} value = accuracy accuracy = desired relative error in forces + {pppm/intel} value = accuracy + accuracy = desired relative error in forces {pppm/kk} value = accuracy accuracy = desired relative error in forces {pppm/omp} value = accuracy accuracy = desired relative error in forces {pppm/cg/omp} value = accuracy accuracy = desired relative error in forces + {pppm/disp/intel} value = accuracy + accuracy = desired relative error in forces {pppm/tip4p/omp} value = accuracy accuracy = desired relative error in forces {pppm/stagger} value = accuracy diff --git a/doc/src/neb.txt b/doc/src/neb.txt index a4afc2fe6d..d2e8be3f03 100644 --- a/doc/src/neb.txt +++ b/doc/src/neb.txt @@ -344,7 
+344,7 @@ informations can help understanding what is going wrong. For instance when the path angle becomes accute the definition of tangent used in the NEB calculation is questionable and the NEB cannot may diverge "(Maras)"_#Maras2. - + When running on multiple partitions, LAMMPS produces additional log files for each partition, e.g. log.lammps.0, log.lammps.1, etc. For a diff --git a/doc/src/pair_agni.txt b/doc/src/pair_agni.txt index 27fb6c10fe..06dcccb9d9 100644 --- a/doc/src/pair_agni.txt +++ b/doc/src/pair_agni.txt @@ -40,8 +40,8 @@ vectorial atomic forces. Only a single pair_coeff command is used with the {agni} style which specifies an AGNI potential file containing the parameters of the -force field for the needed elements. These are mapped to LAMMPS atom -types by specifying N additional arguments after the filename in the +force field for the needed elements. These are mapped to LAMMPS atom +types by specifying N additional arguments after the filename in the pair_coeff command, where N is the number of LAMMPS atom types: filename @@ -52,13 +52,13 @@ to specify the path for the force field file. An AGNI force field is fully specified by the filename which contains the parameters of the force field, i.e., the reference training environments -used to construct the machine learning force field. Example force field -and input files are provided in the examples/USER/misc/agni directory. +used to construct the machine learning force field. Example force field +and input files are provided in the examples/USER/misc/agni directory. :line -Styles with {omp} suffix is functionally the same as the corresponding -style without the suffix. They have been optimized to run faster, depending +Styles with {omp} suffix is functionally the same as the corresponding +style without the suffix. They have been optimized to run faster, depending on your available hardware, as discussed in "Section 5"_Section_accelerate.html of the manual. 
The accelerated style takes the same arguments and should produce the same results, except for round-off and precision diff --git a/doc/src/pair_buck.txt b/doc/src/pair_buck.txt index 49161404c3..e705e735fb 100644 --- a/doc/src/pair_buck.txt +++ b/doc/src/pair_buck.txt @@ -75,7 +75,7 @@ Lennard-Jones 12/6) given by :c,image(Eqs/pair_buck.jpg) where rho is an ionic-pair dependent length parameter, and Rc is the -cutoff on both terms. +cutoff on both terms. The styles with {coul/cut} or {coul/long} or {coul/msm} add a Coulombic term as described for the "lj/cut"_pair_lj.html pair styles. diff --git a/doc/src/pair_charmm.txt b/doc/src/pair_charmm.txt index 9c5973c725..1e78607c08 100644 --- a/doc/src/pair_charmm.txt +++ b/doc/src/pair_charmm.txt @@ -104,7 +104,15 @@ charmmfsw"_dihedral_charmm.html command. Eventually code from the new styles will propagate into the related pair styles (e.g. implicit, accelerator, free energy variants). -The general CHARMM formulas are as follows +NOTE: The newest CHARMM pair styles reset the Coulombic energy +conversion factor used internally in the code, from the LAMMPS value +to the CHARMM value, as if it were effectively a parameter of the +force field. This is because the CHARMM code uses a slightly +different value for this conversion factor in "real +units"_units.html (Kcal/mole), namely CHARMM = 332.0716, LAMMPS = +332.06371. This is to enable more precise agreement by LAMMPS with +the CHARMM force field energies and forces, when using one of these +two CHARMM pair styles. :c,image(Eqs/pair_charmm.jpg) diff --git a/doc/src/pair_dipole.txt b/doc/src/pair_dipole.txt index a9622b32fd..985581cac8 100644 --- a/doc/src/pair_dipole.txt +++ b/doc/src/pair_dipole.txt @@ -71,6 +71,14 @@ and force, Fij = -Fji as symmetric forces, and Tij != -Tji since the torques do not act symmetrically. These formulas are discussed in "(Allen)"_#Allen2 and in "(Toukmaji)"_#Toukmaji2. 
+Also note, that in the code, all of these terms (except Elj) have a +C/epsilon prefactor, the same as the Coulombic term in the LJ + +Coulombic pair styles discussed "here"_pair_lj.html. C is an +energy-conversion constant and epsilon is the dielectric constant +which can be set by the "dielectric"_dielectric.html command. The +same is true of the equations that follow for other dipole pair +styles. + Style {lj/sf/dipole/sf} computes "shifted-force" interactions between pairs of particles that each have a charge and/or a point dipole moment. In general, a shifted-force potential is a (sligthly) modified diff --git a/doc/src/pair_exp6_rx.txt b/doc/src/pair_exp6_rx.txt index 47045a5933..cbc17d357d 100644 --- a/doc/src/pair_exp6_rx.txt +++ b/doc/src/pair_exp6_rx.txt @@ -55,33 +55,33 @@ defined in the reaction kinetics files specified with the "fix rx"_fix_rx.html command or they must correspond to the tag "1fluid", signifying interaction with a product species mixture determined through a one-fluid approximation. The interaction potential is -weighted by the geometric average of either the mole fraction concentrations -or the number of molecules associated with the interacting coarse-grained -particles (see the {fractional} or {molecular} weighting pair style options). +weighted by the geometric average of either the mole fraction concentrations +or the number of molecules associated with the interacting coarse-grained +particles (see the {fractional} or {molecular} weighting pair style options). The coarse-grained potential is stored before and after the reaction kinetics solver is applied, where the difference is defined to be the internal chemical energy (uChem). -The fourth argument specifies the type of scaling that will be used +The fourth argument specifies the type of scaling that will be used to scale the EXP-6 parameters as reactions occur. Currently, there are three scaling options: {exponent}, {polynomial} and {none}. 
-Exponent scaling requires two additional arguments for scaling +Exponent scaling requires two additional arguments for scaling the {Rm} and {epsilon} parameters, respectively. The scaling factor -is computed by phi^exponent, where phi is the number of molecules -represented by the coarse-grain particle and exponent is specified +is computed by phi^exponent, where phi is the number of molecules +represented by the coarse-grain particle and exponent is specified as a pair coefficient argument for {Rm} and {epsilon}, respectively. -The {Rm} and {epsilon} parameters are multiplied by the scaling +The {Rm} and {epsilon} parameters are multiplied by the scaling factor to give the scaled interaction parameters for the CG particle. -Polynomial scaling requires a filename to be specified as a pair +Polynomial scaling requires a filename to be specified as a pair coeff argument. The file contains the coefficients to a fifth order -polynomial for the {alpha}, {epsilon} and {Rm} parameters that depend -upon phi (the number of molecules represented by the CG particle). +polynomial for the {alpha}, {epsilon} and {Rm} parameters that depend +upon phi (the number of molecules represented by the CG particle). The format of a polynomial file is provided below. The {none} option to the scaling does not have any additional pair coeff -arguments. This is equivalent to specifying the {exponent} option with +arguments. This is equivalent to specifying the {exponent} option with {Rm} and {epsilon} exponents of 0.0 and 0.0, respectively. The final argument specifies the interaction cutoff (optional). 
@@ -102,7 +102,7 @@ parenthesized comments): # POLYNOMIAL FILE (one or more comment or blank lines) :pre # General Functional Form: -# A*phi^5 + B*phi^4 + C*phi^3 + D*phi^2 + E*phi + F +# A*phi^5 + B*phi^4 + C*phi^3 + D*phi^2 + E*phi + F # # Parameter A B C D E F (blank) diff --git a/doc/src/pair_kolmogorov_crespi_z.txt b/doc/src/pair_kolmogorov_crespi_z.txt index 0879dc34d0..c7a6d4194f 100644 --- a/doc/src/pair_kolmogorov_crespi_z.txt +++ b/doc/src/pair_kolmogorov_crespi_z.txt @@ -24,25 +24,25 @@ pair_coeff 1 2 kolmogorov/crespi/z CC.KC C C :pre [Description:] -The {kolmogorov/crespi/z} style computes the Kolmogorov-Crespi interaction -potential as described in "(KC05)"_#KC05. An important simplification is made, -which is to take all normals along the z-axis. +The {kolmogorov/crespi/z} style computes the Kolmogorov-Crespi interaction +potential as described in "(KC05)"_#KC05. An important simplification is made, +which is to take all normals along the z-axis. :c,image(Eqs/pair_kolmogorov_crespi_z.jpg) -It is important to have a suffiently large cutoff to ensure smooth forces. -Energies are shifted so that they go continously to zero at the cutoff assuming +It is important to have a suffiently large cutoff to ensure smooth forces. +Energies are shifted so that they go continously to zero at the cutoff assuming that the exponential part of {Vij} (first term) decays sufficiently fast. This shift is achieved by the last term in the equation for {Vij} above. -This potential is intended for interactions between two layers of graphene. -Therefore, to avoid interaction between layers in multi-layered materials, -each layer should have a separate atom type and interactions should only +This potential is intended for interactions between two layers of graphene. +Therefore, to avoid interaction between layers in multi-layered materials, +each layer should have a separate atom type and interactions should only be computed between atom types of neighbouring layers. 
-The parameter file (e.g. CC.KC), is intended for use with metal -"units"_units.html, with energies in meV. An additional parameter, {S}, -is available to facilitate scaling of energies in accordance with +The parameter file (e.g. CC.KC), is intended for use with metal +"units"_units.html, with energies in meV. An additional parameter, {S}, +is available to facilitate scaling of energies in accordance with "(vanWijk)"_#vanWijk. This potential must be used in combination with hybrid/overlay. @@ -64,7 +64,7 @@ LAMMPS"_Section_start.html#start_3 section for more info. :line -:link(KC05) +:link(KC05) [(KC05)] A. N. Kolmogorov, V. H. Crespi, Phys. Rev. B 71, 235415 (2005) :link(vanWijk) diff --git a/doc/src/pair_lj_long.txt b/doc/src/pair_lj_long.txt index d559871f9d..da9f37b9c3 100644 --- a/doc/src/pair_lj_long.txt +++ b/doc/src/pair_lj_long.txt @@ -7,6 +7,7 @@ :line pair_style lj/long/coul/long command :h3 +pair_style lj/long/coul/long/intel command :h3 pair_style lj/long/coul/long/omp command :h3 pair_style lj/long/coul/long/opt command :h3 pair_style lj/long/tip4p/long command :h3 diff --git a/doc/src/pair_lj_smooth_linear.txt b/doc/src/pair_lj_smooth_linear.txt index 5721b02eb3..a48c441f54 100644 --- a/doc/src/pair_lj_smooth_linear.txt +++ b/doc/src/pair_lj_smooth_linear.txt @@ -104,3 +104,8 @@ This pair style can only be used via the {pair} keyword of the "pair_coeff"_pair_coeff.html, "pair lj/smooth"_pair_lj_smooth.html [Default:] none + +:line + +:link(Toxvaerd) +[(Toxvaerd)] Toxvaerd, Dyre, J Chem Phys, 134, 081102 (2011). 
diff --git a/doc/src/pair_multi_lucy_rx.txt b/doc/src/pair_multi_lucy_rx.txt index bf5d5636fe..77ed223e2a 100644 --- a/doc/src/pair_multi_lucy_rx.txt +++ b/doc/src/pair_multi_lucy_rx.txt @@ -97,9 +97,9 @@ tags must either correspond to the species defined in the reaction kinetics files specified with the "fix rx"_fix_rx.html command or they must correspond to the tag "1fluid", signifying interaction with a product species mixture determined through a one-fluid approximation. -The interaction potential is weighted by the geometric average of -either the mole fraction concentrations or the number of molecules -associated with the interacting coarse-grained particles (see the +The interaction potential is weighted by the geometric average of +either the mole fraction concentrations or the number of molecules +associated with the interacting coarse-grained particles (see the {fractional} or {molecular} weighting pair style options). The coarse-grained potential is stored before and after the reaction kinetics solver is applied, where the difference is defined to be the internal chemical energy (uChem). diff --git a/doc/src/pair_oxdna.txt b/doc/src/pair_oxdna.txt index 0a07417fd0..d9734f122d 100644 --- a/doc/src/pair_oxdna.txt +++ b/doc/src/pair_oxdna.txt @@ -39,17 +39,17 @@ pair_coeff * * oxdna/coaxstk 46.0 0.4 0.6 0.22 0.58 2.0 2.541592653589793 0.65 1 [Description:] -The {oxdna} pair styles compute the pairwise-additive parts of the oxDNA force field -for coarse-grained modelling of DNA. The effective interaction between the nucleotides consists of potentials for the +The {oxdna} pair styles compute the pairwise-additive parts of the oxDNA force field +for coarse-grained modelling of DNA. 
The effective interaction between the nucleotides consists of potentials for the excluded volume interaction {oxdna/excv}, the stacking {oxdna/stk}, cross-stacking {oxdna/xstk} and coaxial stacking interaction {oxdna/coaxstk} as well as the hydrogen-bonding interaction {oxdna/hbond} between complementary pairs of nucleotides on opposite strands. -The exact functional form of the pair styles is rather complex, which manifests itself in the 144 coefficients -in the above example. The individual potentials consist of products of modulation factors, -which themselves are constructed from a number of more basic potentials -(Morse, Lennard-Jones, harmonic angle and distance) as well as quadratic smoothing and modulation terms. +The exact functional form of the pair styles is rather complex, which manifests itself in the 144 coefficients +in the above example. The individual potentials consist of products of modulation factors, +which themselves are constructed from a number of more basic potentials +(Morse, Lennard-Jones, harmonic angle and distance) as well as quadratic smoothing and modulation terms. We refer to "(Ouldridge-DPhil)"_#Ouldridge-DPhil1 and "(Ouldridge)"_#Ouldridge1 for a detailed description of the oxDNA force field. @@ -57,8 +57,8 @@ NOTE: These pair styles have to be used together with the related oxDNA bond sty {oxdna/fene} for the connectivity of the phosphate backbone (see also documentation of "bond_style oxdna/fene"_bond_oxdna.html). With one exception the coefficients in the above example have to be kept fixed and cannot be changed without reparametrizing the entire model. -The exception is the first coefficient after {oxdna/stk} (T=0.1 in the above example). -When using a Langevin thermostat, e.g. through "fix langevin"_fix_langevin.html +The exception is the first coefficient after {oxdna/stk} (T=0.1 in the above example). +When using a Langevin thermostat, e.g. 
through "fix langevin"_fix_langevin.html or "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html the temperature coefficients have to be matched to the one used in the fix. @@ -79,7 +79,7 @@ LAMMPS"_Section_start.html#start_3 section for more info on packages. [Related commands:] -"bond_style oxdna/fene"_bond_oxdna.html, "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html, "pair_coeff"_pair_coeff.html, +"bond_style oxdna/fene"_bond_oxdna.html, "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html, "pair_coeff"_pair_coeff.html, "bond_style oxdna2/fene"_bond_oxdna.html, "pair_style oxdna2/excv"_pair_oxdna2.html [Default:] none diff --git a/doc/src/pair_oxdna2.txt b/doc/src/pair_oxdna2.txt index 1cc562d5f1..1728a0bc7b 100644 --- a/doc/src/pair_oxdna2.txt +++ b/doc/src/pair_oxdna2.txt @@ -45,17 +45,17 @@ pair_coeff * * oxdna2/dh 0.1 1.0 0.815 :pre [Description:] -The {oxdna2} pair styles compute the pairwise-additive parts of the oxDNA force field -for coarse-grained modelling of DNA. The effective interaction between the nucleotides consists of potentials for the +The {oxdna2} pair styles compute the pairwise-additive parts of the oxDNA force field +for coarse-grained modelling of DNA. The effective interaction between the nucleotides consists of potentials for the excluded volume interaction {oxdna2/excv}, the stacking {oxdna2/stk}, cross-stacking {oxdna2/xstk} and coaxial stacking interaction {oxdna2/coaxstk}, electrostatic Debye-Hueckel interaction {oxdna2/dh} as well as the hydrogen-bonding interaction {oxdna2/hbond} between complementary pairs of nucleotides on opposite strands. -The exact functional form of the pair styles is rather complex. -The individual potentials consist of products of modulation factors, -which themselves are constructed from a number of more basic potentials -(Morse, Lennard-Jones, harmonic angle and distance) as well as quadratic smoothing and modulation terms. +The exact functional form of the pair styles is rather complex. 
+The individual potentials consist of products of modulation factors, +which themselves are constructed from a number of more basic potentials +(Morse, Lennard-Jones, harmonic angle and distance) as well as quadratic smoothing and modulation terms. We refer to "(Snodin)"_#Snodin and the original oxDNA publications "(Ouldridge-DPhil)"_#Ouldridge-DPhil2 and "(Ouldridge)"_#Ouldridge2 for a detailed description of the oxDNA2 force field. @@ -63,7 +63,7 @@ NOTE: These pair styles have to be used together with the related oxDNA2 bond st {oxdna2/fene} for the connectivity of the phosphate backbone (see also documentation of "bond_style oxdna2/fene"_bond_oxdna.html). Almost all coefficients in the above example have to be kept fixed and cannot be changed without reparametrizing the entire model. -Exceptions are the first coefficient after {oxdna2/stk} (T=0.1 in the above example) and the coefficients +Exceptions are the first coefficient after {oxdna2/stk} (T=0.1 in the above example) and the coefficients after {oxdna2/dh} (T=0.1, rhos=1.0, qeff=0.815 in the above example). When using a Langevin thermostat e.g. through "fix langevin"_fix_langevin.html or "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html the temperature coefficients have to be matched to the one used in the fix. @@ -86,7 +86,7 @@ LAMMPS"_Section_start.html#start_3 section for more info on packages. 
[Related commands:] "bond_style oxdna2/fene"_bond_oxdna.html, "fix nve/dotc/langevin"_fix_nve_dotc_langevin.html, "pair_coeff"_pair_coeff.html, -"bond_style oxdna/fene"_bond_oxdna.html, "pair_style oxdna/excv"_pair_oxdna.html +"bond_style oxdna/fene"_bond_oxdna.html, "pair_style oxdna/excv"_pair_oxdna.html [Default:] none diff --git a/doc/src/pair_table_rx.txt b/doc/src/pair_table_rx.txt index d089a4f9da..f93af21da4 100644 --- a/doc/src/pair_table_rx.txt +++ b/doc/src/pair_table_rx.txt @@ -85,9 +85,9 @@ tags must either correspond to the species defined in the reaction kinetics files specified with the "fix rx"_fix_rx.html command or they must correspond to the tag "1fluid", signifying interaction with a product species mixture determined through a one-fluid approximation. -The interaction potential is weighted by the geometric average of -either the mole fraction concentrations or the number of molecules -associated with the interacting coarse-grained particles (see the +The interaction potential is weighted by the geometric average of +either the mole fraction concentrations or the number of molecules +associated with the interacting coarse-grained particles (see the {fractional} or {molecular} weighting pair style options). The coarse-grained potential is stored before and after the reaction kinetics solver is applied, where the difference is defined to be the internal chemical energy (uChem). diff --git a/doc/src/python.txt b/doc/src/python.txt index e00b90234c..c6538ded45 100644 --- a/doc/src/python.txt +++ b/doc/src/python.txt @@ -489,7 +489,7 @@ python"_Section_python.html. Note that it is important that the stand-alone LAMMPS executable and the LAMMPS shared library be consistent (built from the same source code files) in order for this to work. If the two have been built at different times using -different source files, problems may occur. +different source files, problems may occur. 
[Related commands:] diff --git a/doc/src/run_style.txt b/doc/src/run_style.txt index 0e3c1a939f..a67899420b 100644 --- a/doc/src/run_style.txt +++ b/doc/src/run_style.txt @@ -17,7 +17,7 @@ style = {verlet} or {verlet/split} or {respa} or {respa/omp} :ulb,l {verlet/split} args = none {respa} args = N n1 n2 ... keyword values ... N = # of levels of rRESPA - n1, n2, ... = loop factor between rRESPA levels (N-1 values) + n1, n2, ... = loop factors between rRESPA levels (N-1 values) zero or more keyword/value pairings may be appended to the loop factors keyword = {bond} or {angle} or {dihedral} or {improper} or {pair} or {inner} or {middle} or {outer} or {hybrid} or {kspace} @@ -55,7 +55,7 @@ style = {verlet} or {verlet/split} or {respa} or {respa/omp} :ulb,l run_style verlet run_style respa 4 2 2 2 bond 1 dihedral 2 pair 3 kspace 4 -run_style respa 4 2 2 2 bond 1 dihedral 2 inner 3 5.0 6.0 outer 4 kspace 4 :pre +run_style respa 4 2 2 2 bond 1 dihedral 2 inner 3 5.0 6.0 outer 4 kspace 4 run_style respa 3 4 2 bond 1 hybrid 2 2 1 kspace 3 :pre [Description:] diff --git a/doc/src/tutorial_github.txt b/doc/src/tutorial_github.txt index d6ec22589b..3e10b821ae 100644 --- a/doc/src/tutorial_github.txt +++ b/doc/src/tutorial_github.txt @@ -86,7 +86,7 @@ machine via HTTPS: or, if you have set up your GitHub account for using SSH keys, via SSH: $ git clone git@github.com:/lammps.git :pre - + You can find the proper URL by clicking the "Clone or download"-button: :c,image(JPG/tutorial_https_block.png) diff --git a/doc/src/tutorial_pylammps.txt b/doc/src/tutorial_pylammps.txt index 0b4fb32ed2..78cdd241fb 100644 --- a/doc/src/tutorial_pylammps.txt +++ b/doc/src/tutorial_pylammps.txt @@ -36,7 +36,7 @@ lammps.PyLammps :h4 higher-level abstraction built on top of original C-Types interface manipulation of Python objects -communication with LAMMPS is hidden from API user +communication with LAMMPS is hidden from API user shorter, more concise Python better IPython integration, designed 
for quick prototyping :ul @@ -328,7 +328,7 @@ IPyLammps Examples :h2 Examples of IPython notebooks can be found in the python/examples/pylammps subdirectory. To open these notebooks launch {jupyter notebook} inside this -directory and navigate to one of them. If you compiled and installed +directory and navigate to one of them. If you compiled and installed a LAMMPS shared library with exceptions, PNG, JPEG and FFMPEG support you should be able to rerun all of these notebooks. @@ -399,19 +399,19 @@ natoms = L.system.natoms :pre for i in range(niterations): iatom = random.randrange(0, natoms) current_atom = L.atoms\[iatom\] :pre - + x0, y0 = current_atom.position :pre - + dx = deltamove * random.uniform(-1, 1) dy = deltamove * random.uniform(-1, 1) :pre - + current_atom.position = (x0+dx, y0+dy) :pre - + L.run(1, "pre no post no") :pre - + e = L.eval("pe") energies.append(e) :pre - + if e <= elast: naccept += 1 elast = e @@ -460,4 +460,4 @@ Feedback and Contributing :h2 If you find this Python interface useful, please feel free to provide feedback and ideas on how to improve it to Richard Berger (richard.berger@temple.edu). We also want to encourage people to write tutorial style IPython notebooks showcasing LAMMPS usage -and maybe their latest research results. +and maybe their latest research results. 
diff --git a/examples/USER/misc/filter_corotate/in.bpti b/examples/USER/misc/filter_corotate/in.bpti index 6507a78704..2e4d8dda6f 100644 --- a/examples/USER/misc/filter_corotate/in.bpti +++ b/examples/USER/misc/filter_corotate/in.bpti @@ -28,7 +28,7 @@ thermo 100 thermo_style multi timestep 8 -run_style respa 3 2 8 bond 1 pair 2 kspace 3 +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 velocity all create 200.0 12345678 dist uniform #dump dump1 all atom 100 4pti.dump diff --git a/examples/USER/misc/filter_corotate/in.peptide b/examples/USER/misc/filter_corotate/in.peptide index 0a17f995b3..e10dc09f0d 100644 --- a/examples/USER/misc/filter_corotate/in.peptide +++ b/examples/USER/misc/filter_corotate/in.peptide @@ -20,7 +20,7 @@ thermo 50 timestep 8 -run_style respa 3 2 8 bond 1 pair 2 kspace 3 +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 fix 1 all nvt temp 250.0 250.0 100.0 tchain 1 fix cor all filter/corotate m 1.0 diff --git a/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.1 b/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.1 deleted file mode 100644 index 5253b47b2d..0000000000 --- a/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.1 +++ /dev/null @@ -1,240 +0,0 @@ -LAMMPS (10 Mar 2017) - using 1 OpenMP thread(s) per MPI task - -units real - -atom_style full -bond_style harmonic -angle_style charmm -dihedral_style charmm -improper_style harmonic - -pair_style lj/charmm/coul/long 8 10 -pair_modify mix arithmetic -kspace_style pppm 1e-4 - -read_data data.bpti - orthogonal box = (-10 -10 -30) to (50 50 30) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 892 atoms - scanning bonds ... - 4 = max bonds/atom - scanning angles ... - 6 = max angles/atom - scanning dihedrals ... - 18 = max dihedrals/atom - scanning impropers ... - 2 = max impropers/atom - reading bonds ... - 906 bonds - reading angles ... - 1626 angles - reading dihedrals ... - 2501 dihedrals - reading impropers ... 
- 137 impropers - 4 = max # of 1-2 neighbors - 9 = max # of 1-3 neighbors - 19 = max # of 1-4 neighbors - 21 = max # of special neighbors - -special_bonds charmm -neigh_modify delay 2 every 1 - - -# ------------- MINIMIZE ---------- - -minimize 1e-4 1e-6 1000 10000 -WARNING: Resetting reneighboring criteria during minimization (../min.cpp:168) -PPPM initialization ... -WARNING: System is not charge neutral, net charge = 6 (../kspace.cpp:302) -WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.203272 - grid = 16 16 16 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0316399 - estimated relative force accuracy = 9.52826e-05 - using double precision FFTs - 3d grid and FFT values/proc = 9261 4096 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 12 - ghost atom cutoff = 12 - binsize = 6, bins = 10 10 10 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/charmm/coul/long, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard -Per MPI rank memory usage (min/avg/max) = 17.8596/1/0 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0 -3075.6498 943.91164 -2131.7381 -380.67776 - 241 0 -4503.313 749.58662 -3753.7264 -29.045104 -Loop time of 3.35722 on 1 procs for 241 steps with 892 atoms - -99.7% CPU use with 1 MPI tasks x 1 OpenMP threads - -Minimization stats: - Stopping criterion = energy tolerance - Energy initial, next-to-last, final = - -2131.73812515 -3753.43984087 -3753.72636847 - Force two-norm initial, final = 1086.21 26.3688 - Force max component initial, final = 310.811 3.92748 - Final line search alpha, max atom move = 0.00596649 0.0234333 - Iterations, force evaluations = 241 463 - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total 
---------------------------------------------------------------- -Pair | 2.5003 | 2.5003 | 2.5003 | 0.0 | 74.48 -Bond | 0.24287 | 0.24287 | 0.24287 | 0.0 | 7.23 -Kspace | 0.53428 | 0.53428 | 0.53428 | 0.0 | 15.91 -Neigh | 0.069765 | 0.069765 | 0.069765 | 0.0 | 2.08 -Comm | 0.00065374 | 0.00065374 | 0.00065374 | 0.0 | 0.02 -Output | 0 | 0 | 0 | 0.0 | 0.00 -Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 0.009358 | | | 0.28 - -Nlocal: 892 ave 892 max 892 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 31 ave 31 max 31 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 148891 ave 148891 max 148891 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 148891 -Ave neighs/atom = 166.918 -Ave special neighs/atom = 10.9395 -Neighbor list builds = 15 -Dangerous builds = 0 -reset_timestep 0 - -# ------------- RUN --------------- - -thermo 100 -thermo_style multi -timestep 8 - -run_style respa 3 2 8 bond 1 pair 2 kspace 3 -Respa levels: - 1 = bond angle dihedral improper - 2 = pair - 3 = kspace - -velocity all create 200.0 12345678 dist uniform -#dump dump1 all atom 100 4pti.dump - -fix 1 all nvt temp 200 300 25 -fix cor all filter/corotate m 1.0 - 163 = # of size 2 clusters - 0 = # of size 3 clusters - 25 = # of size 4 clusters - 0 = # of size 5 clusters - 100 = # of frozen angles - -run 1000 -PPPM initialization ... 
-WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.203272 - grid = 16 16 16 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0316399 - estimated relative force accuracy = 9.52826e-05 - using double precision FFTs - 3d grid and FFT values/proc = 9261 4096 -Per MPI rank memory usage (min/avg/max) = 19.5425/1/0 Mbytes ----------------- Step 0 ----- CPU = 0.0000 (sec) ---------------- -TotEng = -3220.3378 KinEng = 531.1804 Temp = 200.0000 -PotEng = -3751.5181 E_bond = 42.2810 E_angle = 345.2592 -E_dihed = 337.8361 E_impro = 24.2103 E_vdwl = -288.5339 -E_coul = -886.3622 E_long = -3326.2088 Press = 83.2283 ----------------- Step 100 ----- CPU = 3.9414 (sec) ---------------- -TotEng = -2718.8970 KinEng = 538.6206 Temp = 202.8014 -PotEng = -3257.5176 E_bond = 203.3367 E_angle = 566.5317 -E_dihed = 397.6202 E_impro = 34.6623 E_vdwl = -248.7451 -E_coul = -874.5122 E_long = -3336.4111 Press = 135.8662 ----------------- Step 200 ----- CPU = 7.9028 (sec) ---------------- -TotEng = -2660.1406 KinEng = 626.3319 Temp = 235.8265 -PotEng = -3286.4725 E_bond = 209.5147 E_angle = 591.7773 -E_dihed = 388.9591 E_impro = 29.4992 E_vdwl = -243.5808 -E_coul = -923.5115 E_long = -3339.1306 Press = 88.9000 ----------------- Step 300 ----- CPU = 11.8246 (sec) ---------------- -TotEng = -2673.8090 KinEng = 616.7924 Temp = 232.2346 -PotEng = -3290.6014 E_bond = 202.8254 E_angle = 568.6860 -E_dihed = 378.4182 E_impro = 38.2399 E_vdwl = -221.3236 -E_coul = -915.3004 E_long = -3342.1468 Press = 78.8527 ----------------- Step 400 ----- CPU = 15.7990 (sec) ---------------- -TotEng = -2614.9416 KinEng = 649.3474 Temp = 244.4922 -PotEng = -3264.2890 E_bond = 211.6116 E_angle = 617.2026 -E_dihed = 399.8744 E_impro = 40.2678 E_vdwl = -211.7790 -E_coul = -978.1624 E_long = -3343.3041 Press = -4.1958 ----------------- Step 500 ----- CPU = 19.8146 (sec) ---------------- -TotEng = -2588.6772 KinEng = 660.1424 Temp = 248.5568 -PotEng = 
-3248.8196 E_bond = 218.4786 E_angle = 620.8605 -E_dihed = 390.3220 E_impro = 41.6794 E_vdwl = -226.3657 -E_coul = -953.1676 E_long = -3340.6269 Press = 99.3200 ----------------- Step 600 ----- CPU = 23.8587 (sec) ---------------- -TotEng = -2550.4618 KinEng = 693.3384 Temp = 261.0557 -PotEng = -3243.8002 E_bond = 232.3563 E_angle = 606.2922 -E_dihed = 396.2469 E_impro = 37.1980 E_vdwl = -235.8425 -E_coul = -937.1208 E_long = -3342.9303 Press = -21.7737 ----------------- Step 700 ----- CPU = 27.8381 (sec) ---------------- -TotEng = -2554.4355 KinEng = 692.8951 Temp = 260.8888 -PotEng = -3247.3306 E_bond = 216.3395 E_angle = 637.7785 -E_dihed = 391.5940 E_impro = 43.1426 E_vdwl = -187.6159 -E_coul = -1008.1694 E_long = -3340.3998 Press = 75.1484 ----------------- Step 800 ----- CPU = 31.8039 (sec) ---------------- -TotEng = -2508.3551 KinEng = 699.0766 Temp = 263.2163 -PotEng = -3207.4317 E_bond = 241.9936 E_angle = 641.3631 -E_dihed = 386.2198 E_impro = 43.7793 E_vdwl = -217.7523 -E_coul = -964.6070 E_long = -3338.4282 Press = -127.7337 ----------------- Step 900 ----- CPU = 35.7700 (sec) ---------------- -TotEng = -2452.7644 KinEng = 762.1842 Temp = 286.9776 -PotEng = -3214.9485 E_bond = 243.9191 E_angle = 649.8664 -E_dihed = 382.4351 E_impro = 39.0029 E_vdwl = -221.3389 -E_coul = -970.8965 E_long = -3337.9366 Press = 122.7720 ----------------- Step 1000 ----- CPU = 39.7695 (sec) ---------------- -TotEng = -2386.6805 KinEng = 799.0253 Temp = 300.8490 -PotEng = -3185.7058 E_bond = 265.3649 E_angle = 661.7543 -E_dihed = 374.6843 E_impro = 38.6877 E_vdwl = -229.2030 -E_coul = -960.7041 E_long = -3336.2899 Press = -17.9910 -Loop time of 39.7695 on 1 procs for 1000 steps with 892 atoms - -Performance: 17.380 ns/day, 1.381 hours/ns, 25.145 timesteps/s -99.6% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 29.169 | 
29.169 | 29.169 | 0.0 | 73.34 -Bond | 7.6249 | 7.6249 | 7.6249 | 0.0 | 19.17 -Kspace | 1.1525 | 1.1525 | 1.1525 | 0.0 | 2.90 -Neigh | 0.87606 | 0.87606 | 0.87606 | 0.0 | 2.20 -Comm | 0.01563 | 0.01563 | 0.01563 | 0.0 | 0.04 -Output | 0.00048423 | 0.00048423 | 0.00048423 | 0.0 | 0.00 -Modify | 0.80446 | 0.80446 | 0.80446 | 0.0 | 2.02 -Other | | 0.1266 | | | 0.32 - -Nlocal: 892 ave 892 max 892 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 27 ave 27 max 27 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 146206 ave 146206 max 146206 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 146206 -Ave neighs/atom = 163.908 -Ave special neighs/atom = 10.9395 -Neighbor list builds = 186 -Dangerous builds = 0 - -unfix cor -unfix 1 - - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:43 diff --git a/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.4 b/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.4 deleted file mode 100644 index 4300c1caf5..0000000000 --- a/examples/USER/misc/filter_corotate/log.10Mar2017.bpti.g++.4 +++ /dev/null @@ -1,240 +0,0 @@ -LAMMPS (10 Mar 2017) - using 1 OpenMP thread(s) per MPI task - -units real - -atom_style full -bond_style harmonic -angle_style charmm -dihedral_style charmm -improper_style harmonic - -pair_style lj/charmm/coul/long 8 10 -pair_modify mix arithmetic -kspace_style pppm 1e-4 - -read_data data.bpti - orthogonal box = (-10 -10 -30) to (50 50 30) - 1 by 2 by 2 MPI processor grid - reading atoms ... - 892 atoms - scanning bonds ... - 4 = max bonds/atom - scanning angles ... - 6 = max angles/atom - scanning dihedrals ... - 18 = max dihedrals/atom - scanning impropers ... - 2 = max impropers/atom - reading bonds ... - 906 bonds - reading angles ... - 1626 angles - reading dihedrals ... - 2501 dihedrals - reading impropers ... 
- 137 impropers - 4 = max # of 1-2 neighbors - 9 = max # of 1-3 neighbors - 19 = max # of 1-4 neighbors - 21 = max # of special neighbors - -special_bonds charmm -neigh_modify delay 2 every 1 - - -# ------------- MINIMIZE ---------- - -minimize 1e-4 1e-6 1000 10000 -WARNING: Resetting reneighboring criteria during minimization (../min.cpp:168) -PPPM initialization ... -WARNING: System is not charge neutral, net charge = 6 (../kspace.cpp:302) -WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.203272 - grid = 16 16 16 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0316399 - estimated relative force accuracy = 9.52826e-05 - using double precision FFTs - 3d grid and FFT values/proc = 3549 1024 -Neighbor list info ... - update every 1 steps, delay 0 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 12 - ghost atom cutoff = 12 - binsize = 6, bins = 10 10 10 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/charmm/coul/long, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard -Per MPI rank memory usage (min/avg/max) = 16.9693/0.981879/0 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 0 -3075.6498 943.91164 -2131.7381 -380.67776 - 241 0 -4503.3131 749.58666 -3753.7264 -29.045153 -Loop time of 1.26594 on 4 procs for 241 steps with 892 atoms - -99.0% CPU use with 4 MPI tasks x 1 OpenMP threads - -Minimization stats: - Stopping criterion = energy tolerance - Energy initial, next-to-last, final = - -2131.73812515 -3753.43983927 -3753.72640137 - Force two-norm initial, final = 1086.21 26.3688 - Force max component initial, final = 310.811 3.92751 - Final line search alpha, max atom move = 0.00596649 0.0234334 - Iterations, force evaluations = 241 463 - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total 
---------------------------------------------------------------- -Pair | 0.34267 | 0.63792 | 0.90268 | 25.2 | 50.39 -Bond | 0.025776 | 0.063318 | 0.095631 | 10.8 | 5.00 -Kspace | 0.21904 | 0.51601 | 0.84895 | 31.3 | 40.76 -Neigh | 0.023185 | 0.023363 | 0.023538 | 0.1 | 1.85 -Comm | 0.012025 | 0.014189 | 0.016335 | 1.4 | 1.12 -Output | 0 | 0 | 0 | 0.0 | 0.00 -Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 0.01114 | | | 0.88 - -Nlocal: 223 ave 323 max 89 min -Histogram: 1 0 0 0 1 0 0 0 1 1 -Nghost: 613 ave 675 max 557 min -Histogram: 1 0 0 1 0 1 0 0 0 1 -Neighs: 37222.8 ave 50005 max 20830 min -Histogram: 1 0 0 0 1 0 0 1 0 1 - -Total # of neighbors = 148891 -Ave neighs/atom = 166.918 -Ave special neighs/atom = 10.9395 -Neighbor list builds = 15 -Dangerous builds = 0 -reset_timestep 0 - -# ------------- RUN --------------- - -thermo 100 -thermo_style multi -timestep 8 - -run_style respa 3 2 8 bond 1 pair 2 kspace 3 -Respa levels: - 1 = bond angle dihedral improper - 2 = pair - 3 = kspace - -velocity all create 200.0 12345678 dist uniform -#dump dump1 all atom 100 4pti.dump - -fix 1 all nvt temp 200 300 25 -fix cor all filter/corotate m 1.0 - 163 = # of size 2 clusters - 0 = # of size 3 clusters - 25 = # of size 4 clusters - 0 = # of size 5 clusters - 100 = # of frozen angles - -run 1000 -PPPM initialization ... 
-WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.203272 - grid = 16 16 16 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0316399 - estimated relative force accuracy = 9.52826e-05 - using double precision FFTs - 3d grid and FFT values/proc = 3549 1024 -Per MPI rank memory usage (min/avg/max) = 17.142/0.97212/0 Mbytes ----------------- Step 0 ----- CPU = 0.0000 (sec) ---------------- -TotEng = -3220.3378 KinEng = 531.1804 Temp = 200.0000 -PotEng = -3751.5182 E_bond = 42.2810 E_angle = 345.2592 -E_dihed = 337.8361 E_impro = 24.2103 E_vdwl = -288.5339 -E_coul = -886.3622 E_long = -3326.2088 Press = 83.2282 ----------------- Step 100 ----- CPU = 1.5457 (sec) ---------------- -TotEng = -2718.9184 KinEng = 538.6205 Temp = 202.8014 -PotEng = -3257.5389 E_bond = 203.3365 E_angle = 566.5311 -E_dihed = 397.6202 E_impro = 34.6621 E_vdwl = -248.7451 -E_coul = -874.5326 E_long = -3336.4111 Press = 135.8435 ----------------- Step 200 ----- CPU = 3.0720 (sec) ---------------- -TotEng = -2660.1146 KinEng = 626.3474 Temp = 235.8323 -PotEng = -3286.4620 E_bond = 209.5168 E_angle = 591.7735 -E_dihed = 388.9615 E_impro = 29.5000 E_vdwl = -243.5840 -E_coul = -923.4998 E_long = -3339.1299 Press = 88.8857 ----------------- Step 300 ----- CPU = 4.5597 (sec) ---------------- -TotEng = -2669.7442 KinEng = 619.3625 Temp = 233.2023 -PotEng = -3289.1067 E_bond = 203.4405 E_angle = 569.5281 -E_dihed = 378.3314 E_impro = 38.2880 E_vdwl = -221.1904 -E_coul = -915.3396 E_long = -3342.1646 Press = 79.3780 ----------------- Step 400 ----- CPU = 5.9808 (sec) ---------------- -TotEng = -2618.9975 KinEng = 644.6145 Temp = 242.7102 -PotEng = -3263.6119 E_bond = 209.5864 E_angle = 618.8954 -E_dihed = 401.3798 E_impro = 39.9064 E_vdwl = -212.1271 -E_coul = -977.1589 E_long = -3344.0940 Press = -7.8938 ----------------- Step 500 ----- CPU = 7.4159 (sec) ---------------- -TotEng = -2579.7486 KinEng = 666.4643 Temp = 250.9371 -PotEng = 
-3246.2129 E_bond = 219.2549 E_angle = 620.3474 -E_dihed = 388.4395 E_impro = 41.4499 E_vdwl = -225.9686 -E_coul = -949.3689 E_long = -3340.3672 Press = 113.2543 ----------------- Step 600 ----- CPU = 8.9252 (sec) ---------------- -TotEng = -2535.8235 KinEng = 708.5919 Temp = 266.7990 -PotEng = -3244.4154 E_bond = 243.9451 E_angle = 606.0866 -E_dihed = 400.0562 E_impro = 33.9708 E_vdwl = -223.1319 -E_coul = -964.9940 E_long = -3340.3482 Press = -102.4475 ----------------- Step 700 ----- CPU = 10.4022 (sec) ---------------- -TotEng = -2552.6681 KinEng = 702.3080 Temp = 264.4330 -PotEng = -3254.9761 E_bond = 250.8834 E_angle = 639.0977 -E_dihed = 386.4014 E_impro = 42.3004 E_vdwl = -224.4816 -E_coul = -1011.8551 E_long = -3337.3222 Press = 10.6424 ----------------- Step 800 ----- CPU = 11.8699 (sec) ---------------- -TotEng = -2423.5415 KinEng = 772.1254 Temp = 290.7206 -PotEng = -3195.6670 E_bond = 238.5831 E_angle = 640.9180 -E_dihed = 377.7994 E_impro = 40.3135 E_vdwl = -216.5705 -E_coul = -935.1087 E_long = -3341.6019 Press = -38.2479 ----------------- Step 900 ----- CPU = 13.3548 (sec) ---------------- -TotEng = -2394.4779 KinEng = 766.6895 Temp = 288.6739 -PotEng = -3161.1673 E_bond = 284.8428 E_angle = 671.0959 -E_dihed = 380.3406 E_impro = 51.2975 E_vdwl = -219.5211 -E_coul = -990.6305 E_long = -3338.5925 Press = -15.2279 ----------------- Step 1000 ----- CPU = 14.7908 (sec) ---------------- -TotEng = -2340.1471 KinEng = 799.0198 Temp = 300.8469 -PotEng = -3139.1669 E_bond = 271.0389 E_angle = 683.8278 -E_dihed = 407.0795 E_impro = 39.6209 E_vdwl = -230.5355 -E_coul = -974.2981 E_long = -3335.9003 Press = -94.3420 -Loop time of 14.7909 on 4 procs for 1000 steps with 892 atoms - -Performance: 46.732 ns/day, 0.514 hours/ns, 67.609 timesteps/s -99.1% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 4.4184 | 
7.5543 | 10.133 | 74.2 | 51.07 -Bond | 0.94027 | 1.9781 | 2.7492 | 54.4 | 13.37 -Kspace | 0.45487 | 0.45887 | 0.46343 | 0.4 | 3.10 -Neigh | 0.28145 | 0.28339 | 0.28539 | 0.3 | 1.92 -Comm | 0.7515 | 4.1484 | 8.3861 | 135.5 | 28.05 -Output | 0.00049973 | 0.00055474 | 0.00066924 | 0.0 | 0.00 -Modify | 0.26165 | 0.31142 | 0.35023 | 6.7 | 2.11 -Other | | 0.05572 | | | 0.38 - -Nlocal: 223 ave 313 max 122 min -Histogram: 1 0 0 1 0 0 0 1 0 1 -Nghost: 584.5 ave 605 max 553 min -Histogram: 1 0 0 0 0 1 0 0 0 2 -Neighs: 35448 ave 42093 max 25175 min -Histogram: 1 0 0 0 0 0 1 1 0 1 - -Total # of neighbors = 141792 -Ave neighs/atom = 158.96 -Ave special neighs/atom = 10.9395 -Neighbor list builds = 186 -Dangerous builds = 0 - -unfix cor -unfix 1 - - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:16 diff --git a/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.1 b/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.1 deleted file mode 100644 index 23dd4c8a89..0000000000 --- a/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.1 +++ /dev/null @@ -1,146 +0,0 @@ -LAMMPS (10 Mar 2017) - using 1 OpenMP thread(s) per MPI task -# Solvated 5-mer peptide, run for 8ps in NVT - -units real -atom_style full - -pair_style lj/charmm/coul/long 8.0 10.0 10.0 -bond_style harmonic -angle_style charmm -dihedral_style charmm -improper_style harmonic -kspace_style pppm 0.0001 - -read_data data.peptide - orthogonal box = (36.8402 41.0137 29.7681) to (64.2116 68.3851 57.1395) - 1 by 1 by 1 MPI processor grid - reading atoms ... - 2004 atoms - reading velocities ... - 2004 velocities - scanning bonds ... - 3 = max bonds/atom - scanning angles ... - 6 = max angles/atom - scanning dihedrals ... - 14 = max dihedrals/atom - scanning impropers ... - 1 = max impropers/atom - reading bonds ... - 1365 bonds - reading angles ... - 786 angles - reading dihedrals ... - 207 dihedrals - reading impropers ... 
- 12 impropers - 4 = max # of 1-2 neighbors - 7 = max # of 1-3 neighbors - 14 = max # of 1-4 neighbors - 18 = max # of special neighbors - -neighbor 2.0 bin -neigh_modify delay 5 - -thermo 50 -#dump dump1 all atom 100 peptide.dump - -timestep 8 - -run_style respa 3 2 8 bond 1 pair 2 kspace 3 -Respa levels: - 1 = bond angle dihedral improper - 2 = pair - 3 = kspace - -fix 1 all nvt temp 250.0 250.0 100.0 tchain 1 -fix cor all filter/corotate m 1.0 - 19 = # of size 2 clusters - 0 = # of size 3 clusters - 3 = # of size 4 clusters - 0 = # of size 5 clusters - 646 = # of frozen angles -run 1000 -PPPM initialization ... -WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.268725 - grid = 15 15 15 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0228209 - estimated relative force accuracy = 6.87243e-05 - using double precision FFTs - 3d grid and FFT values/proc = 10648 3375 -Neighbor list info ... - update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 12 - ghost atom cutoff = 12 - binsize = 6, bins = 5 5 5 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/charmm/coul/long, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard -Per MPI rank memory usage (min/avg/max) = 22.6706/1/0 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 190.0857 -6785.6785 70.391457 -5580.3684 19434.821 - 50 239.46028 -7546.5667 1092.8874 -5023.9668 -24643.891 - 100 242.81799 -7125.5527 416.0788 -5259.7139 15525.465 - 150 235.97108 -7531.9334 932.35464 -5190.6987 -14838.489 - 200 252.06415 -7195.6011 568.02993 -5122.6064 8841.332 - 250 249.99431 -7586.5092 881.83491 -5212.0676 -9330.345 - 300 240.3382 -7333.0933 633.29951 -5264.8395 5137.9757 - 350 255.34529 -7568.2413 856.46371 -5187.2226 -6206.063 - 400 242.99276 -7419.9031 713.23943 -5255.8602 2447.0091 - 450 251.10653 -7622.061 
844.20584 -5278.6079 -4906.6559 - 500 255.59314 -7439.253 710.84907 -5202.3691 1571.0032 - 550 253.2025 -7660.5101 823.05373 -5325.695 -4551.399 - 600 249.05313 -7509.6729 741.48104 -5281.2046 992.87 - 650 251.75984 -7593.6589 847.08244 -5243.4286 -3510.1176 - 700 249.25027 -7601.9112 794.0912 -5319.6557 305.76021 - 750 255.415 -7602.2674 822.98524 -5254.3109 -2333.421 - 800 241.99621 -7643.8878 796.53352 -5402.5008 -298.66565 - 850 253.6428 -7598.3764 816.45457 -5267.5316 -1905.3478 - 900 247.20231 -7690.2806 789.75999 -5424.5838 -1331.7228 - 950 255.92583 -7634.7505 831.18272 -5275.5466 -2186.5117 - 1000 253.2126 -7647.9526 823.93602 -5312.195 -1189.9659 -Loop time of 150.664 on 1 procs for 1000 steps with 2004 atoms - -Performance: 4.588 ns/day, 5.231 hours/ns, 6.637 timesteps/s -99.7% CPU use with 1 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total ---------------------------------------------------------------- -Pair | 135.81 | 135.81 | 135.81 | 0.0 | 90.14 -Bond | 2.5889 | 2.5889 | 2.5889 | 0.0 | 1.72 -Kspace | 2.0379 | 2.0379 | 2.0379 | 0.0 | 1.35 -Neigh | 5.893 | 5.893 | 5.893 | 0.0 | 3.91 -Comm | 1.6998 | 1.6998 | 1.6998 | 0.0 | 1.13 -Output | 0.00077915 | 0.00077915 | 0.00077915 | 0.0 | 0.00 -Modify | 2 | 2 | 2 | 0.0 | 1.33 -Other | | 0.6352 | | | 0.42 - -Nlocal: 2004 ave 2004 max 2004 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 11197 ave 11197 max 11197 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 707779 ave 707779 max 707779 min -Histogram: 1 0 0 0 0 0 0 0 0 0 - -Total # of neighbors = 707779 -Ave neighs/atom = 353.183 -Ave special neighs/atom = 2.34032 -Neighbor list builds = 200 -Dangerous builds = 200 -unfix cor -unfix 1 - - - - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:02:30 diff --git a/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.4 b/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.4 deleted file 
mode 100644 index 2cdd645fe3..0000000000 --- a/examples/USER/misc/filter_corotate/log.10Mar2017.peptide.g++.4 +++ /dev/null @@ -1,146 +0,0 @@ -LAMMPS (10 Mar 2017) - using 1 OpenMP thread(s) per MPI task -# Solvated 5-mer peptide, run for 8ps in NVT - -units real -atom_style full - -pair_style lj/charmm/coul/long 8.0 10.0 10.0 -bond_style harmonic -angle_style charmm -dihedral_style charmm -improper_style harmonic -kspace_style pppm 0.0001 - -read_data data.peptide - orthogonal box = (36.8402 41.0137 29.7681) to (64.2116 68.3851 57.1395) - 1 by 2 by 2 MPI processor grid - reading atoms ... - 2004 atoms - reading velocities ... - 2004 velocities - scanning bonds ... - 3 = max bonds/atom - scanning angles ... - 6 = max angles/atom - scanning dihedrals ... - 14 = max dihedrals/atom - scanning impropers ... - 1 = max impropers/atom - reading bonds ... - 1365 bonds - reading angles ... - 786 angles - reading dihedrals ... - 207 dihedrals - reading impropers ... - 12 impropers - 4 = max # of 1-2 neighbors - 7 = max # of 1-3 neighbors - 14 = max # of 1-4 neighbors - 18 = max # of special neighbors - -neighbor 2.0 bin -neigh_modify delay 5 - -thermo 50 -#dump dump1 all atom 100 peptide.dump - -timestep 8 - -run_style respa 3 2 8 bond 1 pair 2 kspace 3 -Respa levels: - 1 = bond angle dihedral improper - 2 = pair - 3 = kspace - -fix 1 all nvt temp 250.0 250.0 100.0 tchain 1 -fix cor all filter/corotate m 1.0 - 19 = # of size 2 clusters - 0 = # of size 3 clusters - 3 = # of size 4 clusters - 0 = # of size 5 clusters - 646 = # of frozen angles -run 1000 -PPPM initialization ... -WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) - G vector (1/distance) = 0.268725 - grid = 15 15 15 - stencil order = 5 - estimated absolute RMS force accuracy = 0.0228209 - estimated relative force accuracy = 6.87243e-05 - using double precision FFTs - 3d grid and FFT values/proc = 4312 960 -Neighbor list info ... 
- update every 1 steps, delay 5 steps, check yes - max neighbors/atom: 2000, page size: 100000 - master list distance cutoff = 12 - ghost atom cutoff = 12 - binsize = 6, bins = 5 5 5 - 1 neighbor lists, perpetual/occasional/extra = 1 0 0 - (1) pair lj/charmm/coul/long, perpetual - attributes: half, newton on - pair build: half/bin/newton - stencil: half/bin/3d/newton - bin: standard -Per MPI rank memory usage (min/avg/max) = 16.8394/0.98826/0 Mbytes -Step Temp E_pair E_mol TotEng Press - 0 190.0857 -6785.6785 70.391457 -5580.3684 19434.821 - 50 239.46028 -7546.5668 1092.8874 -5023.9668 -24643.891 - 100 242.81819 -7125.5629 416.08082 -5259.7209 15525.244 - 150 235.94928 -7531.9186 932.50658 -5190.6621 -14842.431 - 200 255.85551 -7254.4065 568.8803 -5157.9249 8936.8651 - 250 247.8705 -7607.4583 858.06087 -5269.4711 -9926.0442 - 300 257.64176 -7267.424 618.5573 -5110.6004 5173.3307 - 350 251.65439 -7572.3806 821.15745 -5248.7049 -7092.327 - 400 256.87927 -7414.2145 655.33178 -5225.169 4119.4095 - 450 257.12393 -7576.5541 853.39773 -5187.9819 -5224.8823 - 500 242.42371 -7524.705 705.75357 -5371.5455 2111.3878 - 550 248.97188 -7541.076 792.86994 -5261.7038 -2278.4185 - 600 249.81862 -7592.0499 767.17722 -5333.3149 -1149.4759 - 650 253.31349 -7578.2665 813.75975 -5252.0827 -2915.5706 - 700 256.61152 -7588.1475 761.03356 -5294.9988 -747.88089 - 750 248.3606 -7660.457 837.71615 -5339.8883 -3072.8311 - 800 253.81464 -7638.6089 782.4229 -5340.7698 -1025.909 - 850 245.69185 -7660.9036 795.66792 -5398.3172 -2717.5851 - 900 249.13156 -7589.4769 806.43464 -5295.5867 -761.63361 - 950 251.11482 -7691.4981 869.34937 -5322.852 -3282.3031 - 1000 241.9195 -7630.9899 828.59107 -5358.0033 -95.962685 -Loop time of 45.5507 on 4 procs for 1000 steps with 2004 atoms - -Performance: 15.174 ns/day, 1.582 hours/ns, 21.954 timesteps/s -99.4% CPU use with 4 MPI tasks x 1 OpenMP threads - -MPI task timing breakdown: -Section | min time | avg time | max time |%varavg| %total 
---------------------------------------------------------------- -Pair | 35.545 | 36.674 | 38.004 | 15.8 | 80.51 -Bond | 0.51302 | 0.67796 | 0.86345 | 18.6 | 1.49 -Kspace | 0.66031 | 0.68459 | 0.70506 | 2.1 | 1.50 -Neigh | 1.5605 | 1.5627 | 1.5649 | 0.1 | 3.43 -Comm | 3.4611 | 4.9841 | 6.294 | 47.2 | 10.94 -Output | 0.00079799 | 0.00086641 | 0.0010369 | 0.0 | 0.00 -Modify | 0.67341 | 0.69059 | 0.71186 | 1.7 | 1.52 -Other | | 0.2762 | | | 0.61 - -Nlocal: 501 ave 523 max 473 min -Histogram: 1 0 0 0 0 0 2 0 0 1 -Nghost: 6643.25 ave 6708 max 6566 min -Histogram: 1 1 0 0 0 0 0 0 0 2 -Neighs: 176977 ave 185765 max 164931 min -Histogram: 1 0 0 0 1 0 0 0 1 1 - -Total # of neighbors = 707908 -Ave neighs/atom = 353.248 -Ave special neighs/atom = 2.34032 -Neighbor list builds = 200 -Dangerous builds = 200 -unfix cor -unfix 1 - - - - -Please see the log.cite file for references relevant to this simulation - -Total wall time: 0:00:45 diff --git a/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.1 b/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.1 new file mode 100644 index 0000000000..1e708a9d39 --- /dev/null +++ b/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.1 @@ -0,0 +1,241 @@ +LAMMPS (20 Jun 2017) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:90) + using 1 OpenMP thread(s) per MPI task + +units real + +atom_style full +bond_style harmonic +angle_style charmm +dihedral_style charmm +improper_style harmonic + +pair_style lj/charmm/coul/long 8 10 +pair_modify mix arithmetic +kspace_style pppm 1e-4 + +read_data data.bpti + orthogonal box = (-10 -10 -30) to (50 50 30) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 892 atoms + scanning bonds ... + 4 = max bonds/atom + scanning angles ... + 6 = max angles/atom + scanning dihedrals ... + 18 = max dihedrals/atom + scanning impropers ... + 2 = max impropers/atom + reading bonds ... + 906 bonds + reading angles ... + 1626 angles + reading dihedrals ... 
+ 2501 dihedrals + reading impropers ... + 137 impropers + 4 = max # of 1-2 neighbors + 9 = max # of 1-3 neighbors + 19 = max # of 1-4 neighbors + 21 = max # of special neighbors + +special_bonds charmm +neigh_modify delay 2 every 1 + + +# ------------- MINIMIZE ---------- + +minimize 1e-4 1e-6 1000 10000 +WARNING: Resetting reneighboring criteria during minimization (../min.cpp:168) +PPPM initialization ... +WARNING: System is not charge neutral, net charge = 6 (../kspace.cpp:302) +WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.203272 + grid = 16 16 16 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0316399 + estimated relative force accuracy = 9.52826e-05 + using double precision FFTs + 3d grid and FFT values/proc = 9261 4096 +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 10 10 10 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/charmm/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 17.86 | 17.86 | 17.86 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 0 -3075.6498 943.91164 -2131.7381 -380.67776 + 241 0 -4503.313 749.58662 -3753.7264 -29.045104 +Loop time of 7.63279 on 1 procs for 241 steps with 892 atoms + +32.0% CPU use with 1 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + -2131.73812515 -3753.43984087 -3753.72636847 + Force two-norm initial, final = 1086.21 26.3688 + Force max component initial, final = 310.811 3.92748 + Final line search alpha, max atom move = 0.00596649 0.0234333 + Iterations, force evaluations = 241 463 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total 
+--------------------------------------------------------------- +Pair | 5.8395 | 5.8395 | 5.8395 | 0.0 | 76.51 +Bond | 0.46414 | 0.46414 | 0.46414 | 0.0 | 6.08 +Kspace | 1.1535 | 1.1535 | 1.1535 | 0.0 | 15.11 +Neigh | 0.14908 | 0.14908 | 0.14908 | 0.0 | 1.95 +Comm | 0.001932 | 0.001932 | 0.001932 | 0.0 | 0.03 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.02465 | | | 0.32 + +Nlocal: 892 ave 892 max 892 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 31 ave 31 max 31 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 148891 ave 148891 max 148891 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 148891 +Ave neighs/atom = 166.918 +Ave special neighs/atom = 10.9395 +Neighbor list builds = 15 +Dangerous builds = 0 +reset_timestep 0 + +# ------------- RUN --------------- + +thermo 100 +thermo_style multi +timestep 8 + +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 +Respa levels: + 1 = bond angle + 2 = dihedral improper pair + 3 = kspace + +velocity all create 200.0 12345678 dist uniform +#dump dump1 all atom 100 4pti.dump + +fix 1 all nvt temp 200 300 25 +fix cor all filter/corotate m 1.0 + 163 = # of size 2 clusters + 0 = # of size 3 clusters + 25 = # of size 4 clusters + 0 = # of size 5 clusters + 100 = # of frozen angles + +run 1000 +PPPM initialization ... 
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.203272 + grid = 16 16 16 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0316399 + estimated relative force accuracy = 9.52826e-05 + using double precision FFTs + 3d grid and FFT values/proc = 9261 4096 +Per MPI rank memory allocation (min/avg/max) = 19.55 | 19.55 | 19.55 Mbytes +---------------- Step 0 ----- CPU = 0.0000 (sec) ---------------- +TotEng = -3220.3378 KinEng = 531.1804 Temp = 200.0000 +PotEng = -3751.5181 E_bond = 42.2810 E_angle = 345.2592 +E_dihed = 337.8361 E_impro = 24.2103 E_vdwl = -288.5339 +E_coul = -886.3622 E_long = -3326.2088 Press = 83.2283 +---------------- Step 100 ----- CPU = 8.4380 (sec) ---------------- +TotEng = -2718.4258 KinEng = 539.6265 Temp = 203.1802 +PotEng = -3258.0524 E_bond = 203.2307 E_angle = 566.1893 +E_dihed = 397.6759 E_impro = 34.7696 E_vdwl = -248.6577 +E_coul = -874.8466 E_long = -3336.4135 Press = 135.8640 +---------------- Step 200 ----- CPU = 16.9012 (sec) ---------------- +TotEng = -2661.9611 KinEng = 625.0674 Temp = 235.3503 +PotEng = -3287.0285 E_bond = 208.1804 E_angle = 590.8462 +E_dihed = 389.1482 E_impro = 30.5882 E_vdwl = -240.5448 +E_coul = -926.3091 E_long = -3338.9378 Press = 103.4738 +---------------- Step 300 ----- CPU = 25.3046 (sec) ---------------- +TotEng = -2662.4139 KinEng = 622.2647 Temp = 234.2951 +PotEng = -3284.6785 E_bond = 202.4210 E_angle = 573.6793 +E_dihed = 382.8919 E_impro = 41.8973 E_vdwl = -218.9895 +E_coul = -924.8414 E_long = -3341.7372 Press = 40.6746 +---------------- Step 400 ----- CPU = 33.8063 (sec) ---------------- +TotEng = -2604.9431 KinEng = 662.9890 Temp = 249.6286 +PotEng = -3267.9321 E_bond = 195.9116 E_angle = 616.1383 +E_dihed = 407.8502 E_impro = 43.3560 E_vdwl = -219.0377 +E_coul = -966.3118 E_long = -3345.8387 Press = -91.8856 +---------------- Step 500 ----- CPU = 42.3470 (sec) ---------------- +TotEng = -2609.3867 KinEng = 657.0939 Temp = 247.4090 
+PotEng = -3266.4806 E_bond = 236.4955 E_angle = 570.6256 +E_dihed = 390.5111 E_impro = 41.9250 E_vdwl = -223.9927 +E_coul = -939.5249 E_long = -3342.5201 Press = 236.7471 +---------------- Step 600 ----- CPU = 50.9590 (sec) ---------------- +TotEng = -2564.7161 KinEng = 701.8494 Temp = 264.2603 +PotEng = -3266.5655 E_bond = 223.5820 E_angle = 582.7722 +E_dihed = 394.6196 E_impro = 43.8581 E_vdwl = -201.7759 +E_coul = -967.4136 E_long = -3342.2079 Press = 26.6595 +---------------- Step 700 ----- CPU = 59.4791 (sec) ---------------- +TotEng = -2510.1142 KinEng = 689.5931 Temp = 259.6455 +PotEng = -3199.7072 E_bond = 254.6476 E_angle = 611.9715 +E_dihed = 403.0624 E_impro = 44.1360 E_vdwl = -205.6377 +E_coul = -964.7455 E_long = -3343.1416 Press = 60.5789 +---------------- Step 800 ----- CPU = 67.9330 (sec) ---------------- +TotEng = -2452.7408 KinEng = 777.5962 Temp = 292.7805 +PotEng = -3230.3370 E_bond = 250.4950 E_angle = 656.6738 +E_dihed = 382.4702 E_impro = 39.5378 E_vdwl = -225.0375 +E_coul = -994.4519 E_long = -3340.0244 Press = -19.6463 +---------------- Step 900 ----- CPU = 76.3690 (sec) ---------------- +TotEng = -2339.9766 KinEng = 808.7116 Temp = 304.4961 +PotEng = -3148.6883 E_bond = 247.7657 E_angle = 679.0658 +E_dihed = 398.2984 E_impro = 43.7890 E_vdwl = -230.2498 +E_coul = -945.8152 E_long = -3341.5422 Press = -64.4343 +---------------- Step 1000 ----- CPU = 84.8757 (sec) ---------------- +TotEng = -2329.1819 KinEng = 822.9820 Temp = 309.8691 +PotEng = -3152.1639 E_bond = 264.9609 E_angle = 691.7104 +E_dihed = 385.9914 E_impro = 40.5525 E_vdwl = -230.5182 +E_coul = -954.6203 E_long = -3350.2405 Press = -146.6649 +Loop time of 84.8758 on 1 procs for 1000 steps with 892 atoms + +Performance: 8.144 ns/day, 2.947 hours/ns, 11.782 timesteps/s +32.0% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 
68.548 | 68.548 | 68.548 | 0.0 | 80.76 +Bond | 10.263 | 10.263 | 10.263 | 0.0 | 12.09 +Kspace | 2.4528 | 2.4528 | 2.4528 | 0.0 | 2.89 +Neigh | 1.9041 | 1.9041 | 1.9041 | 0.0 | 2.24 +Comm | 0.044126 | 0.044126 | 0.044126 | 0.0 | 0.05 +Output | 0.000983 | 0.000983 | 0.000983 | 0.0 | 0.00 +Modify | 1.4113 | 1.4113 | 1.4113 | 0.0 | 1.66 +Other | | 0.2516 | | | 0.30 + +Nlocal: 892 ave 892 max 892 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 38 ave 38 max 38 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 144068 ave 144068 max 144068 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 144068 +Ave neighs/atom = 161.511 +Ave special neighs/atom = 10.9395 +Neighbor list builds = 190 +Dangerous builds = 0 + +unfix cor +unfix 1 + + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:01:32 diff --git a/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.4 b/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.4 new file mode 100644 index 0000000000..5367f0e624 --- /dev/null +++ b/examples/USER/misc/filter_corotate/log.22Jun2017.bpti.g++.4 @@ -0,0 +1,241 @@ +LAMMPS (20 Jun 2017) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:90) + using 1 OpenMP thread(s) per MPI task + +units real + +atom_style full +bond_style harmonic +angle_style charmm +dihedral_style charmm +improper_style harmonic + +pair_style lj/charmm/coul/long 8 10 +pair_modify mix arithmetic +kspace_style pppm 1e-4 + +read_data data.bpti + orthogonal box = (-10 -10 -30) to (50 50 30) + 1 by 2 by 2 MPI processor grid + reading atoms ... + 892 atoms + scanning bonds ... + 4 = max bonds/atom + scanning angles ... + 6 = max angles/atom + scanning dihedrals ... + 18 = max dihedrals/atom + scanning impropers ... + 2 = max impropers/atom + reading bonds ... + 906 bonds + reading angles ... + 1626 angles + reading dihedrals ... + 2501 dihedrals + reading impropers ... 
+ 137 impropers + 4 = max # of 1-2 neighbors + 9 = max # of 1-3 neighbors + 19 = max # of 1-4 neighbors + 21 = max # of special neighbors + +special_bonds charmm +neigh_modify delay 2 every 1 + + +# ------------- MINIMIZE ---------- + +minimize 1e-4 1e-6 1000 10000 +WARNING: Resetting reneighboring criteria during minimization (../min.cpp:168) +PPPM initialization ... +WARNING: System is not charge neutral, net charge = 6 (../kspace.cpp:302) +WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.203272 + grid = 16 16 16 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0316399 + estimated relative force accuracy = 9.52826e-05 + using double precision FFTs + 3d grid and FFT values/proc = 3549 1024 +Neighbor list info ... + update every 1 steps, delay 0 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 10 10 10 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/charmm/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 16.97 | 17.2 | 17.52 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 0 -3075.6498 943.91164 -2131.7381 -380.67776 + 241 0 -4503.3131 749.58665 -3753.7264 -29.044989 +Loop time of 3.06327 on 4 procs for 241 steps with 892 atoms + +31.9% CPU use with 4 MPI tasks x 1 OpenMP threads + +Minimization stats: + Stopping criterion = energy tolerance + Energy initial, next-to-last, final = + -2131.73812515 -3753.4398752 -3753.72640446 + Force two-norm initial, final = 1086.21 26.3687 + Force max component initial, final = 310.811 3.92765 + Final line search alpha, max atom move = 0.0059665 0.0234343 + Iterations, force evaluations = 241 463 + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total 
+--------------------------------------------------------------- +Pair | 0.91458 | 1.6235 | 2.2701 | 38.2 | 53.00 +Bond | 0.055164 | 0.13173 | 0.19487 | 15.1 | 4.30 +Kspace | 0.48966 | 1.1993 | 1.9847 | 48.7 | 39.15 +Neigh | 0.053297 | 0.053442 | 0.053576 | 0.0 | 1.74 +Comm | 0.031677 | 0.035006 | 0.038061 | 1.5 | 1.14 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 0.02021 | | | 0.66 + +Nlocal: 223 ave 323 max 89 min +Histogram: 1 0 0 0 1 0 0 0 1 1 +Nghost: 613 ave 675 max 557 min +Histogram: 1 0 0 1 0 1 0 0 0 1 +Neighs: 37222.8 ave 50005 max 20830 min +Histogram: 1 0 0 0 1 0 0 1 0 1 + +Total # of neighbors = 148891 +Ave neighs/atom = 166.918 +Ave special neighs/atom = 10.9395 +Neighbor list builds = 15 +Dangerous builds = 0 +reset_timestep 0 + +# ------------- RUN --------------- + +thermo 100 +thermo_style multi +timestep 8 + +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 +Respa levels: + 1 = bond angle + 2 = dihedral improper pair + 3 = kspace + +velocity all create 200.0 12345678 dist uniform +#dump dump1 all atom 100 4pti.dump + +fix 1 all nvt temp 200 300 25 +fix cor all filter/corotate m 1.0 + 163 = # of size 2 clusters + 0 = # of size 3 clusters + 25 = # of size 4 clusters + 0 = # of size 5 clusters + 100 = # of frozen angles + +run 1000 +PPPM initialization ... 
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.203272 + grid = 16 16 16 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0316399 + estimated relative force accuracy = 9.52826e-05 + using double precision FFTs + 3d grid and FFT values/proc = 3549 1024 +Per MPI rank memory allocation (min/avg/max) = 17.14 | 17.63 | 18.14 Mbytes +---------------- Step 0 ----- CPU = 0.0000 (sec) ---------------- +TotEng = -3220.3378 KinEng = 531.1804 Temp = 200.0000 +PotEng = -3751.5182 E_bond = 42.2810 E_angle = 345.2593 +E_dihed = 337.8361 E_impro = 24.2103 E_vdwl = -288.5339 +E_coul = -886.3622 E_long = -3326.2088 Press = 83.2284 +---------------- Step 100 ----- CPU = 3.4639 (sec) ---------------- +TotEng = -2718.4266 KinEng = 539.6246 Temp = 203.1794 +PotEng = -3258.0513 E_bond = 203.2306 E_angle = 566.1887 +E_dihed = 397.6756 E_impro = 34.7695 E_vdwl = -248.6577 +E_coul = -874.8446 E_long = -3336.4135 Press = 135.8653 +---------------- Step 200 ----- CPU = 6.8898 (sec) ---------------- +TotEng = -2662.0450 KinEng = 625.0178 Temp = 235.3317 +PotEng = -3287.0628 E_bond = 208.1691 E_angle = 590.8259 +E_dihed = 389.1424 E_impro = 30.5879 E_vdwl = -240.5397 +E_coul = -926.3110 E_long = -3338.9375 Press = 103.4843 +---------------- Step 300 ----- CPU = 10.2791 (sec) ---------------- +TotEng = -2661.8829 KinEng = 623.0352 Temp = 234.5852 +PotEng = -3284.9181 E_bond = 203.0274 E_angle = 573.6583 +E_dihed = 383.0124 E_impro = 41.9015 E_vdwl = -218.0696 +E_coul = -926.5806 E_long = -3341.8675 Press = 45.6868 +---------------- Step 400 ----- CPU = 13.5874 (sec) ---------------- +TotEng = -2594.5220 KinEng = 672.8693 Temp = 253.3487 +PotEng = -3267.3914 E_bond = 201.3378 E_angle = 612.7099 +E_dihed = 410.1920 E_impro = 44.0201 E_vdwl = -217.9714 +E_coul = -971.6203 E_long = -3346.0595 Press = -121.1015 +---------------- Step 500 ----- CPU = 16.9047 (sec) ---------------- +TotEng = -2603.9306 KinEng = 668.2122 Temp = 251.5952 
+PotEng = -3272.1428 E_bond = 238.1081 E_angle = 578.3310 +E_dihed = 399.1305 E_impro = 41.4314 E_vdwl = -216.9664 +E_coul = -969.4047 E_long = -3342.7729 Press = 156.7851 +---------------- Step 600 ----- CPU = 20.1970 (sec) ---------------- +TotEng = -2531.1096 KinEng = 728.1698 Temp = 274.1705 +PotEng = -3259.2794 E_bond = 232.8396 E_angle = 621.3323 +E_dihed = 398.1952 E_impro = 37.0914 E_vdwl = -241.6350 +E_coul = -963.1540 E_long = -3343.9488 Press = 58.6784 +---------------- Step 700 ----- CPU = 23.4360 (sec) ---------------- +TotEng = -2499.9495 KinEng = 742.1211 Temp = 279.4234 +PotEng = -3242.0705 E_bond = 240.5622 E_angle = 582.9270 +E_dihed = 396.6246 E_impro = 36.6510 E_vdwl = -228.4925 +E_coul = -926.8734 E_long = -3343.4695 Press = -60.7458 +---------------- Step 800 ----- CPU = 26.6709 (sec) ---------------- +TotEng = -2426.0217 KinEng = 760.1083 Temp = 286.1959 +PotEng = -3186.1300 E_bond = 266.5863 E_angle = 652.3401 +E_dihed = 380.7407 E_impro = 34.6861 E_vdwl = -225.3729 +E_coul = -953.2382 E_long = -3341.8721 Press = -57.9824 +---------------- Step 900 ----- CPU = 29.8152 (sec) ---------------- +TotEng = -2419.4636 KinEng = 780.8361 Temp = 294.0004 +PotEng = -3200.2996 E_bond = 269.3237 E_angle = 665.7171 +E_dihed = 408.3527 E_impro = 43.7811 E_vdwl = -254.0696 +E_coul = -1002.0694 E_long = -3331.3352 Press = -52.0169 +---------------- Step 1000 ----- CPU = 32.8748 (sec) ---------------- +TotEng = -2398.7244 KinEng = 811.9856 Temp = 305.7288 +PotEng = -3210.7099 E_bond = 258.2207 E_angle = 639.3671 +E_dihed = 379.3353 E_impro = 41.7602 E_vdwl = -207.2654 +E_coul = -983.9330 E_long = -3338.1948 Press = 89.4870 +Loop time of 32.8751 on 4 procs for 1000 steps with 892 atoms + +Performance: 21.025 ns/day, 1.141 hours/ns, 30.418 timesteps/s +31.9% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 
12.449 | 19.023 | 24.612 | 99.6 | 57.86 +Bond | 1.4547 | 2.8768 | 3.9098 | 61.4 | 8.75 +Kspace | 1.0537 | 1.0778 | 1.0992 | 2.1 | 3.28 +Neigh | 0.67542 | 0.67994 | 0.68323 | 0.3 | 2.07 +Comm | 1.8602 | 8.4515 | 16.516 | 182.9 | 25.71 +Output | 0.000839 | 0.00147 | 0.003293 | 2.7 | 0.00 +Modify | 0.56658 | 0.63186 | 0.69304 | 6.8 | 1.92 +Other | | 0.133 | | | 0.40 + +Nlocal: 223 ave 339 max 136 min +Histogram: 1 1 0 0 0 1 0 0 0 1 +Nghost: 590 ave 626 max 552 min +Histogram: 1 0 0 0 1 0 1 0 0 1 +Neighs: 36488.2 ave 41965 max 29054 min +Histogram: 1 0 0 0 1 0 0 0 1 1 + +Total # of neighbors = 145953 +Ave neighs/atom = 163.624 +Ave special neighs/atom = 10.9395 +Neighbor list builds = 189 +Dangerous builds = 0 + +unfix cor +unfix 1 + + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:00:36 diff --git a/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.1 b/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.1 new file mode 100644 index 0000000000..22c5483c9e --- /dev/null +++ b/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.1 @@ -0,0 +1,147 @@ +LAMMPS (20 Jun 2017) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:90) + using 1 OpenMP thread(s) per MPI task +# Solvated 5-mer peptide, run for 8ps in NVT + +units real +atom_style full + +pair_style lj/charmm/coul/long 8.0 10.0 10.0 +bond_style harmonic +angle_style charmm +dihedral_style charmm +improper_style harmonic +kspace_style pppm 0.0001 + +read_data data.peptide + orthogonal box = (36.8402 41.0137 29.7681) to (64.2116 68.3851 57.1395) + 1 by 1 by 1 MPI processor grid + reading atoms ... + 2004 atoms + reading velocities ... + 2004 velocities + scanning bonds ... + 3 = max bonds/atom + scanning angles ... + 6 = max angles/atom + scanning dihedrals ... + 14 = max dihedrals/atom + scanning impropers ... + 1 = max impropers/atom + reading bonds ... + 1365 bonds + reading angles ... 
+ 786 angles + reading dihedrals ... + 207 dihedrals + reading impropers ... + 12 impropers + 4 = max # of 1-2 neighbors + 7 = max # of 1-3 neighbors + 14 = max # of 1-4 neighbors + 18 = max # of special neighbors + +neighbor 2.0 bin +neigh_modify delay 5 + +thermo 50 +#dump dump1 all atom 100 peptide.dump + +timestep 8 + +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 +Respa levels: + 1 = bond angle + 2 = dihedral improper pair + 3 = kspace + +fix 1 all nvt temp 250.0 250.0 100.0 tchain 1 +fix cor all filter/corotate m 1.0 + 19 = # of size 2 clusters + 0 = # of size 3 clusters + 3 = # of size 4 clusters + 0 = # of size 5 clusters + 646 = # of frozen angles +run 1000 +PPPM initialization ... +WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.268725 + grid = 15 15 15 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0228209 + estimated relative force accuracy = 6.87243e-05 + using double precision FFTs + 3d grid and FFT values/proc = 10648 3375 +Neighbor list info ... 
+ update every 1 steps, delay 5 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 5 5 5 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/charmm/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 22.72 | 22.72 | 22.72 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 190.0857 -6442.7438 70.391457 -5237.4338 20361.984 + 50 239.47667 -7205.1006 1092.7664 -4682.5237 -23733.122 + 100 244.63086 -6788.0793 422.97204 -4904.5234 16458.011 + 150 240.79042 -7267.0791 966.31411 -4863.1107 -13554.894 + 200 254.77122 -6868.5713 591.00071 -4756.4431 10532.563 + 250 241.87417 -7264.9349 856.9357 -4963.8743 -9043.4359 + 300 251.37775 -6976.8 650.55612 -4825.3773 6986.2021 + 350 250.81494 -7286.7011 880.11184 -4909.0829 -6392.4665 + 400 247.55673 -7104.4036 701.89555 -4924.4551 4720.7811 + 450 258.54988 -7215.3011 832.23692 -4839.3759 -3446.3859 + 500 246.80928 -7151.2468 715.61007 -4962.0464 2637.5769 + 550 246.20721 -7159.0464 805.24974 -4883.8011 -2725.227 + 600 250.62483 -7201.7688 806.10076 -4899.2968 770.22352 + 650 247.59777 -7260.1607 802.97277 -4978.8899 -430.42309 + 700 246.86951 -7286.2971 825.99865 -4986.3486 -427.88651 + 750 252.79268 -7307.8572 833.4822 -4965.0605 -614.74372 + 800 251.73191 -7315.2457 839.59859 -4972.666 952.56448 + 850 246.75844 -7303.6221 816.67112 -5013.6642 -2055.2823 + 900 251.00123 -7317.4219 825.12165 -4993.6817 -356.53166 + 950 259.20822 -7252.3466 854.62611 -4850.1016 -1719.5267 + 1000 245.72486 -7347.5547 811.48146 -5068.9576 -717.6136 +Loop time of 357.523 on 1 procs for 1000 steps with 2004 atoms + +Performance: 1.933 ns/day, 12.414 hours/ns, 2.797 timesteps/s +32.0% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total 
+--------------------------------------------------------------- +Pair | 328.2 | 328.2 | 328.2 | 0.0 | 91.80 +Bond | 4.4815 | 4.4815 | 4.4815 | 0.0 | 1.25 +Kspace | 3.9448 | 3.9448 | 3.9448 | 0.0 | 1.10 +Neigh | 12.457 | 12.457 | 12.457 | 0.0 | 3.48 +Comm | 3.2147 | 3.2147 | 3.2147 | 0.0 | 0.90 +Output | 0.001689 | 0.001689 | 0.001689 | 0.0 | 0.00 +Modify | 3.937 | 3.937 | 3.937 | 0.0 | 1.10 +Other | | 1.289 | | | 0.36 + +Nlocal: 2004 ave 2004 max 2004 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 11191 ave 11191 max 11191 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 708610 ave 708610 max 708610 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 708610 +Ave neighs/atom = 353.598 +Ave special neighs/atom = 2.34032 +Neighbor list builds = 200 +Dangerous builds = 200 +unfix cor +unfix 1 + + + + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:05:57 diff --git a/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.4 b/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.4 new file mode 100644 index 0000000000..eec3843bd0 --- /dev/null +++ b/examples/USER/misc/filter_corotate/log.22Jun2017.peptide.g++.4 @@ -0,0 +1,147 @@ +LAMMPS (20 Jun 2017) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:90) + using 1 OpenMP thread(s) per MPI task +# Solvated 5-mer peptide, run for 8ps in NVT + +units real +atom_style full + +pair_style lj/charmm/coul/long 8.0 10.0 10.0 +bond_style harmonic +angle_style charmm +dihedral_style charmm +improper_style harmonic +kspace_style pppm 0.0001 + +read_data data.peptide + orthogonal box = (36.8402 41.0137 29.7681) to (64.2116 68.3851 57.1395) + 1 by 2 by 2 MPI processor grid + reading atoms ... + 2004 atoms + reading velocities ... + 2004 velocities + scanning bonds ... + 3 = max bonds/atom + scanning angles ... + 6 = max angles/atom + scanning dihedrals ... + 14 = max dihedrals/atom + scanning impropers ... 
+ 1 = max impropers/atom + reading bonds ... + 1365 bonds + reading angles ... + 786 angles + reading dihedrals ... + 207 dihedrals + reading impropers ... + 12 impropers + 4 = max # of 1-2 neighbors + 7 = max # of 1-3 neighbors + 14 = max # of 1-4 neighbors + 18 = max # of special neighbors + +neighbor 2.0 bin +neigh_modify delay 5 + +thermo 50 +#dump dump1 all atom 100 peptide.dump + +timestep 8 + +run_style respa 3 2 8 bond 1 dihedral 2 pair 2 kspace 3 +Respa levels: + 1 = bond angle + 2 = dihedral improper pair + 3 = kspace + +fix 1 all nvt temp 250.0 250.0 100.0 tchain 1 +fix cor all filter/corotate m 1.0 + 19 = # of size 2 clusters + 0 = # of size 3 clusters + 3 = # of size 4 clusters + 0 = # of size 5 clusters + 646 = # of frozen angles +run 1000 +PPPM initialization ... +WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321) + G vector (1/distance) = 0.268725 + grid = 15 15 15 + stencil order = 5 + estimated absolute RMS force accuracy = 0.0228209 + estimated relative force accuracy = 6.87243e-05 + using double precision FFTs + 3d grid and FFT values/proc = 4312 960 +Neighbor list info ... 
+ update every 1 steps, delay 5 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 5 5 5 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/charmm/coul/long, perpetual + attributes: half, newton on + pair build: half/bin/newton + stencil: half/bin/3d/newton + bin: standard +Per MPI rank memory allocation (min/avg/max) = 16.87 | 17.05 | 17.26 Mbytes +Step Temp E_pair E_mol TotEng Press + 0 190.0857 -6442.7438 70.391457 -5237.4338 20361.984 + 50 239.47667 -7205.1005 1092.7664 -4682.5237 -23733.122 + 100 244.63889 -6788.1152 422.96733 -4904.5161 16457.756 + 150 239.36917 -7258.7053 967.87775 -4861.6589 -13526.261 + 200 255.14702 -6864.0525 604.58036 -4736.1009 11013.1 + 250 252.72919 -7303.0966 898.11178 -4896.0494 -8480.8766 + 300 250.66477 -6989.2603 652.83649 -4839.8141 6209.3375 + 350 243.30794 -7218.8575 838.31977 -4927.8525 -5180.4928 + 400 256.3573 -7090.677 706.24197 -4853.8377 3302.577 + 450 246.15776 -7274.574 834.31676 -4970.557 -3427.971 + 500 256.28473 -7082.1447 735.42828 -4816.5524 2846.086 + 550 251.32327 -7341.739 812.64934 -5028.5484 -1786.9277 + 600 254.57737 -7152.3448 740.52534 -4891.8494 825.91675 + 650 244.95305 -7207.1136 790.67659 -4953.9295 -520.79769 + 700 249.4984 -7204.2699 779.06969 -4935.5544 -940.75384 + 750 248.46962 -7232.1037 791.6642 -4956.9361 -548.12171 + 800 260.2974 -7293.1982 793.23282 -4945.8435 -1171.26 + 850 249.79023 -7258.3759 823.56789 -4943.4198 -499.76275 + 900 249.97237 -7267.0584 784.57992 -4990.0028 -271.33531 + 950 251.29018 -7261.0642 823.467 -4937.2534 -538.7168 + 1000 246.05777 -7285.0948 847.90892 -4968.0826 -2613.1854 +Loop time of 94.6835 on 4 procs for 1000 steps with 2004 atoms + +Performance: 7.300 ns/day, 3.288 hours/ns, 10.562 timesteps/s +37.9% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total 
+--------------------------------------------------------------- +Pair | 33.389 | 78.508 | 94.639 | 294.1 | 82.92 +Bond | 0.39957 | 1.104 | 1.4443 | 40.6 | 1.17 +Kspace | 0.53324 | 1.2631 | 1.5137 | 37.5 | 1.33 +Neigh | 1.2668 | 3.011 | 3.5942 | 58.0 | 3.18 +Comm | 3.4563 | 8.8707 | 11.494 | 107.9 | 9.37 +Output | 0.000435 | 0.0017425 | 0.004136 | 3.4 | 0.00 +Modify | 0.59335 | 1.4123 | 1.6921 | 39.8 | 1.49 +Other | | 0.5129 | | | 0.54 + +Nlocal: 501 ave 515 max 476 min +Histogram: 1 0 0 0 0 0 0 1 1 1 +Nghost: 6681.5 ave 6740 max 6634 min +Histogram: 2 0 0 0 0 0 0 1 0 1 +Neighs: 176872 ave 182642 max 168464 min +Histogram: 1 0 0 0 0 0 1 1 0 1 + +Total # of neighbors = 707486 +Ave neighs/atom = 353.037 +Ave special neighs/atom = 2.34032 +Neighbor list builds = 200 +Dangerous builds = 200 +unfix cor +unfix 1 + + + + +Please see the log.cite file for references relevant to this simulation + +Total wall time: 0:01:53 diff --git a/examples/neb/README b/examples/neb/README index 0993c5fcdb..5ef32f2ba6 100644 --- a/examples/neb/README +++ b/examples/neb/README @@ -2,15 +2,13 @@ Run these examples as: mpirun -np 4 lmp_g++ -partition 4x1 -in in.neb.hop1 mpirun -np 4 lmp_g++ -partition 4x1 -in in.neb.hop2 -mpirun -np 4 lmp_g++ -partition 4x1 -in in.neb.hop1freeend +mpirun -np 4 lmp_g++ -partition 4x1 -in in.neb.hop1.end mpirun -np 3 lmp_g++ -partition 3x1 -in in.neb.sivac mpirun -np 8 lmp_g++ -partition 4x2 -in in.neb.hop1 mpirun -np 8 lmp_g++ -partition 4x2 -in in.neb.hop2 -mpirun -np 8 lmp_g++ -partition 4x2 -in in.neb.hop1freeend -mpirun -np 6 lmp_g++ -partition 3x2 -in in.neb.sivac -mpirun -np 9 lmp_g++ -partition 3x3 -in in.neb.sivac - +mpirun -np 8 lmp_g++ -partition 4x2 -in in.neb.hop1.end +mpirun -np 8 lmp_g++ -partition 4x2 -in in.neb.sivac Note that more than 4 replicas should be used for a precise estimate of the activation energy corresponding to a transition. 
diff --git a/examples/neb/in.neb.hop1 b/examples/neb/in.neb.hop1 index b874d1ba32..f26b52a28a 100644 --- a/examples/neb/in.neb.hop1 +++ b/examples/neb/in.neb.hop1 @@ -51,7 +51,7 @@ set group nebatoms type 3 group nonneb subtract all nebatoms fix 1 lower setforce 0.0 0.0 0.0 -fix 2 nebatoms neb 1.0 nudg_style idealpos +fix 2 nebatoms neb 1.0 parallel ideal fix 3 all enforce2d thermo 100 diff --git a/examples/neb/in.neb.hop1freeend b/examples/neb/in.neb.hop1.end similarity index 91% rename from examples/neb/in.neb.hop1freeend rename to examples/neb/in.neb.hop1.end index fa90e9a98c..81e5315306 100644 --- a/examples/neb/in.neb.hop1freeend +++ b/examples/neb/in.neb.hop1.end @@ -15,7 +15,7 @@ variable u uloop 20 lattice hex 0.9 region box block 0 20 0 10 -0.25 0.25 -read_data initial.hop1freeend +read_data initial.hop1.end # LJ potentials @@ -41,7 +41,7 @@ set group nebatoms type 3 group nonneb subtract all nebatoms fix 1 lower setforce 0.0 0.0 0.0 -fix 2 nebatoms neb 1.0 nudg_style idealpos freeend ini +fix 2 nebatoms neb 1.0 parallel ideal end first 1.0 fix 3 all enforce2d thermo 100 diff --git a/examples/neb/in.neb.hop2 b/examples/neb/in.neb.hop2 index 242de759fa..e69fb338cd 100644 --- a/examples/neb/in.neb.hop2 +++ b/examples/neb/in.neb.hop2 @@ -65,4 +65,4 @@ thermo 100 min_style fire -neb 0.0 0.01 1000 1000 100 final final.hop2 +neb 0.0 0.05 1000 1000 100 final final.hop2 diff --git a/examples/neb/initial.hop1freeend b/examples/neb/initial.hop1.end similarity index 100% rename from examples/neb/initial.hop1freeend rename to examples/neb/initial.hop1.end diff --git a/examples/neb/log.19Jun17.neb.hop1.end.g++.4 b/examples/neb/log.19Jun17.neb.hop1.end.g++.4 new file mode 100644 index 0000000000..4878b86566 --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop1.end.g++.4 @@ -0,0 +1,11 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 229.26196 146.68251 2.9774577 4.4127369 233.11559 0.023301843 0.0224626 1.4763579 0 -3.048332 0.33333333 -3.0250302 0.66666667 -3.0291888 1 -3.0474928 +100 0.11027532 0.085410308 3.0967938 0.024201563 0.38551033 0.0017583261 0.0021866943 1.7710358 0 -3.0483469 0.31192818 -3.0465886 0.61093022 -3.0466143 1 -3.0487752 +130 0.09954083 0.075481108 3.0927626 0.015664388 0.37491833 0.0017573704 0.0021913201 1.7713726 0 -3.048342 0.31428487 -3.0465846 0.61762817 -3.0466296 1 -3.048776 +Climbing replica = 2 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +130 0.37838747 0.3502435 3.0927626 0.015664388 0.37491833 0.0017573704 0.0021913201 1.7713726 0 -3.048342 0.31428487 -3.0465846 0.61762817 -3.0466296 1 -3.048776 +230 0.22757286 0.12027481 3.1250243 0.0081260569 0.14019507 0.0018364585 0.002278918 1.76926 0 -3.0483347 0.39730698 -3.0464983 0.64450769 -3.0466973 1 -3.0487772 +278 0.096184498 0.085088496 3.1405655 0.0068164307 0.093861113 0.0018426056 0.002286256 1.7684765 0 -3.0483338 0.41277997 -3.0464912 0.65562984 -3.0467294 1 -3.0487775 diff --git a/examples/neb/log.19Jun17.neb.hop1.end.g++.8 b/examples/neb/log.19Jun17.neb.hop1.end.g++.8 new file mode 100644 index 0000000000..62344b3da5 --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop1.end.g++.8 @@ -0,0 +1,11 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 229.26196 146.68251 2.9774577 4.4127369 233.11559 0.023301843 0.0224626 1.4763579 0 -3.048332 0.33333333 -3.0250302 0.66666667 -3.0291888 1 -3.0474928 +100 0.11375359 0.085350745 3.0966418 0.0236765 0.38531777 0.0017582606 0.0021868783 1.7710738 0 -3.0483467 0.31201141 -3.0465884 0.61117406 -3.0466149 1 -3.0487753 +119 0.09996986 0.078639268 3.0937691 0.017444108 0.3780308 0.0017574935 0.0021899317 1.7713574 0 -3.0483433 0.31354192 -3.0465858 0.61555533 -3.0466249 1 -3.0487758 +Climbing replica = 2 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +119 0.3793192 0.35281863 3.0937691 0.017444108 0.3780308 0.0017574935 0.0021899317 1.7713574 0 -3.0483433 0.31354192 -3.0465858 0.61555533 -3.0466249 1 -3.0487758 +219 0.20159133 0.12247026 3.1244061 0.0085896057 0.13938632 0.0018362816 0.0022783681 1.7693295 0 -3.048335 0.39646633 -3.0464988 0.64277703 -3.0466925 1 -3.0487771 +266 0.099868725 0.086180598 3.1401661 0.0070922949 0.095128081 0.001842608 0.002286044 1.7685191 0 -3.048334 0.41231024 -3.0464914 0.65425179 -3.0467252 1 -3.0487774 diff --git a/examples/neb/log.19Jun17.neb.hop1.g++.4 b/examples/neb/log.19Jun17.neb.hop1.g++.4 new file mode 100644 index 0000000000..e2984c031c --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop1.g++.4 @@ -0,0 +1,9 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +0 4327.2753 2746.3378 0.082169072 4.9967651 4514.5424 0.42933428 0.42323635 1.8941131 0 -3.0535948 0.33333333 -2.6242605 0.66666667 -2.7623811 1 -3.0474969 +87 0.095951502 0.052720903 0.005588927 0.065110105 0.12467831 0.0071014928 0.0022798007 2.3003372 0 -3.0535967 0.32435271 -3.0473127 0.62805027 -3.0464952 1 -3.048775 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +87 0.14137277 0.11108954 0.005588927 0.065110105 0.12467831 0.0071014928 0.0022798007 2.3003372 0 -3.0535967 0.32435271 -3.0473127 0.62805027 -3.0464952 1 -3.048775 +124 0.099583263 0.085936899 0.0044220372 0.023873795 0.091308308 0.0071061754 0.0022863931 2.308121 0 -3.0535968 0.32223905 -3.0473329 0.61673898 -3.0464906 1 -3.048777 diff --git a/examples/neb/log.19Jun17.neb.hop1.g++.8 b/examples/neb/log.19Jun17.neb.hop1.g++.8 new file mode 100644 index 0000000000..d1be1284fa --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop1.g++.8 @@ -0,0 +1,9 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +0 4327.2753 2746.3378 0.082169072 4.9967651 4514.5424 0.42933428 0.42323635 1.8941131 0 -3.0535948 0.33333333 -2.6242605 0.66666667 -2.7623811 1 -3.0474969 +87 0.095951792 0.052720902 0.0055889267 0.065110091 0.12467831 0.0071014928 0.0022798007 2.3003372 0 -3.0535967 0.32435271 -3.0473127 0.62805027 -3.0464952 1 -3.048775 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +87 0.14137297 0.11108954 0.0055889267 0.065110091 0.12467831 0.0071014928 0.0022798007 2.3003372 0 -3.0535967 0.32435271 -3.0473127 0.62805027 -3.0464952 1 -3.048775 +124 0.099582186 0.08593683 0.0044220345 0.023873731 0.091308197 0.0071061754 0.0022863931 2.3081211 0 -3.0535968 0.32223904 -3.0473329 0.61673896 -3.0464906 1 -3.048777 diff --git a/examples/neb/log.19Jun17.neb.hop2.g++.4 b/examples/neb/log.19Jun17.neb.hop2.g++.4 new file mode 100644 index 0000000000..c6b6cbe2ce --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop2.g++.4 @@ -0,0 +1,12 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 14.104748 10.419633 0.1227071 4.999238 8.2087606 0.0018276223 0.00064050211 0.98401186 0 -3.0514921 0.33333333 -3.0496673 0.66666667 -3.0496645 1 -3.050305 +100 0.24646695 0.10792196 0.0077146918 0.058733261 0.63504706 0.001516756 0.0015151635 1.165391 0 -3.0514939 0.2890334 -3.0503533 0.59718494 -3.0499771 1 -3.0514923 +200 0.061777741 0.050288749 0.0047486883 0.0095236035 0.88698597 0.0014465772 0.0014462528 1.1692938 0 -3.0514941 0.29975094 -3.0503052 0.62768286 -3.0500476 1 -3.0514938 +261 0.048699591 0.038138604 0.0040083594 0.0074854409 0.95722712 0.0014243579 0.0014241377 1.1696848 0 -3.0514942 0.30525481 -3.0502812 0.6357998 -3.0500698 1 -3.051494 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +261 0.95753855 0.94297239 0.0040083594 0.0074854409 0.95722712 0.0014243579 0.0014241377 1.1696848 0 -3.0514942 0.30525481 -3.0502812 0.6357998 -3.0500698 1 -3.051494 +361 0.072509627 0.06580631 0.0027545765 0.0044749366 0.016746483 0.0016018879 0.0016017805 1.1704611 0 -3.0514943 0.28176307 -3.0503855 0.50355454 -3.0498924 1 -3.0514942 +381 0.04884836 0.040787876 0.0023445904 0.0035162935 0.017959209 0.0016017716 0.0016016898 1.1713862 0 -3.0514943 0.27120138 -3.0504399 0.50428218 -3.0498925 1 -3.0514942 diff --git a/examples/neb/log.19Jun17.neb.hop2.g++.8 b/examples/neb/log.19Jun17.neb.hop2.g++.8 new file mode 100644 index 0000000000..c6b6cbe2ce --- /dev/null +++ b/examples/neb/log.19Jun17.neb.hop2.g++.8 @@ -0,0 +1,12 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 14.104748 10.419633 0.1227071 4.999238 8.2087606 0.0018276223 0.00064050211 0.98401186 0 -3.0514921 0.33333333 -3.0496673 0.66666667 -3.0496645 1 -3.050305 +100 0.24646695 0.10792196 0.0077146918 0.058733261 0.63504706 0.001516756 0.0015151635 1.165391 0 -3.0514939 0.2890334 -3.0503533 0.59718494 -3.0499771 1 -3.0514923 +200 0.061777741 0.050288749 0.0047486883 0.0095236035 0.88698597 0.0014465772 0.0014462528 1.1692938 0 -3.0514941 0.29975094 -3.0503052 0.62768286 -3.0500476 1 -3.0514938 +261 0.048699591 0.038138604 0.0040083594 0.0074854409 0.95722712 0.0014243579 0.0014241377 1.1696848 0 -3.0514942 0.30525481 -3.0502812 0.6357998 -3.0500698 1 -3.051494 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN +261 0.95753855 0.94297239 0.0040083594 0.0074854409 0.95722712 0.0014243579 0.0014241377 1.1696848 0 -3.0514942 0.30525481 -3.0502812 0.6357998 -3.0500698 1 -3.051494 +361 0.072509627 0.06580631 0.0027545765 0.0044749366 0.016746483 0.0016018879 0.0016017805 1.1704611 0 -3.0514943 0.28176307 -3.0503855 0.50355454 -3.0498924 1 -3.0514942 +381 0.04884836 0.040787876 0.0023445904 0.0035162935 0.017959209 0.0016017716 0.0016016898 1.1713862 0 -3.0514943 0.27120138 -3.0504399 0.50428218 -3.0498925 1 -3.0514942 diff --git a/examples/neb/log.19Jun17.neb.sivac.g++.4 b/examples/neb/log.19Jun17.neb.sivac.g++.4 new file mode 100644 index 0000000000..0d9880ca81 --- /dev/null +++ b/examples/neb/log.19Jun17.neb.sivac.g++.4 @@ -0,0 +1,17 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 7.5525391 1.6345605 0.16683659 7.5525391 7.5525391 1.5383951 0 1.6207355 0 -2213.3343 0.33333333 -2212.7428 0.66666667 -2212.2247 1 -2211.7959 +10 0.24005275 0.036502104 0.036483049 0.24005275 0.68351722 0.42916118 0.41794425 1.6989349 0 -2213.3365 0.32909183 -2212.9587 0.65386736 -2212.9073 1 -2213.3253 +20 0.07940898 0.016398055 0.024706844 0.07940898 0.71637784 0.41387872 0.41157886 1.7343662 0 -2213.3369 0.32478734 -2212.9621 0.65348766 -2212.923 1 -2213.3346 +30 0.094973707 0.0083631681 0.015145947 0.035267404 0.7535772 0.40072717 0.40024605 1.7504612 0 -2213.3372 0.32705584 -2212.9584 0.65894506 -2212.9365 1 -2213.3367 +40 0.027727472 0.0044528145 0.011618173 0.022562656 0.76133752 0.39614635 0.39591731 1.7547519 0 -2213.3373 0.32873163 -2212.9562 0.66124255 -2212.9411 1 -2213.337 +50 0.019429348 0.0030110281 0.0087135563 0.015391975 0.76952681 0.39274846 0.3926388 1.7578616 0 -2213.3373 0.33022595 -2212.9543 0.66307279 -2212.9446 1 -2213.3372 +60 0.019009471 0.0016234562 0.0053426307 0.0086166186 0.77759617 0.38936861 0.38933364 1.7610433 0 -2213.3374 0.33187548 -2212.9523 0.66497617 -2212.948 1 -2213.3373 +63 0.0097365134 0.0012734598 0.004777604 0.0076121987 0.77865149 0.38888778 0.38886047 1.7615294 0 -2213.3374 0.33212107 -2212.952 0.66525385 -2212.9485 1 -2213.3373 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +63 0.77865149 0.31085821 0.004777604 0.0076121987 0.77865149 0.38888778 0.38886047 1.7615294 0 -2213.3374 0.33212107 -2212.952 0.66525385 -2212.9485 1 -2213.3373 +73 0.098175496 0.033609035 0.0027886955 0.0042742148 0.036594003 0.51024838 0.51023983 1.7607181 0 -2213.3374 0.27574151 -2213.0416 0.50432348 -2212.8271 1 -2213.3374 +83 0.03341862 0.012760857 0.0020868177 0.0031625649 0.010189924 0.51014634 0.51014168 1.7602562 0 -2213.3374 0.26045338 -2213.0672 0.50355193 -2212.8272 1 -2213.3374 +93 0.0097374358 0.0028416114 0.0014003718 0.0020986584 0.0053485291 0.51011052 0.51010848 1.7601202 0 -2213.3374 0.25397887 -2213.0783 0.50388111 -2212.8273 1 -2213.3374 diff --git a/examples/neb/log.19Jun17.neb.sivac.g++.8 b/examples/neb/log.19Jun17.neb.sivac.g++.8 new file mode 100644 index 0000000000..260eb9e18b --- /dev/null +++ b/examples/neb/log.19Jun17.neb.sivac.g++.8 @@ -0,0 +1,18 @@ +LAMMPS (19 May 2017) +Running on 4 partitions of processors +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +0 7.5525391 1.6345605 0.16683659 7.5525391 7.5525391 1.5383951 0 1.6207355 0 -2213.3343 0.33333333 -2212.7428 0.66666667 -2212.2247 1 -2211.7959 +10 0.24005275 0.036502104 0.036483049 0.24005275 0.68351722 0.42916118 0.41794425 1.6989349 0 -2213.3365 0.32909183 -2212.9587 0.65386736 -2212.9073 1 -2213.3253 +20 0.07940898 0.016398055 0.024706844 0.07940898 0.71637784 0.41387872 0.41157886 1.7343662 0 -2213.3369 0.32478734 -2212.9621 0.65348766 -2212.923 1 -2213.3346 +30 0.094973708 0.0083631681 0.015145947 0.035267404 0.7535772 0.40072717 0.40024605 1.7504612 0 -2213.3372 0.32705584 -2212.9584 0.65894506 -2212.9365 1 -2213.3367 +40 0.027727472 0.0044528144 0.011618173 0.022562656 0.76133752 0.39614635 0.39591731 1.7547519 0 -2213.3373 0.32873163 -2212.9562 0.66124255 -2212.9411 1 -2213.337 +50 0.019429341 0.0030110281 0.0087135565 0.015391975 0.7695268 0.39274846 0.3926388 1.7578616 0 -2213.3373 0.33022595 -2212.9543 0.66307279 -2212.9446 1 -2213.3372 +60 0.019048963 0.0016262345 0.0053426844 0.0086167196 0.77759655 0.38936867 0.3893337 1.7610433 0 -2213.3374 0.33187545 -2212.9523 0.66497615 -2212.948 1 -2213.3373 +63 0.0097037048 0.0012761841 0.0047749367 0.0076075138 0.77865545 0.38888554 0.38885827 1.7615318 0 -2213.3374 0.33212221 -2212.952 0.66525512 -2212.9485 1 -2213.3373 +Climbing replica = 3 +Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN +63 0.77865545 0.3108551 0.0047749367 0.0076075138 0.77865545 0.38888554 0.38885827 1.7615318 0 -2213.3374 0.33212221 -2212.952 0.66525512 -2212.9485 1 -2213.3373 +73 0.098595989 0.033659485 0.0027927196 0.0042813387 0.038224344 0.51024759 0.51023901 1.7607156 0 -2213.3374 0.27595612 -2213.0413 0.50453988 -2212.8271 1 -2213.3374 +83 0.033344977 0.012868685 0.0020880608 0.0031645847 0.010250413 0.51014677 0.5101421 1.7602601 0 -2213.3374 0.26053624 -2213.067 0.50358775 -2212.8272 1 -2213.3374 +93 0.013254873 0.0038176141 0.0014928226 0.0022407967 0.0058577818 0.51011371 0.51011138 1.7601272 0 -2213.3374 0.25452741 -2213.0774 0.50382161 -2212.8273 1 -2213.3374 +95 0.0099964951 0.0031053214 0.0014131665 0.0021184362 0.0053683638 0.51011105 0.51010897 1.7601232 0 -2213.3374 0.2540975 -2213.0781 0.50387313 -2212.8273 1 -2213.3374 diff --git a/examples/neb/log.5Oct16.neb.hop1.g++.4 b/examples/neb/log.5Oct16.neb.hop1.g++.4 deleted file mode 100644 index c678e69493..0000000000 --- a/examples/neb/log.5Oct16.neb.hop1.g++.4 +++ /dev/null @@ -1,10 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 4 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -0 4327.2753 2746.3378 0.3387091 5.0075576 4514.5424 0.42933428 0.42323635 1.8941131 0 -3.0535948 0.33333333 -2.6242605 0.66666667 -2.7623811 1 -3.0474969 -100 0.10482184 0.085218486 0.014588241 0.066178594 0.19602237 0.0070900402 0.0022691875 2.3031875 0 -3.0535967 0.31839181 -3.0473647 0.63987598 -3.0465067 1 -3.0487759 -111 0.096708467 0.07803707 0.013922973 0.05417562 0.2023467 0.0070871172 0.0022668002 2.3052945 0 -3.0535968 0.31853431 -3.0473633 0.64178871 -3.0465096 1 -3.0487764 -Climbing replica = 3 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -111 0.2023467 0.1777038 0.013922973 0.05417562 0.2023467 0.0070871172 0.0022668002 2.3052945 0 -3.0535968 0.31853431 -3.0473633 0.64178871 -3.0465096 1 -3.0487764 -179 0.096874474 0.090676856 0.01040177 0.023364005 0.096874474 0.0071047642 0.0022856172 2.3122768 0 -3.0535969 0.31577311 -3.0473955 0.61798541 -3.0464922 1 -3.0487778 diff --git a/examples/neb/log.5Oct16.neb.hop1.g++.8 b/examples/neb/log.5Oct16.neb.hop1.g++.8 deleted file mode 100644 index d70f02bd16..0000000000 --- a/examples/neb/log.5Oct16.neb.hop1.g++.8 +++ /dev/null @@ -1,10 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 4 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -0 4327.2753 2746.3378 0.3387091 5.0075576 4514.5424 0.42933428 0.42323635 1.8941131 0 -3.0535948 0.33333333 -2.6242605 0.66666667 -2.7623811 1 -3.0474969 -100 0.10482171 0.085218406 0.014588234 0.066178435 0.19602242 0.0070900401 0.0022691875 2.3031875 0 -3.0535967 0.31839181 -3.0473647 0.639876 -3.0465067 1 -3.0487759 -111 0.096708718 0.078036984 0.013922966 0.054175505 0.20234693 0.0070871172 0.0022668002 2.3052946 0 -3.0535968 0.31853431 -3.0473633 0.64178873 -3.0465096 1 -3.0487764 -Climbing replica = 3 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -111 0.20234693 0.17770387 0.013922966 0.054175505 0.20234693 0.0070871172 0.0022668002 2.3052946 0 -3.0535968 0.31853431 -3.0473633 0.64178873 -3.0465096 1 -3.0487764 -178 0.09975409 0.093814031 0.010577358 0.024247224 0.09975409 0.0071042931 0.0022851195 2.312004 0 -3.0535969 0.31607934 -3.0473923 0.618931 -3.0464926 1 -3.0487777 diff --git a/examples/neb/log.5Oct16.neb.hop2.g++.4 b/examples/neb/log.5Oct16.neb.hop2.g++.4 deleted file mode 100644 index 9977287303..0000000000 --- a/examples/neb/log.5Oct16.neb.hop2.g++.4 +++ /dev/null @@ -1,18 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 4 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -0 14.104748 10.419633 0.24852044 5.0039071 8.2116049 0.0018276223 0.00064050211 0.98401186 0 -3.0514921 0.33333333 -3.0496673 0.66666667 -3.0496645 1 -3.050305 -100 0.24646695 0.10792196 0.01781018 0.098854684 0.63725646 0.001516756 0.0015151635 1.165391 0 -3.0514939 0.2890334 -3.0503533 0.59718494 -3.0499771 1 -3.0514923 -200 0.061777741 0.050288749 0.012466513 0.020420207 0.88741041 0.0014465772 0.0014462528 1.1692938 0 -3.0514941 0.29975094 -3.0503052 0.62768286 -3.0500476 1 -3.0514938 -300 0.056346766 0.030000618 0.0093152917 0.013765031 1.0101529 0.0014069751 0.0014068154 1.1699608 0 -3.0514942 0.30992449 -3.0502613 0.64174291 -3.0500873 1 -3.0514941 -400 0.025589489 0.015671005 0.0061287063 0.008588518 1.1136424 0.001370987 0.0013709154 1.1704204 0 -3.0514943 0.32016645 -3.0502198 0.65324019 -3.0501233 1 -3.0514943 -500 0.014778626 0.0092108366 0.0042668521 0.0059963914 1.1636579 0.0013527466 0.0013527072 1.1706283 0 -3.0514944 0.32550275 -3.0501993 0.65875414 -3.0501416 1 -3.0514943 -600 0.08786211 0.020876327 0.0031421548 0.0051657363 1.1898894 0.0013430848 0.0013430599 1.1707681 0 -3.0514944 0.32831927 -3.0501889 0.66160681 -3.0501513 1 -3.0514944 -633 0.0098132678 0.0055392541 0.0030063464 0.0043091323 1.1924486 0.0013420127 0.0013419893 1.1707818 
0 -3.0514944 0.32862625 -3.0501878 0.66191769 -3.0501524 1 -3.0514944 -Climbing replica = 3 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -633 1.1924486 1.1648685 0.0030063464 0.0043091323 1.1924486 0.0013420127 0.0013419893 1.1707818 0 -3.0514944 0.32862625 -3.0501878 0.66191769 -3.0501524 1 -3.0514944 -733 0.095331134 0.089136608 0.0021551441 0.0031844438 0.043042998 0.0016022317 0.0016022168 1.170789 0 -3.0514944 0.29157063 -3.0503375 0.50358402 -3.0498922 1 -3.0514944 -833 0.10539135 0.030724373 0.0013749699 0.002221013 0.10539135 0.0016019798 0.001601971 1.1732118 0 -3.0514944 0.26249002 -3.0504848 0.50415223 -3.0498924 1 -3.0514944 -933 0.01883894 0.011496399 0.0011058925 0.0018178041 0.014621806 0.0016018934 0.0016018865 1.173866 0 -3.0514944 0.25788763 -3.0505113 0.50466375 -3.0498925 1 -3.0514944 -996 0.0082457876 0.0036336551 0.00077325986 0.0013910671 0.0068823708 0.0016018293 0.0016018244 1.174511 0 -3.0514944 0.2544553 -3.0505324 0.50520462 -3.0498926 1 -3.0514944 diff --git a/examples/neb/log.5Oct16.neb.hop2.g++.8 b/examples/neb/log.5Oct16.neb.hop2.g++.8 deleted file mode 100644 index 9977287303..0000000000 --- a/examples/neb/log.5Oct16.neb.hop2.g++.8 +++ /dev/null @@ -1,18 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 4 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -0 14.104748 10.419633 0.24852044 5.0039071 8.2116049 0.0018276223 0.00064050211 0.98401186 0 -3.0514921 0.33333333 -3.0496673 0.66666667 -3.0496645 1 -3.050305 -100 0.24646695 0.10792196 0.01781018 0.098854684 0.63725646 0.001516756 0.0015151635 1.165391 0 -3.0514939 0.2890334 -3.0503533 0.59718494 -3.0499771 1 -3.0514923 -200 0.061777741 0.050288749 0.012466513 0.020420207 0.88741041 0.0014465772 0.0014462528 1.1692938 0 -3.0514941 0.29975094 -3.0503052 0.62768286 -3.0500476 1 -3.0514938 -300 0.056346766 0.030000618 0.0093152917 0.013765031 1.0101529 0.0014069751 0.0014068154 1.1699608 0 -3.0514942 0.30992449 -3.0502613 0.64174291 -3.0500873 1 -3.0514941 -400 0.025589489 0.015671005 0.0061287063 0.008588518 1.1136424 0.001370987 0.0013709154 1.1704204 0 -3.0514943 0.32016645 -3.0502198 0.65324019 -3.0501233 1 -3.0514943 -500 0.014778626 0.0092108366 0.0042668521 0.0059963914 1.1636579 0.0013527466 0.0013527072 1.1706283 0 -3.0514944 0.32550275 -3.0501993 0.65875414 -3.0501416 1 -3.0514943 -600 0.08786211 0.020876327 0.0031421548 0.0051657363 1.1898894 0.0013430848 0.0013430599 1.1707681 0 -3.0514944 0.32831927 -3.0501889 0.66160681 -3.0501513 1 -3.0514944 -633 0.0098132678 0.0055392541 0.0030063464 0.0043091323 1.1924486 0.0013420127 0.0013419893 1.1707818 0 -3.0514944 0.32862625 -3.0501878 0.66191769 -3.0501524 1 -3.0514944 -Climbing replica = 3 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -633 1.1924486 1.1648685 0.0030063464 0.0043091323 1.1924486 0.0013420127 0.0013419893 1.1707818 0 -3.0514944 0.32862625 -3.0501878 0.66191769 -3.0501524 1 -3.0514944 -733 0.095331134 0.089136608 0.0021551441 0.0031844438 0.043042998 0.0016022317 0.0016022168 1.170789 0 -3.0514944 0.29157063 -3.0503375 0.50358402 -3.0498922 1 -3.0514944 -833 0.10539135 0.030724373 0.0013749699 0.002221013 0.10539135 0.0016019798 0.001601971 1.1732118 0 -3.0514944 0.26249002 -3.0504848 0.50415223 -3.0498924 1 -3.0514944 -933 0.01883894 0.011496399 0.0011058925 0.0018178041 0.014621806 0.0016018934 0.0016018865 1.173866 0 -3.0514944 0.25788763 -3.0505113 0.50466375 -3.0498925 1 -3.0514944 -996 0.0082457876 0.0036336551 0.00077325986 0.0013910671 0.0068823708 0.0016018293 0.0016018244 1.174511 0 -3.0514944 0.2544553 -3.0505324 0.50520462 -3.0498926 1 -3.0514944 diff --git a/examples/neb/log.5Oct16.neb.sivac.g++.3 b/examples/neb/log.5Oct16.neb.sivac.g++.3 deleted file mode 100644 index f6adae4a18..0000000000 --- a/examples/neb/log.5Oct16.neb.sivac.g++.3 +++ /dev/null @@ -1,14 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 3 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -0 7.5525391 1.6345605 0.16683659 7.5525391 7.5525391 1.5383951 0 1.6207355 0 -2213.3343 0.5 -2212.4096 1 -2211.7959 -10 0.27332818 0.040944923 0.039164338 0.27332818 0.17804882 0.51235911 0.497084 1.6790474 0 -2213.3364 0.49024121 -2212.824 1 -2213.3211 -20 0.1820396 0.018049916 0.024428411 0.1820396 0.08601739 0.51038174 0.5080746 1.7224961 0 -2213.337 0.49199582 -2212.8266 1 -2213.3347 -30 0.043288796 0.0068108825 0.017372479 0.043288796 0.049466709 0.51032316 0.5095943 1.7304745 0 -2213.3371 0.49553568 -2212.8268 1 -2213.3364 -40 0.0421393 0.0037035761 0.01173707 0.0421393 0.026104735 0.51022733 0.5100163 1.7366752 0 -2213.3373 0.49838067 -2212.8271 1 -2213.3371 -50 0.025897844 0.0022804241 0.0081056535 0.025897844 0.016908913 0.5101712 0.51008591 1.739143 0 -2213.3373 0.49923344 -2212.8272 1 -2213.3373 -59 0.00962839 0.0012946076 0.005657505 0.009365729 0.012040803 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955698 -2212.8272 1 -2213.3373 -Climbing replica = 2 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -59 0.012040803 0.0031505502 0.005657505 0.009365729 0.012040803 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955698 -2212.8272 1 -2213.3373 -63 0.009152118 0.0016692472 0.0049645771 0.0081967836 0.009152118 0.51013743 0.51010776 1.7409028 0 -2213.3374 0.50022239 -2212.8272 1 -2213.3373 diff --git a/examples/neb/log.5Oct16.neb.sivac.g++.6 b/examples/neb/log.5Oct16.neb.sivac.g++.6 deleted file mode 100644 index e00069d052..0000000000 --- a/examples/neb/log.5Oct16.neb.sivac.g++.6 +++ /dev/null @@ -1,14 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 3 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -0 7.5525391 1.6345605 0.16683659 7.5525391 7.5525391 1.5383951 0 1.6207355 0 -2213.3343 0.5 -2212.4096 1 -2211.7959 -10 0.27332818 0.040944923 0.039164338 0.27332818 0.17804882 0.51235911 0.497084 1.6790474 0 -2213.3364 0.49024121 -2212.824 1 -2213.3211 -20 0.1820396 0.018049916 0.024428411 0.1820396 0.08601739 0.51038174 0.5080746 1.7224961 0 -2213.337 0.49199582 -2212.8266 1 -2213.3347 -30 0.043288796 0.0068108825 0.017372479 0.043288796 0.049466709 0.51032316 0.5095943 1.7304745 0 -2213.3371 0.49553568 -2212.8268 1 -2213.3364 -40 0.042139305 0.0037035764 0.01173707 0.042139305 0.026104735 0.51022733 0.5100163 1.7366752 0 -2213.3373 0.49838067 -2212.8271 1 -2213.3371 -50 0.025899631 0.0022805513 0.0081057075 0.025899631 0.016908929 0.5101712 0.51008591 1.739143 0 -2213.3373 0.49923345 -2212.8272 1 -2213.3373 -59 0.0096285044 0.0012946258 0.0056576061 0.0093678253 0.012040919 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955698 -2212.8272 1 -2213.3373 -Climbing replica = 2 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... RDN PEN -59 0.012040919 0.0031505771 0.0056576061 0.0093678253 0.012040919 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955698 -2212.8272 1 -2213.3373 -63 0.0091523813 0.0016692845 0.0049647607 0.0081998372 0.0091523813 0.51013743 0.51010775 1.7409028 0 -2213.3374 0.50022236 -2212.8272 1 -2213.3373 diff --git a/examples/neb/log.5Oct16.neb.sivac.g++.9 b/examples/neb/log.5Oct16.neb.sivac.g++.9 deleted file mode 100644 index 31ab7c9ac6..0000000000 --- a/examples/neb/log.5Oct16.neb.sivac.g++.9 +++ /dev/null @@ -1,14 +0,0 @@ -LAMMPS (5 Oct 2016) -Running on 3 partitions of processors -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -0 7.5525391 1.6345605 0.16683659 7.5525391 7.5525391 1.5383951 0 1.6207355 0 -2213.3343 0.5 -2212.4096 1 -2211.7959 -10 0.27332818 0.040944923 0.039164338 0.27332818 0.17804882 0.51235911 0.497084 1.6790474 0 -2213.3364 0.49024121 -2212.824 1 -2213.3211 -20 0.1820396 0.018049916 0.024428411 0.1820396 0.08601739 0.51038174 0.5080746 1.7224961 0 -2213.337 0.49199582 -2212.8266 1 -2213.3347 -30 0.043288796 0.0068108825 0.017372479 0.043288796 0.049466709 0.51032316 0.5095943 1.7304745 0 -2213.3371 0.49553568 -2212.8268 1 -2213.3364 -40 0.042139318 0.0037035773 0.011737071 0.042139318 0.026104737 0.51022733 0.5100163 1.7366752 0 -2213.3373 0.49838067 -2212.8271 1 -2213.3371 -50 0.025904121 0.0022808707 0.0081058431 0.025904121 0.016908969 0.5101712 0.51008591 1.7391431 0 -2213.3373 0.49923346 -2212.8272 1 -2213.3373 -59 0.0096287928 0.0012946716 0.005657861 0.0093731008 0.01204121 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955696 -2212.8272 1 -2213.3373 -Climbing replica = 2 -Step MaxReplicaForce MaxAtomForce GradV0 GradV1 GradVc EBF EBR RDT RD1 PE1 RD2 PE2 ... 
RDN PEN -59 0.01204121 0.0031506449 0.005657861 0.0093731008 0.01204121 0.51014185 0.51010207 1.7404554 0 -2213.3374 0.49955696 -2212.8272 1 -2213.3373 -63 0.0091530442 0.0016693787 0.0049652227 0.0082075097 0.0091530442 0.51013743 0.51010775 1.7409027 0 -2213.3374 0.50022228 -2212.8272 1 -2213.3373 diff --git a/src/CORESHELL/compute_temp_cs.h b/src/CORESHELL/compute_temp_cs.h index 5a1d1434c3..3e93e4a68c 100644 --- a/src/CORESHELL/compute_temp_cs.h +++ b/src/CORESHELL/compute_temp_cs.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/CORESHELL/pair_born_coul_long_cs.h b/src/CORESHELL/pair_born_coul_long_cs.h index d2c8c04849..68c29e4fc2 100644 --- a/src/CORESHELL/pair_born_coul_long_cs.h +++ b/src/CORESHELL/pair_born_coul_long_cs.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/CORESHELL/pair_buck_coul_long_cs.h b/src/CORESHELL/pair_buck_coul_long_cs.h index 7f0bc149c1..d6b117d677 100644 --- a/src/CORESHELL/pair_buck_coul_long_cs.h +++ b/src/CORESHELL/pair_buck_coul_long_cs.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/GPU/pair_lj_cubic_gpu.h b/src/GPU/pair_lj_cubic_gpu.h index 1591eb8b9e..cdfc157e8e 
100644 --- a/src/GPU/pair_lj_cubic_gpu.h +++ b/src/GPU/pair_lj_cubic_gpu.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/GPU/pair_tersoff_gpu.h b/src/GPU/pair_tersoff_gpu.h index 4fa358a6b1..ed3dadef5d 100644 --- a/src/GPU/pair_tersoff_gpu.h +++ b/src/GPU/pair_tersoff_gpu.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/GPU/pair_tersoff_mod_gpu.h b/src/GPU/pair_tersoff_mod_gpu.h index 6d3017669a..3967e90a70 100644 --- a/src/GPU/pair_tersoff_mod_gpu.h +++ b/src/GPU/pair_tersoff_mod_gpu.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/GPU/pair_tersoff_zbl_gpu.h b/src/GPU/pair_tersoff_zbl_gpu.h index 003e037bba..ba923ffd2f 100644 --- a/src/GPU/pair_tersoff_zbl_gpu.h +++ b/src/GPU/pair_tersoff_zbl_gpu.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/GPU/pair_zbl_gpu.h b/src/GPU/pair_zbl_gpu.h index 
950fe952dd..3e6ac37394 100644 --- a/src/GPU/pair_zbl_gpu.h +++ b/src/GPU/pair_zbl_gpu.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/KOKKOS/fix_reaxc_species_kokkos.cpp b/src/KOKKOS/fix_reaxc_species_kokkos.cpp index ce84de30cb..8b778ecf65 100644 --- a/src/KOKKOS/fix_reaxc_species_kokkos.cpp +++ b/src/KOKKOS/fix_reaxc_species_kokkos.cpp @@ -48,7 +48,7 @@ FixReaxCSpeciesKokkos::FixReaxCSpeciesKokkos(LAMMPS *lmp, int narg, char **arg) { kokkosable = 1; atomKK = (AtomKokkos *) atom; - + // NOTE: Could improve performance if a Kokkos version of ComputeSpecAtom is added datamask_read = X_MASK | V_MASK | Q_MASK | MASK_MASK; @@ -116,35 +116,30 @@ void FixReaxCSpeciesKokkos::FindMolecule() done = 1; for (ii = 0; ii < inum; ii++) { - i = ilist[ii]; - if (!(mask[i] & groupbit)) continue; + i = ilist[ii]; + if (!(mask[i] & groupbit)) continue; - itype = atom->type[i]; + itype = atom->type[i]; for (jj = 0; jj < MAXSPECBOND; jj++) { - j = reaxc->tmpid[i][jj]; + j = reaxc->tmpid[i][jj]; - if (j < i) continue; - if (!(mask[j] & groupbit)) continue; + if ((j == 0) && (j < i)) continue; + if (!(mask[j] & groupbit)) continue; - if (clusterID[i] == clusterID[j] && PBCconnected[i] == PBCconnected[j] - && x0[i].x == x0[j].x && x0[i].y == x0[j].y && x0[i].z == x0[j].z) continue; + if (clusterID[i] == clusterID[j] + && x0[i].x == x0[j].x && x0[i].y == x0[j].y && x0[i].z == x0[j].z) continue; jtype = atom->type[j]; - bo_cut = BOCut[itype][jtype]; - bo_tmp = spec_atom[i][jj+7]; + bo_cut = BOCut[itype][jtype]; + bo_tmp = spec_atom[i][jj+7]; - if (bo_tmp > bo_cut) { + if (bo_tmp > bo_cut) { clusterID[i] = clusterID[j] = MIN(clusterID[i], clusterID[j]); - PBCconnected[i] = PBCconnected[j] = 
MAX(PBCconnected[i], PBCconnected[j]); x0[i] = x0[j] = chAnchor(x0[i], x0[j]); - if ((fabs(spec_atom[i][1] - spec_atom[j][1]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][2] - spec_atom[j][2]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][3] - spec_atom[j][3]) > reaxc->control->bond_cut)) - PBCconnected[i] = PBCconnected[j] = 1; - done = 0; - } - } + done = 0; + } + } } if (!done) change = 1; if (done) break; diff --git a/src/KOKKOS/pair_buck_kokkos.h b/src/KOKKOS/pair_buck_kokkos.h index d57e320e99..2691f10929 100644 --- a/src/KOKKOS/pair_buck_kokkos.h +++ b/src/KOKKOS/pair_buck_kokkos.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/KOKKOS/pair_sw_kokkos.h b/src/KOKKOS/pair_sw_kokkos.h index d899edfc1b..b94e39335f 100644 --- a/src/KOKKOS/pair_sw_kokkos.h +++ b/src/KOKKOS/pair_sw_kokkos.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/KOKKOS/pair_vashishta_kokkos.h b/src/KOKKOS/pair_vashishta_kokkos.h index 49c936185d..174db2cb94 100644 --- a/src/KOKKOS/pair_vashishta_kokkos.h +++ b/src/KOKKOS/pair_vashishta_kokkos.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git 
a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp index 6e17a9bbd7..30d8ab64b6 100644 --- a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp +++ b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp @@ -25,6 +25,7 @@ #include #include "pair_lj_charmmfsw_coul_long.h" #include "atom.h" +#include "update.h" #include "comm.h" #include "force.h" #include "kspace.h" @@ -61,6 +62,15 @@ PairLJCharmmfswCoulLong::PairLJCharmmfswCoulLong(LAMMPS *lmp) : Pair(lmp) // short-range/long-range flag accessed by DihedralCharmmfsw dihedflag = 1; + + // switch qqr2e from LAMMPS value to CHARMM value + + if (strcmp(update->unit_style,"real") == 0) { + if ((comm->me == 0) && (force->qqr2e != force->qqr2e_charmm_real)) + error->message(FLERR,"Switching to CHARMM coulomb energy" + " conversion constant"); + force->qqr2e = force->qqr2e_charmm_real; + } } /* ---------------------------------------------------------------------- */ @@ -87,6 +97,15 @@ PairLJCharmmfswCoulLong::~PairLJCharmmfswCoulLong() } if (ftable) free_tables(); } + + // switch qqr2e back from CHARMM value to LAMMPS value + + if (update && strcmp(update->unit_style,"real") == 0) { + if ((comm->me == 0) && (force->qqr2e == force->qqr2e_charmm_real)) + error->message(FLERR,"Restoring original LAMMPS coulomb energy" + " conversion constant"); + force->qqr2e = force->qqr2e_lammps_real; + } } /* ---------------------------------------------------------------------- */ diff --git a/src/KSPACE/pair_lj_long_tip4p_long.cpp b/src/KSPACE/pair_lj_long_tip4p_long.cpp index d2a6b801fc..1dc1ca1cb4 100644 --- a/src/KSPACE/pair_lj_long_tip4p_long.cpp +++ b/src/KSPACE/pair_lj_long_tip4p_long.cpp @@ -1337,8 +1337,8 @@ void PairLJLongTIP4PLong::compute_outer(int eflag, int vflag) fH[1] = 0.5 * alpha * fd[1]; fH[2] = 0.5 * alpha * fd[2]; - xH1 = x[jH1]; - xH2 = x[jH2]; + xH1 = x[iH1]; + xH2 = x[iH2]; v[0] = x[i][0]*fO[0] + xH1[0]*fH[0] + xH2[0]*fH[0]; v[1] = x[i][1]*fO[1] + xH1[1]*fH[1] + xH2[1]*fH[1]; v[2] = 
x[i][2]*fO[2] + xH1[2]*fH[2] + xH2[2]*fH[2]; diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 2cb37ed9fe..ac8279949a 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ +CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor index b7f3cd6846..db5de83a06 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ +CCFLAGS = -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict \ -qoverride-limits $(MIC_OPT) SHFLAGS = -fPIC diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 3bc777592e..881c51f0e4 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ +CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MANYBODY/pair_airebo.cpp b/src/MANYBODY/pair_airebo.cpp index d83f5a39a8..0ca80c6b76 100644 --- a/src/MANYBODY/pair_airebo.cpp +++ b/src/MANYBODY/pair_airebo.cpp @@ -1271,7 +1271,7 @@ double PairAIREBO::bondorder(int i, int j, double rij[3], double w21,dw21,r34[3],r34mag,cos234,w34,dw34; double 
cross321[3],cross234[3],prefactor,SpN; double fcijpc,fcikpc,fcjlpc,fcjkpc,fcilpc; - double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom; + double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa2,at2,cw,cwnum,cwnom; double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2; double dctik,dctjk,dctjl,dctij,dctji,dctil,rik2i,rjl2i,sink2i,sinl2i; double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil,dt1dij; @@ -1856,8 +1856,6 @@ double PairAIREBO::bondorder(int i, int j, double rij[3], aa = (prefactor*2.0*cw/cwnom)*w21*w34 * (1.0-tspjik)*(1.0-tspijl); - aaa1 = -prefactor*(1.0-square(om1234)) * - (1.0-tspjik)*(1.0-tspijl); aaa2 = -prefactor*(1.0-square(om1234)) * w21*w34; at2 = aa*cwnum; @@ -2107,7 +2105,7 @@ double PairAIREBO::bondorderLJ(int i, int j, double rij[3], double rijmag, double w21,dw21,r34[3],r34mag,cos234,w34,dw34; double cross321[3],cross234[3],prefactor,SpN; double fcikpc,fcjlpc,fcjkpc,fcilpc; - double dt2dik[3],dt2djl[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom; + double dt2dik[3],dt2djl[3],aa,aaa2,at2,cw,cwnum,cwnom; double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2; double dctik,dctjk,dctjl,dctil,rik2i,rjl2i,sink2i,sinl2i; double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil; @@ -2800,8 +2798,6 @@ double PairAIREBO::bondorderLJ(int i, int j, double rij[3], double rijmag, aa = (prefactor*2.0*cw/cwnom)*w21*w34 * (1.0-tspjik)*(1.0-tspijl); - aaa1 = -prefactor*(1.0-square(om1234)) * - (1.0-tspjik)*(1.0-tspijl); aaa2 = -prefactor*(1.0-square(om1234)) * w21*w34; at2 = aa*cwnum; diff --git a/src/MANYBODY/pair_bop.h b/src/MANYBODY/pair_bop.h index d55d9a79a4..f50c5edd00 100644 --- a/src/MANYBODY/pair_bop.h +++ b/src/MANYBODY/pair_bop.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git 
a/src/MANYBODY/pair_polymorphic.h b/src/MANYBODY/pair_polymorphic.h index 9b7fe761bb..9917bcd96d 100644 --- a/src/MANYBODY/pair_polymorphic.h +++ b/src/MANYBODY/pair_polymorphic.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/MANYBODY/pair_vashishta_table.h b/src/MANYBODY/pair_vashishta_table.h index a45cac5ae1..8c52f967cb 100644 --- a/src/MANYBODY/pair_vashishta_table.h +++ b/src/MANYBODY/pair_vashishta_table.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/MC/fix_atom_swap.h b/src/MC/fix_atom_swap.h index 25208a2b5a..74720d6222 100644 --- a/src/MC/fix_atom_swap.h +++ b/src/MC/fix_atom_swap.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/MC/fix_gcmc.h b/src/MC/fix_gcmc.h index 8a5375eed7..3656a1df58 100644 --- a/src/MC/fix_gcmc.h +++ b/src/MC/fix_gcmc.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/MC/fix_tfmc.h 
b/src/MC/fix_tfmc.h index fee3a944cd..d4f121eb90 100644 --- a/src/MC/fix_tfmc.h +++ b/src/MC/fix_tfmc.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/MOLECULE/dihedral_charmm.cpp b/src/MOLECULE/dihedral_charmm.cpp index b9d1c440d4..35953a6ac4 100644 --- a/src/MOLECULE/dihedral_charmm.cpp +++ b/src/MOLECULE/dihedral_charmm.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "dihedral_charmm.h" #include "atom.h" #include "comm.h" @@ -26,6 +27,7 @@ #include "force.h" #include "pair.h" #include "update.h" +#include "respa.h" #include "math_const.h" #include "memory.h" #include "error.h" @@ -368,10 +370,26 @@ void DihedralCharmm::coeff(int narg, char **arg) void DihedralCharmm::init_style() { + if (strstr(update->integrate_style,"respa")) { + Respa *r = (Respa *) update->integrate; + if (r->level_pair >= 0 && (r->level_pair != r->level_dihedral)) + error->all(FLERR,"Dihedral style charmm must be set to same" + " r-RESPA level as 'pair'"); + if (r->level_outer >= 0 && (r->level_outer != r->level_dihedral)) + error->all(FLERR,"Dihedral style charmm must be set to same" + " r-RESPA level as 'outer'"); + } + // insure use of CHARMM pair_style if any weight factors are non-zero // set local ptrs to LJ 14 arrays setup by Pair + // also verify that the correct 1-4 scaling is set if (weightflag) { + + if ((force->special_lj[3] != 0.0) || (force->special_coul[3] != 0.0)) + error->all(FLERR,"Must use 'special_bonds charmm' with" + " dihedral style charmm for use with CHARMM pair styles"); + int itmp; if (force->pair == NULL) error->all(FLERR,"Dihedral charmm is incompatible with Pair style"); diff --git a/src/MOLECULE/dihedral_charmmfsw.cpp 
b/src/MOLECULE/dihedral_charmmfsw.cpp index 613170bbfa..feb3e02bd4 100644 --- a/src/MOLECULE/dihedral_charmmfsw.cpp +++ b/src/MOLECULE/dihedral_charmmfsw.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include "dihedral_charmmfsw.h" #include "atom.h" #include "comm.h" @@ -29,6 +30,7 @@ #include "force.h" #include "pair.h" #include "update.h" +#include "respa.h" #include "math_const.h" #include "memory.h" #include "error.h" @@ -386,10 +388,26 @@ void DihedralCharmmfsw::coeff(int narg, char **arg) void DihedralCharmmfsw::init_style() { + if (strstr(update->integrate_style,"respa")) { + Respa *r = (Respa *) update->integrate; + if (r->level_pair >= 0 && (r->level_pair != r->level_dihedral)) + error->all(FLERR,"Dihedral style charmmfsw must be set to same" + " r-RESPA level as 'pair'"); + if (r->level_outer >= 0 && (r->level_outer != r->level_dihedral)) + error->all(FLERR,"Dihedral style charmmfsw must be set to same" + " r-RESPA level as 'outer'"); + } + // insure use of CHARMM pair_style if any weight factors are non-zero // set local ptrs to LJ 14 arrays setup by Pair + // also verify that the correct 1-4 scaling is set if (weightflag) { + + if ((force->special_lj[3] != 0.0) || (force->special_coul[3] != 0.0)) + error->all(FLERR,"Must use 'special_bonds charmm' with" + " dihedral style charmm for use with CHARMM pair styles"); + int itmp; if (force->pair == NULL) error->all(FLERR,"Dihedral charmmfsw is incompatible with Pair style"); diff --git a/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp b/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp index 1e34b06478..0d2159b671 100644 --- a/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp +++ b/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp @@ -25,6 +25,7 @@ #include #include "pair_lj_charmmfsw_coul_charmmfsh.h" #include "atom.h" +#include "update.h" #include "comm.h" #include "force.h" #include "neighbor.h" @@ -46,6 +47,15 @@ PairLJCharmmfswCoulCharmmfsh::PairLJCharmmfswCoulCharmmfsh(LAMMPS *lmp) : // 
short-range/long-range flag accessed by DihedralCharmmfsw dihedflag = 0; + + // switch qqr2e from LAMMPS value to CHARMM value + + if (strcmp(update->unit_style,"real") == 0) { + if ((comm->me == 0) && (force->qqr2e != force->qqr2e_charmm_real)) + error->message(FLERR,"Switching to CHARMM coulomb energy" + " conversion constant"); + force->qqr2e = force->qqr2e_charmm_real; + } } /* ---------------------------------------------------------------------- */ @@ -71,6 +81,15 @@ PairLJCharmmfswCoulCharmmfsh::~PairLJCharmmfswCoulCharmmfsh() memory->destroy(lj14_4); } } + + // switch qqr2e back from CHARMM value to LAMMPS value + + if (update && strcmp(update->unit_style,"real") == 0) { + if ((comm->me == 0) && (force->qqr2e == force->qqr2e_charmm_real)) + error->message(FLERR,"Restoring original LAMMPS coulomb energy" + " conversion constant"); + force->qqr2e = force->qqr2e_lammps_real; + } } /* ---------------------------------------------------------------------- */ diff --git a/src/REPLICA/fix_neb.cpp b/src/REPLICA/fix_neb.cpp index b17315ca0d..6daaf94710 100644 --- a/src/REPLICA/fix_neb.cpp +++ b/src/REPLICA/fix_neb.cpp @@ -34,6 +34,9 @@ using namespace FixConst; using namespace MathConst; enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC}; + +#define BUFSIZE 8 + /* ---------------------------------------------------------------------- */ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : @@ -46,55 +49,67 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : displacements(NULL) { - NEBLongRange=false; - StandardNEB=true; - PerpSpring=FreeEndIni=FreeEndFinal=false; - FreeEndFinalWithRespToEIni=FinalAndInterWithRespToEIni=false; - - kspringPerp=0.0; - kspring2=1.0; - if (narg < 4) - error->all(FLERR,"Illegal fix neb command, argument missing"); + if (narg < 4) error->all(FLERR,"Illegal fix neb command"); kspring = force->numeric(FLERR,arg[3]); - if (kspring <= 0.0) - error->all(FLERR,"Illegal fix neb command." 
- " The spring force was not provided properly"); + if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command"); - int iarg =4; + // optional params + + NEBLongRange = false; + StandardNEB = true; + PerpSpring = FreeEndIni = FreeEndFinal = false; + FreeEndFinalWithRespToEIni = FinalAndInterWithRespToEIni = false; + kspringPerp = 0.0; + kspringIni = 1.0; + kspringFinal = 1.0; + + int iarg = 4; while (iarg < narg) { - if (strcmp (arg[iarg],"nudg_style")==0) { - if (strcmp (arg[iarg+1],"idealpos")==0) { - NEBLongRange = true; - iarg+=2;} - else if (strcmp (arg[iarg+1],"neigh")==0) { - NEBLongRange = false; - StandardNEB = true; - iarg+=2;} - else error->all(FLERR,"Illegal fix neb command. Unknown keyword");} - else if (strcmp (arg[iarg],"perp")==0) { - PerpSpring=true; + if (strcmp(arg[iarg],"parallel") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command"); + if (strcmp(arg[iarg+1],"ideal") == 0) { + NEBLongRange = true; + StandardNEB = false; + } else if (strcmp(arg[iarg+1],"neigh") == 0) { + NEBLongRange = false; + StandardNEB = true; + } else error->all(FLERR,"Illegal fix neb command"); + iarg += 2; + + } else if (strcmp(arg[iarg],"perp") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command"); + PerpSpring = true; kspringPerp = force->numeric(FLERR,arg[iarg+1]); - if (kspringPerp < 0.0) - error->all(FLERR,"Illegal fix neb command. " - "The perpendicular spring force was not provided properly"); - iarg+=2;} - else if (strcmp (arg[iarg],"freeend")==0) { - if (strcmp (arg[iarg+1],"ini")==0) - FreeEndIni=true; - else if (strcmp (arg[iarg+1],"final")==0) - FreeEndFinal=true; - else if (strcmp (arg[iarg+1],"finaleini")==0) - FreeEndFinalWithRespToEIni=true; - else if (strcmp (arg[iarg+1],"final2eini")==0) { - FinalAndInterWithRespToEIni=true; - FreeEndFinalWithRespToEIni=true;} - else if (strcmp (arg[iarg+1],"none")!=0) error->all(FLERR,"Illegal fix neb command. 
Unknown keyword"); - iarg+=2;} - else if (strcmp (arg[iarg],"freeend_kspring")==0) { - kspring2=force->numeric(FLERR,arg[iarg+1]); - iarg+=2; } - else error->all(FLERR,"Illegal fix neb command. Unknown keyword"); + if (kspringPerp == 0.0) PerpSpring = false; + if (kspringPerp < 0.0) error->all(FLERR,"Illegal fix neb command"); + iarg += 2; + + } else if (strcmp (arg[iarg],"end") == 0) { + if (iarg+3 > narg) error->all(FLERR,"Illegal fix neb command"); + if (strcmp(arg[iarg+1],"first") == 0) { + FreeEndIni = true; + kspringIni = force->numeric(FLERR,arg[iarg+2]); + } else if (strcmp(arg[iarg+1],"last") == 0) { + FreeEndFinal = true; + FinalAndInterWithRespToEIni = false; + FreeEndFinalWithRespToEIni = false; + kspringFinal = force->numeric(FLERR,arg[iarg+2]); + } else if (strcmp(arg[iarg+1],"last/efirst") == 0) { + FreeEndFinal = false; + FinalAndInterWithRespToEIni = false; + FreeEndFinalWithRespToEIni = true; + kspringFinal = force->numeric(FLERR,arg[iarg+2]); + } else if (strcmp(arg[iarg+1],"last/efirst/middle") == 0) { + FreeEndFinal = false; + FinalAndInterWithRespToEIni = true; + FreeEndFinalWithRespToEIni = true; + kspringFinal = force->numeric(FLERR,arg[iarg+2]); + } else error->all(FLERR,"Illegal fix neb command"); + + iarg += 3; + + } else error->all(FLERR,"Illegal fix neb command"); } // nreplica = number of partitions @@ -119,12 +134,12 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : MPI_Group uworldgroup,rootgroup; if (NEBLongRange) { for (int i=0; iroot_proc[i]; + iroots[i] = universe->root_proc[i]; MPI_Comm_group(uworld, &uworldgroup); MPI_Group_incl(uworldgroup, nreplica, iroots, &rootgroup); MPI_Comm_create(uworld, rootgroup, &rootworld); } - delete[] iroots; + delete [] iroots; // create a new compute pe style // id = fix-ID + pe, compute group = all @@ -256,11 +271,11 @@ void FixNEB::min_post_force(int vflag) double delxp,delyp,delzp,delxn,delyn,delzn; double vIni=0.0; - vprev=vnext=veng=pe->compute_scalar(); + vprev = vnext = veng = 
pe->compute_scalar(); - if (ireplica < nreplica-1 && me ==0) + if (ireplica < nreplica-1 && me == 0) MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld); - if (ireplica > 0 && me ==0) + if (ireplica > 0 && me == 0) MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE); if (ireplica > 0 && me == 0) @@ -273,7 +288,7 @@ void FixNEB::min_post_force(int vflag) MPI_Bcast(&vnext,1,MPI_DOUBLE,0,world); } - if (FreeEndFinal && (update->ntimestep == 0)) EFinalIni = veng; + if (FreeEndFinal && ireplica == nreplica-1 && (update->ntimestep == 0)) EFinalIni = veng; if (ireplica == 0) vIni=veng; @@ -287,16 +302,19 @@ void FixNEB::min_post_force(int vflag) MPI_Bcast(&vIni,1,MPI_DOUBLE,0,world); } } - if (FreeEndIni && ireplica == 0) { - if (me == 0 ) + + if (FreeEndIni && ireplica == 0 && (update->ntimestep == 0)) EIniIni = veng; + /* if (FreeEndIni && ireplica == 0) { + // if (me == 0 ) if (update->ntimestep == 0) { EIniIni = veng; - if (cmode == MULTI_PROC) - MPI_Bcast(&EIniIni,1,MPI_DOUBLE,0,world); + // if (cmode == MULTI_PROC) + // MPI_Bcast(&EIniIni,1,MPI_DOUBLE,0,world); } - } + }*/ // communicate atoms to/from adjacent replicas to fill xprev,xnext + inter_replica_comm(); // trigger potential energy computation on next timestep @@ -335,10 +353,10 @@ void FixNEB::min_post_force(int vflag) tangent[i][0]=delxp; tangent[i][1]=delyp; tangent[i][2]=delzp; - tlen += tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; - dot += f[i][0]*tangent[i][0] - + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; } } @@ -360,10 +378,10 @@ void FixNEB::min_post_force(int vflag) tangent[i][0]=delxn; tangent[i][1]=delyn; tangent[i][2]=delzn; - tlen += tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; - dot += f[i][0]*tangent[i][0] - + 
f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; } } } else { @@ -388,13 +406,13 @@ void FixNEB::min_post_force(int vflag) domain->minimum_image(delxn,delyn,delzn); if (vnext > veng && veng > vprev) { - tangent[i][0]=delxn; - tangent[i][1]=delyn; - tangent[i][2]=delzn; + tangent[i][0] = delxn; + tangent[i][1] = delyn; + tangent[i][2] = delzn; } else if (vnext < veng && veng < vprev) { - tangent[i][0]=delxp; - tangent[i][1]=delyp; - tangent[i][2]=delzp; + tangent[i][0] = delxp; + tangent[i][1] = delyp; + tangent[i][2] = delzp; } else { if (vnext > vprev) { tangent[i][0] = vmax*delxn + vmin*delxp; @@ -408,24 +426,23 @@ void FixNEB::min_post_force(int vflag) } nlen += delxn*delxn + delyn*delyn + delzn*delzn; - tlen += tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; gradlen += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2]; dotpath += delxp*delxn + delyp*delyn + delzp*delzn; - dottangrad += tangent[i][0]* f[i][0] - + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2]; - gradnextlen += fnext[i][0]*fnext[i][0] - + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2]; - dotgrad += f[i][0]*fnext[i][0] - + f[i][1]*fnext[i][1] + f[i][2]*fnext[i][2]; + dottangrad += tangent[i][0]*f[i][0] + + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2]; + gradnextlen += fnext[i][0]*fnext[i][0] + + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2]; + dotgrad += f[i][0]*fnext[i][0] + f[i][1]*fnext[i][1] + + f[i][2]*fnext[i][2]; - springF[i][0]=kspringPerp*(delxn-delxp); - springF[i][1]=kspringPerp*(delyn-delyp); - springF[i][2]=kspringPerp*(delzn-delzp); + springF[i][0] = kspringPerp*(delxn-delxp); + springF[i][1] = kspringPerp*(delyn-delyp); + springF[i][2] = 
kspringPerp*(delzn-delzp); } } -#define BUFSIZE 8 double bufin[BUFSIZE], bufout[BUFSIZE]; bufin[0] = nlen; bufin[1] = plen; @@ -459,7 +476,7 @@ void FixNEB::min_post_force(int vflag) // first or last replica has no change to forces, just return - if(ireplica>0 && ireplica 0 && ireplica < nreplica-1) dottangrad = dottangrad/(tlen*gradlen); if (ireplica == 0) dottangrad = dottangrad/(nlen*gradlen); @@ -468,15 +485,14 @@ void FixNEB::min_post_force(int vflag) if (ireplica < nreplica-1) dotgrad = dotgrad /(gradlen*gradnextlen); - if (FreeEndIni && ireplica == 0) { if (tlen > 0.0) { double dotall; MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world); dot=dotall/tlen; - if (dot<0) prefactor = -dot - kspring2*(veng-EIniIni); - else prefactor = -dot + kspring2*(veng-EIniIni); + if (dot<0) prefactor = -dot - kspringIni*(veng-EIniIni); + else prefactor = -dot + kspringIni*(veng-EIniIni); for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { @@ -493,8 +509,8 @@ void FixNEB::min_post_force(int vflag) MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world); dot=dotall/tlen; - if (dot<0) prefactor = -dot - kspring2*(veng-EFinalIni); - else prefactor = -dot + kspring2*(veng-EFinalIni); + if (dot<0) prefactor = -dot - kspringFinal*(veng-EFinalIni); + else prefactor = -dot + kspringFinal*(veng-EFinalIni); for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { @@ -511,8 +527,8 @@ void FixNEB::min_post_force(int vflag) MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world); dot=dotall/tlen; - if (dot<0) prefactor = -dot - kspring2*(veng-vIni); - else prefactor = -dot + kspring2*(veng-vIni); + if (dot<0) prefactor = -dot - kspringFinal*(veng-vIni); + else prefactor = -dot + kspringFinal*(veng-vIni); for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { @@ -568,14 +584,15 @@ void FixNEB::min_post_force(int vflag) for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - dot += f[i][0]*tangent[i][0] - + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; - 
dotSpringTangent += springF[i][0]*tangent[i][0] - +springF[i][1]*tangent[i][1]+springF[i][2]*tangent[i][2];} + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; + dotSpringTangent += springF[i][0]*tangent[i][0] + + springF[i][1]*tangent[i][1] + springF[i][2]*tangent[i][2];} } double dotSpringTangentall; - MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,MPI_DOUBLE,MPI_SUM,world); + MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1, + MPI_DOUBLE,MPI_SUM,world); dotSpringTangent=dotSpringTangentall; double dotall; MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world); @@ -603,12 +620,12 @@ void FixNEB::min_post_force(int vflag) for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - f[i][0] += prefactor*tangent[i][0] - +AngularContr*(springF[i][0] -dotSpringTangent*tangent[i][0]); - f[i][1] += prefactor*tangent[i][1] - + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]); - f[i][2] += prefactor*tangent[i][2] - + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]); + f[i][0] += prefactor*tangent[i][0] + + AngularContr*(springF[i][0] - dotSpringTangent*tangent[i][0]); + f[i][1] += prefactor*tangent[i][1] + + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]); + f[i][2] += prefactor*tangent[i][2] + + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]); } } @@ -827,7 +844,6 @@ void FixNEB::inter_replica_comm() } } - /* ---------------------------------------------------------------------- reallocate xprev,xnext,tangent arrays if necessary reallocate communication arrays if necessary diff --git a/src/REPLICA/fix_neb.h b/src/REPLICA/fix_neb.h index 7e9e6db865..232790a1f0 100644 --- a/src/REPLICA/fix_neb.h +++ b/src/REPLICA/fix_neb.h @@ -38,7 +38,7 @@ class FixNEB : public Fix { private: int me,nprocs,nprocs_universe; - double kspring,kspring2,kspringPerp,EIniIni,EFinalIni; + double kspring,kspringIni,kspringFinal,kspringPerp,EIniIni,EFinalIni; bool 
StandardNEB,NEBLongRange,PerpSpring,FreeEndIni,FreeEndFinal; bool FreeEndFinalWithRespToEIni,FinalAndInterWithRespToEIni; int ireplica,nreplica; diff --git a/src/RIGID/fix_ehex.h b/src/RIGID/fix_ehex.h index 3220b77195..02f83df1af 100644 --- a/src/RIGID/fix_ehex.h +++ b/src/RIGID/fix_ehex.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/mf_oxdna.h b/src/USER-CGDNA/mf_oxdna.h index 642c325af9..56055d5fac 100644 --- a/src/USER-CGDNA/mf_oxdna.h +++ b/src/USER-CGDNA/mf_oxdna.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna2_coaxstk.h b/src/USER-CGDNA/pair_oxdna2_coaxstk.h index 477b35ee13..be8d6d6b37 100644 --- a/src/USER-CGDNA/pair_oxdna2_coaxstk.h +++ b/src/USER-CGDNA/pair_oxdna2_coaxstk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna2_dh.h b/src/USER-CGDNA/pair_oxdna2_dh.h index 3af355d503..b40346e1cf 100644 --- a/src/USER-CGDNA/pair_oxdna2_dh.h +++ b/src/USER-CGDNA/pair_oxdna2_dh.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- 
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna2_excv.h b/src/USER-CGDNA/pair_oxdna2_excv.h index 94e39a0fa2..f59daf8361 100644 --- a/src/USER-CGDNA/pair_oxdna2_excv.h +++ b/src/USER-CGDNA/pair_oxdna2_excv.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna2_stk.h b/src/USER-CGDNA/pair_oxdna2_stk.h index b78fc89d5e..7654e5db2f 100644 --- a/src/USER-CGDNA/pair_oxdna2_stk.h +++ b/src/USER-CGDNA/pair_oxdna2_stk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna_coaxstk.h b/src/USER-CGDNA/pair_oxdna_coaxstk.h index b12ef6e77b..f9228c94a2 100644 --- a/src/USER-CGDNA/pair_oxdna_coaxstk.h +++ b/src/USER-CGDNA/pair_oxdna_coaxstk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna_excv.h b/src/USER-CGDNA/pair_oxdna_excv.h index 0308c1f48e..ec9ddee3ec 100644 --- a/src/USER-CGDNA/pair_oxdna_excv.h +++ b/src/USER-CGDNA/pair_oxdna_excv.h @@ -1,4 +1,4 @@ -/* 
---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna_hbond.h b/src/USER-CGDNA/pair_oxdna_hbond.h index 409241710b..1c9f37bf50 100644 --- a/src/USER-CGDNA/pair_oxdna_hbond.h +++ b/src/USER-CGDNA/pair_oxdna_hbond.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna_stk.h b/src/USER-CGDNA/pair_oxdna_stk.h index fd0c27d38c..950c276228 100644 --- a/src/USER-CGDNA/pair_oxdna_stk.h +++ b/src/USER-CGDNA/pair_oxdna_stk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-CGDNA/pair_oxdna_xstk.h b/src/USER-CGDNA/pair_oxdna_xstk.h index c71962ab52..5c443a4dac 100644 --- a/src/USER-CGDNA/pair_oxdna_xstk.h +++ b/src/USER-CGDNA/pair_oxdna_xstk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DIFFRACTION/compute_saed.h b/src/USER-DIFFRACTION/compute_saed.h index 89e57f5097..87785c4936 100644 --- 
a/src/USER-DIFFRACTION/compute_saed.h +++ b/src/USER-DIFFRACTION/compute_saed.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DIFFRACTION/compute_saed_consts.h b/src/USER-DIFFRACTION/compute_saed_consts.h index 0cce0abfc2..0c07ae13ad 100644 --- a/src/USER-DIFFRACTION/compute_saed_consts.h +++ b/src/USER-DIFFRACTION/compute_saed_consts.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DIFFRACTION/compute_xrd.h b/src/USER-DIFFRACTION/compute_xrd.h index 92a59fcf23..61e1dae1bd 100644 --- a/src/USER-DIFFRACTION/compute_xrd.h +++ b/src/USER-DIFFRACTION/compute_xrd.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DIFFRACTION/compute_xrd_consts.h b/src/USER-DIFFRACTION/compute_xrd_consts.h index 1ca0d6bd66..582cecae01 100644 --- a/src/USER-DIFFRACTION/compute_xrd_consts.h +++ b/src/USER-DIFFRACTION/compute_xrd_consts.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National 
Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DIFFRACTION/fix_saed_vtk.h b/src/USER-DIFFRACTION/fix_saed_vtk.h index 294b003b0c..fa379e7216 100644 --- a/src/USER-DIFFRACTION/fix_saed_vtk.h +++ b/src/USER-DIFFRACTION/fix_saed_vtk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DPD/fix_dpd_energy.h b/src/USER-DPD/fix_dpd_energy.h index 9be41c3b9a..89ba84c08b 100644 --- a/src/USER-DPD/fix_dpd_energy.h +++ b/src/USER-DPD/fix_dpd_energy.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DPD/fix_rx.h b/src/USER-DPD/fix_rx.h index 5e226aec73..ca87fc51fd 100644 --- a/src/USER-DPD/fix_rx.h +++ b/src/USER-DPD/fix_rx.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h index 31d4ffb20b..45c046cc07 100644 --- a/src/USER-DPD/pair_exp6_rx.h +++ b/src/USER-DPD/pair_exp6_rx.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia 
National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-DPD/pair_multi_lucy_rx.h b/src/USER-DPD/pair_multi_lucy_rx.h index 5975bd6ccd..2bfa5d20e3 100644 --- a/src/USER-DPD/pair_multi_lucy_rx.h +++ b/src/USER-DPD/pair_multi_lucy_rx.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index e32a09c45c..c02014d0ce 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -4,6 +4,7 @@ -------------------------------- W. Michael Brown (Intel) michael.w.brown at intel.com + William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) Markus H�hnerbach (RWTH Aachen University) Stan Moore (Sandia) @@ -14,15 +15,25 @@ ----------------------------------------------------------------------------- -This package is based on the USER-OMP package and provides LAMMPS styles that: +This package provides LAMMPS styles that: 1. include support for single and mixed precision in addition to double. 2. include modifications to support vectorization for key routines + 3. include modifications for data layouts to improve cache efficiency 3. 
include modifications to support offload to Intel(R) Xeon Phi(TM) coprocessors ----------------------------------------------------------------------------- +For Intel server processors codenamed "Skylake", the following flags should +be added or changed in the Makefile depending on the version: + +2017 update 2 - No changes needed +2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 +2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high + +----------------------------------------------------------------------------- + When using the suffix command with "intel", intel styles will be used if they exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP USER-OMP styles will be used whenever USER-INTEL styles are not available. This diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index cf14fb3237..758c37bf56 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -4,6 +4,7 @@ # in.intel.lj - Atomic fluid (LJ Benchmark) # in.intel.rhodo - Protein (Rhodopsin Benchmark) # in.intel.lc - Liquid Crystal w/ Gay-Berne potential +# in.intel.eam - Copper benchmark with Embedded Atom Method # in.intel.sw - Silicon benchmark with Stillinger-Weber # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber @@ -11,19 +12,26 @@ ############################################################################# ############################################################################# -# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016 +# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017 +# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi # # Xeon E5-2697v4 Xeon Phi 7250 # -# in.intel.lj - 162.764 179.148 -# in.intel.rhodo - 11.633 13.668 -# in.intel.lc - 19.136 24.863 -# in.intel.sw - 139.048 152.026 -# in.intel.tersoff - 82.663 92.985 -# in.intel.water 
- 59.838 85.704 +# in.intel.lj - 199.5 282.3 +# in.intel.rhodo - 12.4 17.5 +# in.intel.lc - 19.0 25.7 +# in.intel.eam - 59.4 92.8 +# in.intel.sw - 132.4 161.9 +# in.intel.tersoff - 83.3 101.1 +# in.intel.water - 53.4 90.3 # ############################################################################# +############################################################################# +# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README +# for build flags that should be used. +############################################################################# + ############################################################################# # For Haswell (Xeon v3) architectures, depending on the compiler version, # it may give better performance to compile for an AVX target (with -xAVX @@ -42,7 +50,18 @@ # -v m 0.5 # Run for half as long ############################################################################# -# Example for running benchmarks: +############################################################################# +# The LAMMPS newton setting can be controlled from the commandline for the +# benchmarks with the N variable: +# +# -v N on # newton on +# -v N off # newton off +# +# The default is on for all of the benchmarks except for LJ where the off +# setting performs best with the USER-INTEL package +############################################################################# + +# Example for running benchmarks (see run_benchmarks.sh for script): # Number of physical cores per node not including hyperthreads export LMP_CORES=28 @@ -57,26 +76,35 @@ export LMP_BIN=../../lmp_intel_cpu # LAMMPS root directory export LMP_ROOT=../../../ -source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh +source source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh +export KMP_BLOCKTIME=0 export I_MPI_PIN_DOMAIN=core export I_MPI_FABRICS=shm # For single node +# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS +export I_MPI_SHM_LMT=shm + # Generate 
the restart file for use with liquid crystal benchmark mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none # Benchmark to run export bench=in.intel.lj +############################################################################# +# For Intel Xeon Phi x200 series processors best performance is achieved by +# using MCDRAM. In flat mode, this can be achieved with numactl, +# MPI environment variables, or other options provided by batch schedulers +############################################################################# ############################################################################# # To run without a optimization package ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on ############################################################################# # To run with USER-OMP package ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on ############################################################################# # To run with USER-INTEL package and no coprocessor @@ -89,6 +117,9 @@ mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel ############################################################################# -# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors +# If using PPPM (e.g. 
in.intel.rhodo) on Intel Xeon Phi x200 series +# or Skylake processors ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel +export KMP_AFFINITY=none +rthreads=$((OMP_NUM_THREADS-1)) +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam index e9523a5dd1..5a3b3064af 100644 --- a/src/USER-INTEL/TEST/in.intel.eam +++ b/src/USER-INTEL/TEST/in.intel.eam @@ -1,4 +1,6 @@ # bulk Cu lattice + +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 3100 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -13,6 +15,7 @@ variable z index 2 variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" variable xx equal 20*$x diff --git a/src/USER-INTEL/TEST/in.intel.lc b/src/USER-INTEL/TEST/in.intel.lc index 0172ba3b4d..411f5d830d 100644 --- a/src/USER-INTEL/TEST/in.intel.lc +++ b/src/USER-INTEL/TEST/in.intel.lc @@ -3,6 +3,7 @@ # shape: 2 1.5 1 # cutoff 4.0 with skin 0.8 +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 840 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -15,6 +16,7 @@ variable z index 2 variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units lj diff --git a/src/USER-INTEL/TEST/in.intel.lj b/src/USER-INTEL/TEST/in.intel.lj index 8931ca24bc..2b724f6014 100644 --- a/src/USER-INTEL/TEST/in.intel.lj +++ b/src/USER-INTEL/TEST/in.intel.lj @@ -1,5 +1,6 @@ # 3d Lennard-Jones melt +variable N index off # Newton Setting variable w index 10 # Warmup Timesteps variable t index 7900 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -15,6 +16,7 @@ variable yy equal 20*$y variable zz equal 20*$z 
variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units lj diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo index 7b3b092607..05145d79c0 100644 --- a/src/USER-INTEL/TEST/in.intel.rhodo +++ b/src/USER-INTEL/TEST/in.intel.rhodo @@ -1,5 +1,6 @@ # Rhodopsin model +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 520 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,10 +17,11 @@ variable z index 2 variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units real -neigh_modify delay 5 every 1 binsize $b +neigh_modify delay 5 every 1 atom_style full bond_style harmonic diff --git a/src/USER-INTEL/TEST/in.intel.sw b/src/USER-INTEL/TEST/in.intel.sw index 077c9bb4fb..494f58dea3 100644 --- a/src/USER-INTEL/TEST/in.intel.sw +++ b/src/USER-INTEL/TEST/in.intel.sw @@ -1,5 +1,6 @@ # bulk Si via Stillinger-Weber +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 6200 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,6 +17,7 @@ variable zz equal 10*$z variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units metal diff --git a/src/USER-INTEL/TEST/in.intel.tersoff b/src/USER-INTEL/TEST/in.intel.tersoff index f0c6a88f75..574b29f674 100644 --- a/src/USER-INTEL/TEST/in.intel.tersoff +++ b/src/USER-INTEL/TEST/in.intel.tersoff @@ -1,5 +1,6 @@ # bulk Si via Tersoff +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 2420 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,6 +17,7 @@ variable zz equal 10*$z variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units metal diff --git a/src/USER-INTEL/TEST/in.intel.water 
b/src/USER-INTEL/TEST/in.intel.water index 1c1fca311f..0643def19e 100644 --- a/src/USER-INTEL/TEST/in.intel.water +++ b/src/USER-INTEL/TEST/in.intel.water @@ -1,5 +1,6 @@ # Coarse-grain water simulation using Stillinger-Weber +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 2600 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -11,6 +12,7 @@ variable y index 2 variable z index 2 variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units real diff --git a/src/USER-INTEL/TEST/in.lc_generate_restart b/src/USER-INTEL/TEST/in.lc_generate_restart index 8ae53c5c8e..30d593f2cd 100644 --- a/src/USER-INTEL/TEST/in.lc_generate_restart +++ b/src/USER-INTEL/TEST/in.lc_generate_restart @@ -4,13 +4,13 @@ # cutoff 4.0 with skin 0.8 # NPT, T=2.4, P=8.0 -variable x index 1 -variable y index 1 -variable z index 1 +variable xt index 1 +variable yt index 1 +variable zt index 1 -variable i equal $x*32 -variable j equal $y*32 -variable k equal $z*32 +variable i equal ${xt}*32 +variable j equal ${yt}*32 +variable k equal ${zt}*32 units lj atom_style ellipsoid diff --git a/src/USER-INTEL/TEST/run_benchmarks.sh b/src/USER-INTEL/TEST/run_benchmarks.sh new file mode 100755 index 0000000000..10bd79e0d1 --- /dev/null +++ b/src/USER-INTEL/TEST/run_benchmarks.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +######################################################################### +# Adjust settings below for your system +######################################################################### + +# --------------------- MPI Launch Command + +export MPI="mpirun" +#export MPI="numactl -p 1 mpirun" # -- Systems w/ MCDRAM in flat mode + +# ------------- Name and location of the LAMMPS binary + +export LMP_BIN=../../lmp_intel_cpu_intelmpi +#export LMP_BIN=../../lmp_knl + +# ------------- Directory containing the LAMMPS installation + +export LMP_ROOT=../../../ + +# ------------- Number of physical cores (not HW 
threads) + +export LMP_CORES=36 # -- For Intel Xeon E5-2697v4 SKU +#export LMP_CORES=68 # -- For Intel Xeon Phi x200 7250 SKU + +# ------------- Number of HW threads to use in tests + +export LMP_THREAD_LIST="2" # -- For 2 threads per core w/ HT enabled +#export LMP_THREAD_LIST="2 4" # -- For 2 threads per core w/ HT enabled + +# ------------- MPI Tuning Parameters + +#export I_MPI_SHM_LMT=shm # -- Uncomment for Xeon Phi x200 series + +# ------------- Library locations for build + +#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh + +######################################################################### +# End settings for your system +######################################################################### + +export WORKLOADS="lj rhodo rhodo_lrt lc sw water eam" +export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1" +export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1" + +export LOG_DIR_HEADER=`echo $LMP_BIN | sed 's/\.\.\///g' | sed 's/\.\///g'` +export LOG_DIR_HOST=`hostname` +export DATE_STRING=`date +%s` +export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING +mkdir $LOG_DIR + +export I_MPI_PIN_DOMAIN=core +export I_MPI_FABRICS=shm +export KMP_BLOCKTIME=0 + +echo -n "Creating restart file...." +$MPI -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none $LMP_ARGS +echo "Done." 
+for threads in $LMP_THREAD_LIST +do + export OMP_NUM_THREADS=$threads + for workload in $WORKLOADS + do + export LOGFILE=$LOG_DIR/$workload.$LMP_CORES"c"$threads"t".log + echo "Running $LOGFILE" + cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $LMP_ARGS"; + rthreads=$threads + unset KMP_AFFINITY + $cmd + + # - For benchmarks with PPPM, also try LRT mode + if [ $workload = "rhodo" ]; then + export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log + cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS"; + rthreads=$((threads-1)) + export KMP_AFFINITY=none + export OMP_NUM_THREADS=$rthreads + echo " $cmd" >> $LOG_DIR/commands.info + $cmd + fi + done +done + +# Performance reported by LAMMPS (Timesteps/second ignoring warm-up run) +grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}' diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp index aafc765c6b..d55afd4742 100644 --- a/src/USER-INTEL/angle_charmm_intel.cpp +++ b/src/USER-INTEL/angle_charmm_intel.cpp @@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t; /* ---------------------------------------------------------------------- */ -AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) +AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) { suffix_flag |= Suffix::INTEL; } @@ -74,23 +74,23 @@ void AngleCharmmIntel::compute(int eflag, int vflag) template void AngleCharmmIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - 
eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -102,10 +102,10 @@ void AngleCharmmIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template -void AngleCharmmIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void AngleCharmmIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nanglelist; @@ -126,31 +126,42 @@ void AngleCharmmIntel::eval(const int vflag, const int nthreads = tc; acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeangle = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeangle = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int4_t * _noalias const anglelist = + const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seangle = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n 
= nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = anglelist[n].a; const int i2 = anglelist[n].b; const int i3 = anglelist[n].c; @@ -229,40 +240,58 @@ void AngleCharmmIntel::eval(const int vflag, // apply force to each of 3 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (EVFLAG) { - IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, - f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, - delx2, dely2, delz2, oeangle, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, seangle, + f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, + sv4, sv5); + #else + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, oeangle, + f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeangle += seangle; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeangle; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += 
ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oeangle; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -319,11 +348,11 @@ void AngleCharmmIntel::pack_force_const(ForceConst &fc, template void AngleCharmmIntel::ForceConst::set_ntypes(const int nangletypes, - Memory *memory) { + Memory *memory) { if (nangletypes != _nangletypes) { if (_nangletypes > 0) _memory->destroy(fc); - + if (nangletypes > 0) _memory->create(fc,nangletypes,"anglecharmmintel.fc"); } diff --git a/src/USER-INTEL/angle_charmm_intel.h b/src/USER-INTEL/angle_charmm_intel.h index a98007b3ef..342af31b8c 100644 --- a/src/USER-INTEL/angle_charmm_intel.h +++ b/src/USER-INTEL/angle_charmm_intel.h @@ -45,8 +45,8 @@ class AngleCharmmIntel : public AngleCharmm { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp index f101fd9e1f..47e0add690 100644 --- a/src/USER-INTEL/angle_harmonic_intel.cpp +++ b/src/USER-INTEL/angle_harmonic_intel.cpp @@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t; /* ---------------------------------------------------------------------- */ -AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) +AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) { suffix_flag |= Suffix::INTEL; } @@ -74,23 +74,23 @@ void AngleHarmonicIntel::compute(int eflag, int vflag) template void AngleHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || 
vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -102,10 +102,10 @@ void AngleHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template -void AngleHarmonicIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void AngleHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nanglelist; @@ -126,31 +126,42 @@ void AngleHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeangle = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeangle = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int4_t * _noalias const anglelist = + const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seangle, sv0, sv1, 
sv2, sv3, sv4, sv5; + if (EFLAG) seangle = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = anglelist[n].a; const int i2 = anglelist[n].b; const int i3 = anglelist[n].c; @@ -211,40 +222,58 @@ void AngleHarmonicIntel::eval(const int vflag, // apply force to each of 3 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (EVFLAG) { - IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, - f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, - delx2, dely2, delz2, oeangle, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, seangle, f, + NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, + sv5); + #else + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, oeangle, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, + ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeangle += seangle; + if (VFLAG && vflag) { + ov0 += 
sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeangle; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oeangle; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -299,11 +328,11 @@ void AngleHarmonicIntel::pack_force_const(ForceConst &fc, template void AngleHarmonicIntel::ForceConst::set_ntypes(const int nangletypes, - Memory *memory) { + Memory *memory) { if (nangletypes != _nangletypes) { if (_nangletypes > 0) _memory->destroy(fc); - + if (nangletypes > 0) _memory->create(fc,nangletypes,"anglecharmmintel.fc"); } diff --git a/src/USER-INTEL/angle_harmonic_intel.h b/src/USER-INTEL/angle_harmonic_intel.h index 340ea4b974..301fc7cc06 100644 --- a/src/USER-INTEL/angle_harmonic_intel.h +++ b/src/USER-INTEL/angle_harmonic_intel.h @@ -45,8 +45,8 @@ class AngleHarmonicIntel : public AngleHarmonic { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp index e61ab9be84..bb96135b2d 100644 --- a/src/USER-INTEL/bond_fene_intel.cpp +++ b/src/USER-INTEL/bond_fene_intel.cpp @@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t; /* ---------------------------------------------------------------------- */ -BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) +BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) { suffix_flag |= Suffix::INTEL; } @@ -70,23 +70,23 @@ void BondFENEIntel::compute(int eflag, int vflag) 
template void BondFENEIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,10 +96,10 @@ void BondFENEIntel::compute(int eflag, int vflag, } } -template -void BondFENEIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void BondFENEIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nbondlist; if (inum == 0) return; @@ -119,32 +119,42 @@ void BondFENEIntel::eval(const int vflag, const int nthreads = tc; acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oebond = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oebond = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } - #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int3_t * _noalias const bondlist = + const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; - 
for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sebond = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = bondlist[n].a; const int i2 = bondlist[n].b; const int type = bondlist[n].t; @@ -166,7 +176,7 @@ void BondFENEIntel::eval(const int vflag, // if r -> r0, then rlogarg < 0.0 which is an error // issue a warning and reset rlogarg = epsilon // if r > 2*r0 something serious is wrong, abort - + if (rlogarg < (flt_t)0.1) { char str[128]; sprintf(str,"FENE bond too long: " BIGINT_FORMAT " " @@ -176,18 +186,18 @@ void BondFENEIntel::eval(const int vflag, if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond"); rlogarg = (flt_t)0.1; } - + flt_t fbond = -k/rlogarg; - + // force from LJ term - + flt_t sr2,sr6; if (rsq < (flt_t)TWO_1_3*sigmasq) { - sr2 = sigmasq * irsq; + sr2 = sigmasq * irsq; sr6 = sr2 * sr2 * sr2; fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq; } - + // energy flt_t ebond; @@ -199,33 +209,48 @@ void BondFENEIntel::eval(const int vflag, // apply force to each of 2 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += delx*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= delx*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= delz*fbond; - } - - if (EVFLAG) { - IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, - delx, dely, delz, oebond, f, NEWTON_BOND, + if (EFLAG || VFLAG) { + #ifdef 
LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, sebond, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, oebond, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oebond += sebond; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oebond; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oebond; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -282,11 +307,11 @@ void BondFENEIntel::pack_force_const(ForceConst &fc, template void BondFENEIntel::ForceConst::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(fc); - + if (nbondtypes > 0) _memory->create(fc,nbondtypes,"bondfeneintel.fc"); } diff --git a/src/USER-INTEL/bond_fene_intel.h b/src/USER-INTEL/bond_fene_intel.h index d64f1e7254..89c3033096 100644 --- a/src/USER-INTEL/bond_fene_intel.h +++ b/src/USER-INTEL/bond_fene_intel.h @@ -45,8 +45,8 @@ class BondFENEIntel : public BondFENE { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp index 51a33b1cc3..beb0ebcdda 100644 --- 
a/src/USER-INTEL/bond_harmonic_intel.cpp +++ b/src/USER-INTEL/bond_harmonic_intel.cpp @@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t; /* ---------------------------------------------------------------------- */ -BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) +BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) { suffix_flag |= Suffix::INTEL; } @@ -70,23 +70,23 @@ void BondHarmonicIntel::compute(int eflag, int vflag) template void BondHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,10 +96,10 @@ void BondHarmonicIntel::compute(int eflag, int vflag, } } -template -void BondHarmonicIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void BondHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nbondlist; if (inum == 0) return; @@ -119,31 +119,42 @@ void BondHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oebond = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oebond = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ 
reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int3_t * _noalias const bondlist = + const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sebond = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = bondlist[n].a; const int i2 = bondlist[n].b; const int type = bondlist[n].t; @@ -167,33 +178,50 @@ void BondHarmonicIntel::eval(const int vflag, if (EFLAG) ebond = rk*dr; // apply force to each of 2 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += delx*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= delx*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= delz*fbond; - } - - if (EVFLAG) { - IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, - delx, dely, delz, oebond, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, sebond, f, + NEWTON_BOND, nlocal, 
sv0, sv1, sv2, sv3, + sv4, sv5); + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, oebond, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oebond += sebond; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oebond; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oebond; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -248,11 +276,11 @@ void BondHarmonicIntel::pack_force_const(ForceConst &fc, template void BondHarmonicIntel::ForceConst::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(fc); - + if (nbondtypes > 0) _memory->create(fc,nbondtypes,"bondharmonicintel.fc"); } diff --git a/src/USER-INTEL/bond_harmonic_intel.h b/src/USER-INTEL/bond_harmonic_intel.h index 0de844cddf..8fc04f432a 100644 --- a/src/USER-INTEL/bond_harmonic_intel.h +++ b/src/USER-INTEL/bond_harmonic_intel.h @@ -45,8 +45,8 @@ class BondHarmonicIntel : public BondHarmonic { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp index c07c226611..715cef4d37 100644 --- a/src/USER-INTEL/dihedral_charmm_intel.cpp +++ b/src/USER-INTEL/dihedral_charmm_intel.cpp @@ -80,8 +80,8 @@ void 
DihedralCharmmIntel::compute(int eflag, int vflag) template void DihedralCharmmIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -93,16 +93,16 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, force->pair->vflag_either = force->pair->vflag_global = 1; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -114,10 +114,10 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, #ifndef LMP_USE_AVXCD_DHC -template -void DihedralCharmmIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void DihedralCharmmIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->ndihedrallist; @@ -140,50 +140,50 @@ void DihedralCharmmIntel::eval(const int vflag, acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5; - if (EVFLAG) { - if (EFLAG) - oevdwl = oecoul = oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ - opv0,opv1,opv2,opv3,opv4,opv5) + 
opv0,opv1,opv2,opv3,opv4,opv5) #endif { + #if defined(LMP_SIMD_COMPILER_TEST) int nfrom, nto, tid; IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + int nfrom, npl, nto, tid; + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; const flt_t qqrd2e = force->qqrd2e; acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5; - if (EVFLAG) { - if (EFLAG) - sevdwl = secoul = sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0; - } + if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0; } #if defined(LMP_SIMD_COMPILER_TEST) #pragma vector aligned #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ - sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) - #endif + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) for (int n = nfrom; n < nto; n++) { + #endif + for (int n = nfrom; n < nto; n += npl) { const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -204,7 +204,7 @@ void DihedralCharmmIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; @@ -244,25 +244,25 @@ void DihedralCharmmIntel::eval(const int vflag, // error check #ifndef LMP_SIMD_COMPILER_TEST if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; + int me = comm->me; - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " 
BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } #endif @@ -279,19 +279,19 @@ void DihedralCharmmIntel::eval(const int vflag, ddf1 = df1 = (flt_t)0.0; for (int i = 0; i < m; i++) { - ddf1 = p*c - df1*s; - df1 = p*s + df1*c; - p = ddf1; + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; } p = p*tcos_shift + df1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift; df1 *= -m; p += (flt_t)1.0; - + if (m == 0) { - p = (flt_t)1.0 + tcos_shift; - df1 = (flt_t)0.0; + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; } const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; @@ -333,14 +333,14 @@ void DihedralCharmmIntel::eval(const int vflag, const flt_t f3y = -sy2 - f4y; const flt_t f3z = -sz2 - f4z; - if (EVFLAG) { - flt_t deng; - if (EFLAG) deng = tk * p; - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - 
vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); + if (EFLAG || VFLAG) { + flt_t deng; + if (EFLAG) deng = tk * p; + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); } @@ -349,15 +349,15 @@ void DihedralCharmmIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } } @@ -372,54 +372,54 @@ void DihedralCharmmIntel::eval(const int vflag, flt_t forcecoul; if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv; else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv); - const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - - fc.ljp[itype][jtype].lj2); + const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - + fc.ljp[itype][jtype].lj2); const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv; if (NEWTON_BOND || i1 < nlocal) { - f1x += delx*fpair; - f1y += dely*fpair; - f1z += delz*fpair; + f1x += delx*fpair; + f1y += dely*fpair; + f1z += delz*fpair; } if (NEWTON_BOND || i4 < nlocal) { - f4x -= delx*fpair; - f4y -= dely*fpair; - f4z -= delz*fpair; + f4x -= delx*fpair; + f4y -= dely*fpair; + f4z -= delz*fpair; } - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_BOND || i1 < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_BOND || i4 < nlocal) - ev_pre += (flt_t)0.5; + if (EFLAG || VFLAG) { + flt_t ev_pre = (flt_t)0; + if (NEWTON_BOND || i1 < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_BOND || i4 < nlocal) + ev_pre += (flt_t)0.5; - if (EFLAG) { - flt_t ecoul, evdwl; - ecoul = tweight * forcecoul; - evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - - 
fc.ljp[itype][jtype].lj4); - secoul += ev_pre * ecoul; - sevdwl += ev_pre * evdwl; - if (eatom) { - evdwl *= (flt_t)0.5; - evdwl += (flt_t)0.5 * ecoul; - if (NEWTON_BOND || i1 < nlocal) - f[i1].w += evdwl; - if (NEWTON_BOND || i4 < nlocal) - f[i4].w += evdwl; - } - } - // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - // delx, dely, delz); - if (vflag) { - spv0 += ev_pre * delx * delx * fpair; - spv1 += ev_pre * dely * dely * fpair; - spv2 += ev_pre * delz * delz * fpair; - spv3 += ev_pre * delx * dely * fpair; - spv4 += ev_pre * delx * delz * fpair; - spv5 += ev_pre * dely * delz * fpair; - } + if (EFLAG) { + flt_t ecoul, evdwl; + ecoul = tweight * forcecoul; + evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - + fc.ljp[itype][jtype].lj4); + secoul += ev_pre * ecoul; + sevdwl += ev_pre * evdwl; + if (eatom) { + evdwl *= (flt_t)0.5; + evdwl += (flt_t)0.5 * ecoul; + if (NEWTON_BOND || i1 < nlocal) + f[i1].w += evdwl; + if (NEWTON_BOND || i4 < nlocal) + f[i4].w += evdwl; + } + } + // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, + // delx, dely, delz); + if (VFLAG && vflag) { + spv0 += ev_pre * delx * delx * fpair; + spv1 += ev_pre * dely * dely * fpair; + spv2 += ev_pre * delz * delz * fpair; + spv3 += ev_pre * delx * dely * fpair; + spv4 += ev_pre * delx * delz * fpair; + spv5 += ev_pre * dely * delz * fpair; + } } // apply force to each of 4 atoms @@ -428,48 +428,44 @@ void DihedralCharmmIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n - if (EVFLAG) { - if (EFLAG) { - oedihedral += sedihedral; - oecoul += secoul; - oevdwl += sevdwl; - } - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - opv0 += spv0; opv1 += spv1; opv2 += spv2; - opv3 
+= spv3; opv4 += spv4; opv5 += spv5; - } + if (EFLAG) { + oedihedral += sedihedral; + oecoul += secoul; + oevdwl += sevdwl; + } + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; + opv0 += spv0; opv1 += spv1; opv2 += spv2; + opv3 += spv3; opv4 += spv4; opv5 += spv5; } } // omp parallel - if (EVFLAG) { - if (EFLAG) { - energy += oedihedral; - force->pair->eng_vdwl += oevdwl; - force->pair->eng_coul += oecoul; - } - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - force->pair->virial[0] += opv0; - force->pair->virial[1] += opv1; - force->pair->virial[2] += opv2; - force->pair->virial[3] += opv3; - force->pair->virial[4] += opv4; - force->pair->virial[5] += opv5; - } + if (EFLAG) { + energy += oedihedral; + force->pair->eng_vdwl += oevdwl; + force->pair->eng_coul += oecoul; + } + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + force->pair->virial[0] += opv0; + force->pair->virial[1] += opv1; + force->pair->virial[2] += opv2; + force->pair->virial[3] += opv3; + force->pair->virial[4] += opv4; + force->pair->virial[5] += opv5; } fix->set_reduce_flag(); @@ -488,10 +484,10 @@ authors for more details. 
------------------------------------------------------------------------- */ -template -void DihedralCharmmIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void DihedralCharmmIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { typedef typename SIMD_type::SIMD_vec SIMD_flt_t; @@ -518,30 +514,28 @@ void DihedralCharmmIntel::eval(const int vflag, acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5; - if (EVFLAG) { - if (EFLAG) - oevdwl = oecoul = oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ - opv0,opv1,opv2,opv3,opv4,opv5) + opv0,opv1,opv2,opv3,opv4,opv5) #endif { - int nfrom, nto, tid; - IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + int nfrom, npl, nto, tid; + IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads, + swidth); FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int * _noalias const dihedrallist = + const int * _noalias const dihedrallist = (int *) neighbor->dihedrallist[0]; const flt_t * _noalias const weight = &(fc.weight[0]); const flt_t * _noalias const x_f = &(x[0].x); @@ -559,36 +553,34 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5; - if (EVFLAG) { - if (EFLAG) { - sevdwl = SIMD_set((acc_t)0.0); - secoul = SIMD_set((acc_t)0.0); - sedihedral = 
SIMD_set((acc_t)0.0); - } - if (vflag) { - sv0 = SIMD_set((acc_t)0.0); - sv1 = SIMD_set((acc_t)0.0); - sv2 = SIMD_set((acc_t)0.0); - sv3 = SIMD_set((acc_t)0.0); - sv4 = SIMD_set((acc_t)0.0); - sv5 = SIMD_set((acc_t)0.0); - spv0 = SIMD_set((acc_t)0.0); - spv1 = SIMD_set((acc_t)0.0); - spv2 = SIMD_set((acc_t)0.0); - spv3 = SIMD_set((acc_t)0.0); - spv4 = SIMD_set((acc_t)0.0); - spv5 = SIMD_set((acc_t)0.0); - } + if (EFLAG) { + sevdwl = SIMD_set((acc_t)0.0); + secoul = SIMD_set((acc_t)0.0); + sedihedral = SIMD_set((acc_t)0.0); + } + if (VFLAG && vflag) { + sv0 = SIMD_set((acc_t)0.0); + sv1 = SIMD_set((acc_t)0.0); + sv2 = SIMD_set((acc_t)0.0); + sv3 = SIMD_set((acc_t)0.0); + sv4 = SIMD_set((acc_t)0.0); + sv5 = SIMD_set((acc_t)0.0); + spv0 = SIMD_set((acc_t)0.0); + spv1 = SIMD_set((acc_t)0.0); + spv2 = SIMD_set((acc_t)0.0); + spv3 = SIMD_set((acc_t)0.0); + spv4 = SIMD_set((acc_t)0.0); + spv5 = SIMD_set((acc_t)0.0); } SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, - 55, 60, 65, 70, 75) + (nfrom * 5); + 55, 60, 65, 70, 75) + (nfrom * 5); const int nto5 = nto * 5; const int nlocals4 = nlocal << 4; const SIMD_int simd_nlocals4 = SIMD_set(nlocals4); const int ntypes = atom->ntypes + 1; - for (int n = nfrom; n < nto; n += swidth) { + for (int n = nfrom; n < nto; n += npl) { SIMD_mask nmask = n_offset < nto5; SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset); const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1); @@ -601,7 +593,7 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset); const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type); type = type << 2; - n_offset = n_offset + swidth * 5; + n_offset = n_offset + npl * 5; // 1st bond @@ -626,7 +618,7 @@ void DihedralCharmmIntel::eval(const int vflag, const SIMD_flt_t vb2zm = z2 - z3; // 3rd bond - + SIMD_flt_t x4, y4, z4; SIMD_int jtype; @@ -672,7 +664,7 @@ void DihedralCharmmIntel::eval(const int vflag, const SIMD_flt_t ptol = 
SIMD_set(PTOLERANCE); const SIMD_flt_t ntol = SIMD_set(MTOLERANCE); if (c > ptol || c < ntol) - if (screen) + if (screen) error->warning(FLERR,"Dihedral problem."); c = SIMD_set(c, c > one, one); @@ -686,14 +678,14 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_flt_t p(one); SIMD_flt_t ddf1(szero); SIMD_flt_t df1(szero); - + const int m_max = SIMD_max(m); for (int i = 0; i < m_max; i++) { - const SIMD_mask my_m = i < m; - ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s); - df1 = SIMD_set(df1, my_m, p*s + df1*c); - p = SIMD_set(p, my_m, ddf1); + const SIMD_mask my_m = i < m; + ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s); + df1 = SIMD_set(df1, my_m, p*s + df1*c); + p = SIMD_set(p, my_m, ddf1); } SIMD_flt_t multf; @@ -702,7 +694,7 @@ void DihedralCharmmIntel::eval(const int vflag, df1 = df1*tcos_shift - ddf1*tsin_shift; df1 = df1 * multf; p = p + one; - + SIMD_mask mzero = (m == SIMD_set((int)0)); p = SIMD_set(p, mzero, one + tcos_shift); df1 = SIMD_set(df1, mzero, szero); @@ -747,41 +739,41 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_flt_t f3z = -sz2 - f4z; SIMD_flt_t qdeng; - if (EVFLAG) { - SIMD_flt_t ev_pre; - if (NEWTON_BOND) ev_pre = one; - else { - ev_pre = szero; - const SIMD_flt_t quarter = SIMD_set((flt_t)0.25); - ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter); - } - SIMD_zero_masked(nmask, ev_pre); - if (EFLAG) { - const SIMD_flt_t deng = tk * p; - sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng); - if (eatom) { - qdeng = deng * SIMD_set((flt_t)0.25); - SIMD_mask newton_mask; - if (NEWTON_BOND) newton_mask = nmask; - if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4); - SIMD_flt_t ieng = qdeng; - SIMD_jeng_update(newton_mask, featom, i2, ieng); - ieng = qdeng; - if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, 
simd_nlocals4); - SIMD_jeng_update(newton_mask, featom, i3, ieng); - } - } - if (vflag) { + if (EFLAG || VFLAG) { + SIMD_flt_t ev_pre; + if (NEWTON_BOND) ev_pre = one; + else { + ev_pre = szero; + const SIMD_flt_t quarter = SIMD_set((flt_t)0.25); + ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter); + } + SIMD_zero_masked(nmask, ev_pre); + if (EFLAG) { + const SIMD_flt_t deng = tk * p; + sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng); + if (eatom) { + qdeng = deng * SIMD_set((flt_t)0.25); + SIMD_mask newton_mask; + if (NEWTON_BOND) newton_mask = nmask; + if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4); + SIMD_flt_t ieng = qdeng; + SIMD_jeng_update(newton_mask, featom, i2, ieng); + ieng = qdeng; + if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4); + SIMD_jeng_update(newton_mask, featom, i3, ieng); + } + } + if (VFLAG && vflag) { sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x)); - sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); - sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); - sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y)); - sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z)); - sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z)); - } + sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); + sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); + sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y)); + sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z)); + sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z)); + } } SIMD_mask newton_mask; @@ -816,28 +808,28 @@ void DihedralCharmmIntel::eval(const int vflag, f4y = f4y - dely 
* fpair; f4z = f4z - delz * fpair; - if (EVFLAG) { - SIMD_flt_t ev_pre; - if (NEWTON_BOND) ev_pre = one; - else { - ev_pre = szero; + if (EFLAG || VFLAG) { + SIMD_flt_t ev_pre; + if (NEWTON_BOND) ev_pre = one; + else { + ev_pre = szero; const SIMD_flt_t half = SIMD_set((flt_t)0.5); ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half); ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half); - } - SIMD_zero_masked(nmask, ev_pre); + } + SIMD_zero_masked(nmask, ev_pre); - if (EFLAG) { - const SIMD_flt_t ecoul = tweight * forcecoul; - const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype); - const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype); - SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4); - secoul = SIMD_ev_add(secoul, ev_pre * ecoul); - sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl); - if (eatom) { - const SIMD_flt_t half = SIMD_set((flt_t)0.5); - evdwl = evdwl * half; - evdwl = evdwl + half * ecoul + qdeng; + if (EFLAG) { + const SIMD_flt_t ecoul = tweight * forcecoul; + const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype); + const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype); + SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4); + secoul = SIMD_ev_add(secoul, ev_pre * ecoul); + sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl); + if (eatom) { + const SIMD_flt_t half = SIMD_set((flt_t)0.5); + evdwl = evdwl * half; + evdwl = evdwl + half * ecoul + qdeng; if (NEWTON_BOND) newton_mask = nmask; if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4); @@ -846,16 +838,16 @@ void DihedralCharmmIntel::eval(const int vflag, ieng = evdwl; if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4); SIMD_jeng_update(newton_mask, featom, i4, ieng); - } - } - if (vflag) { + } + } + if (VFLAG && vflag) { spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair); - spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); - spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); - spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair); - 
spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair); - spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair); - } + spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); + spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); + spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair); + spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair); + spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair); + } } if (NEWTON_BOND) newton_mask = nmask; @@ -865,45 +857,41 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z); } // for n - if (EVFLAG) { - if (EFLAG) { - oedihedral += SIMD_sum(sedihedral); - oecoul += SIMD_sum(secoul); - oevdwl += SIMD_sum(sevdwl); - } - if (vflag) { - ov0 += SIMD_sum(sv0); - ov1 += SIMD_sum(sv1); - ov2 += SIMD_sum(sv2); - ov3 += SIMD_sum(sv3); - ov4 += SIMD_sum(sv4); - ov5 += SIMD_sum(sv5); - opv0 += SIMD_sum(spv0); - opv1 += SIMD_sum(spv1); - opv2 += SIMD_sum(spv2); - opv3 += SIMD_sum(spv3); - opv4 += SIMD_sum(spv4); - opv5 += SIMD_sum(spv5); - } + if (EFLAG) { + oedihedral += SIMD_sum(sedihedral); + oecoul += SIMD_sum(secoul); + oevdwl += SIMD_sum(sevdwl); + } + if (VFLAG && vflag) { + ov0 += SIMD_sum(sv0); + ov1 += SIMD_sum(sv1); + ov2 += SIMD_sum(sv2); + ov3 += SIMD_sum(sv3); + ov4 += SIMD_sum(sv4); + ov5 += SIMD_sum(sv5); + opv0 += SIMD_sum(spv0); + opv1 += SIMD_sum(spv1); + opv2 += SIMD_sum(spv2); + opv3 += SIMD_sum(spv3); + opv4 += SIMD_sum(spv4); + opv5 += SIMD_sum(spv5); } } // omp parallel - if (EVFLAG) { - if (EFLAG) { - energy += oedihedral; - force->pair->eng_vdwl += oevdwl; - force->pair->eng_coul += oecoul; - } - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - force->pair->virial[0] += opv0; - force->pair->virial[1] += opv1; - force->pair->virial[2] += opv2; - force->pair->virial[3] += opv3; - force->pair->virial[4] += opv4; - force->pair->virial[5] += opv5; - } + if (EFLAG) { + energy += 
oedihedral; + force->pair->eng_vdwl += oevdwl; + force->pair->eng_coul += oecoul; + } + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + force->pair->virial[0] += opv0; + force->pair->virial[1] += opv1; + force->pair->virial[2] += opv2; + force->pair->virial[3] += opv3; + force->pair->virial[4] += opv4; + force->pair->virial[5] += opv5; } fix->set_reduce_flag(); @@ -945,7 +933,7 @@ void DihedralCharmmIntel::init_style() template void DihedralCharmmIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { const int tp1 = atom->ntypes + 1; @@ -953,12 +941,14 @@ void DihedralCharmmIntel::pack_force_const(ForceConst &fc, fc.set_ntypes(tp1,bp1,memory); buffers->set_ntypes(tp1); - for (int i = 0; i < tp1; i++) { - for (int j = 0; j < tp1; j++) { - fc.ljp[i][j].lj1 = lj14_1[i][j]; - fc.ljp[i][j].lj2 = lj14_2[i][j]; - fc.ljp[i][j].lj3 = lj14_3[i][j]; - fc.ljp[i][j].lj4 = lj14_4[i][j]; + if (weightflag) { + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.ljp[i][j].lj1 = lj14_1[i][j]; + fc.ljp[i][j].lj2 = lj14_2[i][j]; + fc.ljp[i][j].lj3 = lj14_3[i][j]; + fc.ljp[i][j].lj4 = lj14_4[i][j]; + } } } @@ -975,8 +965,8 @@ void DihedralCharmmIntel::pack_force_const(ForceConst &fc, template void DihedralCharmmIntel::ForceConst::set_ntypes(const int npairtypes, - const int nbondtypes, - Memory *memory) { + const int nbondtypes, + Memory *memory) { if (npairtypes != _npairtypes) { if (_npairtypes > 0) _memory->destroy(ljp); @@ -989,7 +979,7 @@ void DihedralCharmmIntel::ForceConst::set_ntypes(const int npairtypes, _memory->destroy(bp); _memory->destroy(weight); } - + if (nbondtypes > 0) { _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight"); diff --git a/src/USER-INTEL/dihedral_charmm_intel.h b/src/USER-INTEL/dihedral_charmm_intel.h index 292faea9f9..d80b32c8ac 100644 --- 
a/src/USER-INTEL/dihedral_charmm_intel.h +++ b/src/USER-INTEL/dihedral_charmm_intel.h @@ -44,8 +44,8 @@ class DihedralCharmmIntel : public DihedralCharmm { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); @@ -58,7 +58,7 @@ class DihedralCharmmIntel : public DihedralCharmm { class ForceConst { public: typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1; - typedef struct { flt_t cos_shift, sin_shift, k; + typedef struct { flt_t cos_shift, sin_shift, k; int multiplicity; } fc_packed3; fc_packed1 **ljp; diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp index 03ab152f49..196b024fa6 100644 --- a/src/USER-INTEL/dihedral_harmonic_intel.cpp +++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp @@ -69,24 +69,24 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag) template void DihedralHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); } else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,10 +96,10 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag, } } -template -void DihedralHarmonicIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void DihedralHarmonicIntel::eval(const 
int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->ndihedrallist; @@ -120,40 +120,42 @@ void DihedralHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if (EFLAG) - sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } - - for (int n = nfrom; n < nto; n++) { + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -173,7 +175,7 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - 
x[i3].z; @@ -203,27 +205,29 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); // error check + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; + int me = comm->me; - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -238,19 +242,19 @@ void DihedralHarmonicIntel::eval(const int vflag, ddf1 = df1 = (flt_t)0.0; for (int i = 0; i < m; i++) { - ddf1 = p*c - df1*s; - df1 = p*s + df1*c; - p = ddf1; + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; } p = p*tcos_shift + df1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift; df1 *= -m; p += (flt_t)1.0; - + if (m == 0) { - p = (flt_t)1.0 + tcos_shift; - df1 = (flt_t)0.0; + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; } 
const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; @@ -292,56 +296,66 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t f3y = -sy2 - f4y; const flt_t f3z = -sz2 - f4z; - if (EVFLAG) { - flt_t deng; - if (EFLAG) deng = tk * p; - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); + if (EFLAG || VFLAG) { + flt_t deng; + if (EFLAG) deng = tk * p; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif } + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n - if (EVFLAG) { - if (EFLAG) oedihedral += sedihedral; - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - } + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && 
vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) energy += oedihedral; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -381,7 +395,7 @@ void DihedralHarmonicIntel::init_style() template void DihedralHarmonicIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { const int bp1 = atom->ndihedraltypes + 1; fc.set_ntypes(bp1,memory); @@ -398,11 +412,11 @@ void DihedralHarmonicIntel::pack_force_const(ForceConst &fc, template void DihedralHarmonicIntel::ForceConst::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(bp); - + if (nbondtypes > 0) _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); } diff --git a/src/USER-INTEL/dihedral_harmonic_intel.h b/src/USER-INTEL/dihedral_harmonic_intel.h index 41e3d20540..0a9cfaa042 100644 --- a/src/USER-INTEL/dihedral_harmonic_intel.h +++ b/src/USER-INTEL/dihedral_harmonic_intel.h @@ -44,8 +44,8 @@ class DihedralHarmonicIntel : public DihedralHarmonic { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); @@ -57,7 +57,7 @@ class DihedralHarmonicIntel : public DihedralHarmonic { template class ForceConst { public: - typedef struct { flt_t cos_shift, sin_shift, k; + typedef struct { flt_t cos_shift, sin_shift, k; int multiplicity; } fc_packed1; fc_packed1 *bp; diff --git 
a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp index bfd5a53956..1abeba1d5e 100644 --- a/src/USER-INTEL/dihedral_opls_intel.cpp +++ b/src/USER-INTEL/dihedral_opls_intel.cpp @@ -73,24 +73,24 @@ void DihedralOPLSIntel::compute(int eflag, int vflag) template void DihedralOPLSIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); } else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -100,10 +100,10 @@ void DihedralOPLSIntel::compute(int eflag, int vflag, } } -template -void DihedralOPLSIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void DihedralOPLSIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->ndihedrallist; @@ -124,40 +124,42 @@ void DihedralOPLSIntel::eval(const int vflag, const int nthreads = tc; acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + 
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if (EFLAG) - sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } - - for (int n = nfrom; n < nto; n++) { + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -177,7 +179,7 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; @@ -207,7 +209,7 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3; flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm; - const flt_t r12c1 = rb1 * rb2; + const flt_t r12c1 = rb1 * rb2; const flt_t c1mag = ctmp * r12c1; ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z; @@ -236,27 +238,29 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3; // error check + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; + int me = comm->me; - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - 
atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -279,14 +283,14 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim; flt_t p, pd; - p = fc.bp[type].k1*((flt_t)1.0 + c) + - fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + - fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + - fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; - pd = fc.bp[type].k1 - - (flt_t)2.0 * fc.bp[type].k2 * sin_2phim + - (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - - (flt_t)4.0 * fc.bp[type].k4 * sin_4phim; + p = fc.bp[type].k1*((flt_t)1.0 + c) + + fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + + fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + + fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; + pd = fc.bp[type].k1 - + (flt_t)2.0 * fc.bp[type].k2 * sin_2phim + + (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - + (flt_t)4.0 * fc.bp[type].k4 * sin_4phim; flt_t edihed; if (EFLAG) edihed = p; @@ -321,54 +325,64 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t f3y = sy2 - f4y; 
const flt_t f3z = sz2 - f4z; - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif } + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n - if (EVFLAG) { - if (EFLAG) oedihedral += sedihedral; - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - } + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) energy += oedihedral; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - 
virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -408,7 +422,7 @@ void DihedralOPLSIntel::init_style() template void DihedralOPLSIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { const int bp1 = atom->ndihedraltypes + 1; fc.set_ntypes(bp1,memory); @@ -425,11 +439,11 @@ void DihedralOPLSIntel::pack_force_const(ForceConst &fc, template void DihedralOPLSIntel::ForceConst::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(bp); - + if (nbondtypes > 0) _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); } diff --git a/src/USER-INTEL/dihedral_opls_intel.h b/src/USER-INTEL/dihedral_opls_intel.h index ea0930f4b8..1080bfa6c3 100644 --- a/src/USER-INTEL/dihedral_opls_intel.h +++ b/src/USER-INTEL/dihedral_opls_intel.h @@ -44,8 +44,8 @@ class DihedralOPLSIntel : public DihedralOPLS { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index edd33eb72b..b06f76c90d 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) int ncops = force->inumeric(FLERR,arg[3]); _nbor_pack_width = 1; + _three_body_neighbor = 0; _precision_mode = PREC_MODE_MIXED; _offload_balance = -1.0; @@ -178,7 +179,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _real_space_comm = 
MPI_COMM_WORLD; if (no_affinity == 0) if (set_host_affinity(nomp) != 0) - error->all(FLERR,"Could not set host affinity for offload tasks"); + error->all(FLERR,"Could not set host affinity for offload tasks"); } int max_offload_threads = 0, offload_cores = 0; @@ -263,7 +264,7 @@ FixIntel::~FixIntel() double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) } @@ -319,19 +320,25 @@ void FixIntel::init() if (strstr(hybrid->keywords[i], "/intel") != NULL) nstyles++; else - force->pair->no_virial_fdotr_compute = 1; + force->pair->no_virial_fdotr_compute = 1; } if (nstyles > 1) error->all(FLERR, - "Currently, cannot use more than one intel style with hybrid."); + "Currently, cannot use more than one intel style with hybrid."); check_neighbor_intel(); - if (_precision_mode == PREC_MODE_SINGLE) + int off_mode = 0; + if (_offload_balance != 0.0) off_mode = 1; + if (_precision_mode == PREC_MODE_SINGLE) { _single_buffers->zero_ev(); - else if (_precision_mode == PREC_MODE_MIXED) + _single_buffers->grow_ncache(off_mode,_nthreads); + } else if (_precision_mode == PREC_MODE_MIXED) { _mixed_buffers->zero_ev(); - else + _mixed_buffers->grow_ncache(off_mode,_nthreads); + } else { _double_buffers->zero_ev(); + _double_buffers->grow_ncache(off_mode,_nthreads); + } _need_reduce = 0; } @@ -342,13 +349,13 @@ void FixIntel::setup(int vflag) { if (neighbor->style != BIN) error->all(FLERR, - "Currently, neighbor style BIN must be used with Intel package."); + "Currently, neighbor style BIN must be used with Intel package."); if (neighbor->exclude_setting() != 0) error->all(FLERR, - "Currently, cannot use neigh_modify exclude with Intel package."); + "Currently, cannot use neigh_modify exclude with Intel package."); if (vflag_atom) error->all(FLERR, - "Cannot 
currently get per-atom virials with Intel package."); + "Cannot currently get per-atom virials with Intel package."); #ifdef _LMP_INTEL_OFFLOAD post_force(vflag); #endif @@ -367,8 +374,6 @@ void FixIntel::pair_init_check(const bool cdmessage) { #ifdef INTEL_VMASK atom->sortfreq = 1; - if (neighbor->binsizeflag && atom->userbinsize <= 0.0) - atom->userbinsize = neighbor->binsize_user; #endif _nbor_pack_width = 1; @@ -376,9 +381,8 @@ void FixIntel::pair_init_check(const bool cdmessage) #ifdef _LMP_INTEL_OFFLOAD if (_offload_balance != 0.0) atom->sortfreq = 1; - if (force->newton_pair == 0) - _offload_noghost = 0; - else if (_offload_ghost == 0) + _offload_noghost = 0; + if (force->newton_pair && _offload_ghost == 0) _offload_noghost = 1; set_offload_affinity(); @@ -388,7 +392,7 @@ void FixIntel::pair_init_check(const bool cdmessage) double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ in(overflow:length(5) alloc_if(1) free_if(0)) @@ -403,7 +407,7 @@ void FixIntel::pair_init_check(const bool cdmessage) error->warning(FLERR, "Unknown Intel Compiler Version\n"); #else if (__INTEL_COMPILER_BUILD_DATE != 20131008 && - __INTEL_COMPILER_BUILD_DATE < 20141023) + __INTEL_COMPILER_BUILD_DATE < 20141023) error->warning(FLERR, "Unsupported Intel Compiler."); #endif #if !defined(__INTEL_COMPILER) @@ -434,24 +438,24 @@ void FixIntel::pair_init_check(const bool cdmessage) if (comm->me == 0) { if (screen) { fprintf(screen, - "----------------------------------------------------------\n"); + "----------------------------------------------------------\n"); if (_offload_balance != 0.0) { fprintf(screen,"Using Intel Coprocessor with %d threads per core, ", - _offload_tpc); + _offload_tpc); fprintf(screen,"%d threads per task\n",_offload_threads); } else { - 
fprintf(screen,"Using Intel Package without Coprocessor.\n"); + fprintf(screen,"Using Intel Package without Coprocessor.\n"); } fprintf(screen,"Precision: %s\n",kmode); if (cdmessage) { - #ifdef LMP_USE_AVXCD - fprintf(screen,"AVX512 CD Optimizations: Enabled\n"); - #else - fprintf(screen,"AVX512 CD Optimizations: Disabled\n"); - #endif + #ifdef LMP_USE_AVXCD + fprintf(screen,"AVX512 CD Optimizations: Enabled\n"); + #else + fprintf(screen,"AVX512 CD Optimizations: Disabled\n"); + #endif } fprintf(screen, - "----------------------------------------------------------\n"); + "----------------------------------------------------------\n"); } } } @@ -460,7 +464,7 @@ void FixIntel::pair_init_check(const bool cdmessage) void FixIntel::bond_init_check() { - if (_offload_balance != 0.0 && atom->molecular && + if (_offload_balance != 0.0 && atom->molecular && force->newton_pair != force->newton_bond) error->all(FLERR, "USER-INTEL package requires same setting for newton bond and non-bond."); @@ -535,24 +539,24 @@ void FixIntel::pre_reverse(int eflag, int vflag) { if (_force_array_m != 0) { if (_need_reduce) { - reduce_results(_force_array_m); + reduce_results(&_force_array_m[0].x); _need_reduce = 0; } - add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0); + add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0); _force_array_m = 0; } else if (_force_array_d != 0) { if (_need_reduce) { - reduce_results(_force_array_d); + reduce_results(&_force_array_d[0].x); _need_reduce = 0; } - add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0); + add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0); _force_array_d = 0; } else if (_force_array_s != 0) { if (_need_reduce) { - reduce_results(_force_array_s); + reduce_results(&_force_array_s[0].x); _need_reduce = 0; } - add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0); + add_results(_force_array_s, _ev_array_s, _results_eatom, 
_results_vatom,0); _force_array_s = 0; } @@ -563,47 +567,56 @@ void FixIntel::pre_reverse(int eflag, int vflag) /* ---------------------------------------------------------------------- */ -template -void FixIntel::reduce_results(ft * _noalias const f_start) +template +void FixIntel::reduce_results(acc_t * _noalias const f_scalar) { int o_range, f_stride; if (force->newton_pair) o_range = atom->nlocal + atom->nghost; - else + else o_range = atom->nlocal; - IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque); + IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque); - #if defined(_OPENMP) - #pragma omp parallel default(none) shared(o_range, f_stride) - #endif - { - int iifrom, iito, tid; - IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, - sizeof(ft)); + o_range *= 4; + const int f_stride4 = f_stride * 4; - int t_off = f_stride; - if (_results_eatom) { - for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector nontemporal") - _use_simd_pragma("novector") - for (int n = iifrom; n < iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - f_start[n].w += f_start[n + t_off].w; - } - t_off += f_stride; - } + if (_nthreads <= INTEL_HTHREADS) { + acc_t *f_scalar2 = f_scalar + f_stride4; + if (_nthreads == 4) { + acc_t *f_scalar3 = f_scalar2 + f_stride4; + acc_t *f_scalar4 = f_scalar3 + f_stride4; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; + } else if (_nthreads == 2) { + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n]; } else { + acc_t *f_scalar3 = f_scalar2 + f_stride4; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; + } + } else { + #if 
defined(_OPENMP) + #pragma omp parallel + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, + sizeof(acc_t)); + + acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector nontemporal") - _use_simd_pragma("novector") - for (int n = iifrom; n < iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - } - t_off += f_stride; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = iifrom; n < iito; n++) + f_scalar[n] += f_scalar2[n]; + f_scalar2 += f_stride4; } } } @@ -635,46 +648,65 @@ template void FixIntel::add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, const int eatom, const int vatom, - const int offload) { + const int offload) { start_watch(TIME_PACK); int f_length; #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { if (offload) { - add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); if (force->newton_pair) { - const acc_t * _noalias const enull = 0; - int offset = _offload_nlocal; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, vatom, - _offload_min_ghost, _offload_nghost); - } + add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); + const acc_t * _noalias const enull = 0; + int offset = _offload_nlocal; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, vatom, + _offload_min_ghost, _offload_nghost); + } else + add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair()); } else { - add_oresults(f_in, ev_global, eatom, vatom, - _host_min_local, _host_used_local); if (force->newton_pair) { - const acc_t * _noalias const enull = 0; - int offset = _host_used_local; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, - vatom, _host_min_ghost, _host_used_ghost); + add_oresults(f_in, ev_global, eatom, vatom, + 
_host_min_local, _host_used_local); + const acc_t * _noalias const enull = 0; + int offset = _host_used_local; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, + vatom, _host_min_ghost, _host_used_ghost); + } else { + int start = host_start_pair(); + add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start); } } stop_watch(TIME_PACK); return; } - if (force->newton_pair && (_offload_noghost == 0 || offload == 0)) - f_length = atom->nlocal + atom->nghost; - else - f_length = atom->nlocal; + int start; + if (offload) { + start = 0; + if (force->newton_pair) { + if (_offload_noghost == 0) + f_length = atom->nlocal + atom->nghost; + else + f_length = atom->nlocal; + } else + f_length = offload_end_pair(); + } else { + if (force->newton_pair) { + start = 0; + f_length = atom->nlocal + atom->nghost; + } else { + start = host_start_pair(); + f_length = atom->nlocal - start; + } + } + add_oresults(f_in, ev_global, eatom, vatom, start, f_length); #else if (force->newton_pair) f_length = atom->nlocal + atom->nghost; else f_length = atom->nlocal; - #endif - add_oresults(f_in, ev_global, eatom, vatom, 0, f_length); + #endif stop_watch(TIME_PACK); } @@ -682,9 +714,9 @@ void FixIntel::add_results(const ft * _noalias const f_in, template void FixIntel::add_oresults(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int out_offset, const int nall) { + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall) { lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; if (atom->torque) { if (f_in[1].w) @@ -695,8 +727,11 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, "Sphere particles not yet supported for gayberne/intel"); } + int packthreads; + if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) + #pragma 
omp parallel if(packthreads > 1) #endif { #if defined(_OPENMP) @@ -705,16 +740,16 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, const int tid = 0; #endif int ifrom, ito; - IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); + IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t)); if (atom->torque) { int ii = ifrom * 2; lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + - out_offset; + out_offset; if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; f[i].y += f_in[ii].y; @@ -727,8 +762,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; f[i].y += f_in[ii].y; @@ -741,10 +776,10 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; f[i].y += f_in[i].y; @@ -753,8 +788,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; f[i].y += f_in[i].y; @@ -833,6 +868,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in, _offload_nlocal; } + if (atom->torque) + if (f_in[1].w < 0.0) + error->all(FLERR, "Bad matrix inversion in mldivide3"); + add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); 
+ // Load balance? if (_offload_balance < 0.0) { if (neighbor->ago == 0) @@ -860,10 +900,6 @@ void FixIntel::add_off_results(const ft * _noalias const f_in, stop_watch(TIME_IMBALANCE); #endif acc_timers(); - if (atom->torque) - if (f_in[1].w < 0.0) - error->all(FLERR, "Bad matrix inversion in mldivide3"); - add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); } /* ---------------------------------------------------------------------- */ @@ -895,7 +931,7 @@ void FixIntel::output_timing_data() { balance_out[0] = _balance_pair; balance_out[1] = _balance_neighbor; MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM, - 0, _real_space_comm); + 0, _real_space_comm); balance_in[0] /= size; balance_in[1] /= size; @@ -922,25 +958,25 @@ void FixIntel::output_timing_data() { balance_in[1]); fprintf(_tscreen, " Offload Pair Balance %f\n", balance_in[0]); - fprintf(_tscreen, " Offload Ghost Atoms "); - if (_offload_noghost) fprintf(_tscreen,"No\n"); - else fprintf(_tscreen,"Yes\n"); + fprintf(_tscreen, " Offload Ghost Atoms "); + if (_offload_noghost) fprintf(_tscreen,"No\n"); + else fprintf(_tscreen,"Yes\n"); #ifdef TIME_BALANCE fprintf(_tscreen, " Offload Imbalance Seconds %f\n", timers[TIME_IMBALANCE]); - fprintf(_tscreen, " Offload Min/Max Seconds "); - for (int i = 0; i < NUM_ITIMERS; i++) - fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); - fprintf(_tscreen, "\n"); + fprintf(_tscreen, " Offload Min/Max Seconds "); + for (int i = 0; i < NUM_ITIMERS; i++) + fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); + fprintf(_tscreen, "\n"); #endif - double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] + - timers[TIME_OFFLOAD_WAIT]; - double ct = timers[TIME_OFFLOAD_NEIGHBOR] + - timers[TIME_OFFLOAD_PAIR]; - double tt = MAX(ht,ct); - if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) - error->warning(FLERR, - "Leaving a core free can improve performance for offload"); + double ht = timers[TIME_HOST_NEIGHBOR] 
+ timers[TIME_HOST_PAIR] + + timers[TIME_OFFLOAD_WAIT]; + double ct = timers[TIME_OFFLOAD_NEIGHBOR] + + timers[TIME_OFFLOAD_PAIR]; + double tt = MAX(ht,ct); + if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) + error->warning(FLERR, + "Leaving a core free can improve performance for offload"); } fprintf(_tscreen, "------------------------------------------------\n"); } @@ -963,14 +999,14 @@ int FixIntel::get_ppn(int &node_rank) { node_name[name_length] = '\0'; char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs]; MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names, - MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); + MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); int ppn = 0; node_rank = 0; for (int i = 0; i < nprocs; i++) { if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) { ppn++; if (i < rank) - node_rank++; + node_rank++; } } @@ -1032,19 +1068,19 @@ void FixIntel::set_offload_affinity() kmp_create_affinity_mask(&mask); int proc = offload_threads * node_rank + tnum; #ifdef __AVX512F__ - proc = (proc / offload_tpc) + (proc % offload_tpc) * - ((offload_cores) / 4); + proc = (proc / offload_tpc) + (proc % offload_tpc) * + ((offload_cores) / 4); proc += 68; #else if (offload_affinity_balanced) - proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; + proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; else - proc += (proc / 4) * (4 - offload_tpc) + 1; + proc += (proc / 4) * (4 - offload_tpc) + 1; #endif kmp_set_affinity_mask_proc(proc, &mask); if (kmp_set_affinity(&mask) != 0) - printf("Could not set affinity on rank %d thread %d to %d\n", - node_rank, tnum, proc); + printf("Could not set affinity on rank %d thread %d to %d\n", + node_rank, tnum, proc); } } @@ -1074,7 +1110,7 @@ int FixIntel::set_host_affinity(const int nomp) char cmd[512]; char readbuf[INTEL_MAX_HOST_CORE_COUNT*5]; sprintf(cmd, "lscpu -p | grep -v '#' |" - "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"); + "sort -t, -k 3,3n 
-k 2,2n | awk -F, '{print $1}'"); p = popen(cmd, "r"); if (p == NULL) return -1; ncores = 0; @@ -1111,7 +1147,7 @@ int FixIntel::set_host_affinity(const int nomp) if (subscription > ncores) { if (rank == 0) error->warning(FLERR, - "More MPI tasks/OpenMP threads than available cores"); + "More MPI tasks/OpenMP threads than available cores"); return 0; } if (subscription == ncores) @@ -1137,10 +1173,10 @@ int FixIntel::set_host_affinity(const int nomp) int first = coi_cores + node_rank * mpi_cores; CPU_ZERO(&cpuset); for (int i = first; i < first + mpi_cores; i++) - CPU_SET(proc_list[i], &cpuset); + CPU_SET(proc_list[i], &cpuset); if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) { - fail = 1; - break; + fail = 1; + break; } plwp++; } @@ -1153,13 +1189,13 @@ int FixIntel::set_host_affinity(const int nomp) buf1 = (float*) malloc(sizeof(float)*pragma_size); #pragma offload target (mic:0) mandatory \ - in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ + in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ signal(&sig1) { buf1[0] = 0.0; } #pragma offload_wait target(mic:0) wait(&sig1) #pragma offload target (mic:0) mandatory \ - out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ + out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ signal(&sig2) { buf1[0] = 1.0; } #pragma offload_wait target(mic:0) wait(&sig2) @@ -1175,11 +1211,11 @@ int FixIntel::set_host_affinity(const int nomp) CPU_ZERO(&cpuset); for(int i=0; i 0) return 1; return 0; } - inline void set_reduce_flag() { _need_reduce = 1; } + inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; } inline int lrt() { if (force->kspace_match("pppm/intel", 0)) return _lrt; else return 0; } + inline int pppm_table() { + if (force->kspace_match("pppm/intel", 0) || + force->kspace_match("pppm/disp/intel",0)) + return INTEL_P3M_TABLE; + else return 0; + } + protected: IntelBuffers *_single_buffers; IntelBuffers *_mixed_buffers; IntelBuffers *_double_buffers; - int _precision_mode, _nthreads, 
_nbor_pack_width; + int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor; public: inline int* get_overflow_flag() { return _overflow_flag; } @@ -94,17 +103,17 @@ class FixIntel : public Fix { inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void add_result_array(IntelBuffers::vec3_acc_t *f_in, float *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void get_buffern(const int offload, int &nlocal, int &nall, - int &minlocal); + int &minlocal); #ifdef _LMP_INTEL_OFFLOAD void post_force(int vflag); @@ -204,13 +213,13 @@ class FixIntel : public Fix { inline void add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, const int eatom, const int vatom, - const int offload); + const int offload); template inline void add_oresults(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int out_offset, const int nall); + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall); int _offload_affinity_balanced, _offload_threads, _offload_tpc; #ifdef _LMP_INTEL_OFFLOAD @@ -226,22 +235,25 @@ class FixIntel : public Fix { /* ---------------------------------------------------------------------- */ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, - int &minlocal) { + int &minlocal) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { if (offload) { if (neighbor->ago != 0) { - nlocal = _offload_nlocal; - nall = _offload_nall; + nlocal = _offload_nlocal; + nall = _offload_nall; } else { - nlocal = 
atom->nlocal; - nall = nlocal + atom->nghost; + nlocal = atom->nlocal; + nall = nlocal + atom->nghost; } minlocal = 0; } else { nlocal = atom->nlocal; nall = _host_nall; - minlocal = _host_min_local; + if (force->newton) + minlocal = _host_min_local; + else + minlocal = host_start_pair(); } return; } @@ -259,7 +271,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom, const int vatom, - const int rflag) { + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -275,7 +287,7 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) @@ -287,7 +299,7 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom, const int vatom, - const int rflag) { + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -303,7 +315,7 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) @@ -331,7 +343,7 @@ void FixIntel::add_result_array(IntelBuffers::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) @@ -349,12 +361,12 @@ int FixIntel::offload_end_neighbor() { if 
(atom->nlocal < 2) error->one(FLERR,"Too few atoms for load balancing offload"); double granularity = 1.0 / atom->nlocal; - if (_balance_neighbor < granularity) + if (_balance_neighbor < granularity) _balance_neighbor = granularity + 1e-10; - else if (_balance_neighbor > 1.0 - granularity) + else if (_balance_neighbor > 1.0 - granularity) _balance_neighbor = 1.0 - granularity + 1e-10; } - return _balance_neighbor * atom->nlocal; + return _balance_neighbor * atom->nlocal; } int FixIntel::offload_end_pair() { @@ -505,7 +517,7 @@ The newton setting must be the same for both pairwise and bonded forces. E: Intel styles for bond/angle/dihedral/improper require intel pair style." -You cannot use the USER-INTEL package for bond calculations without a +You cannot use the USER-INTEL package for bond calculations without a USER-INTEL supported pair style. E: Intel styles for kspace require intel pair style. diff --git a/src/USER-INTEL/fix_nh_intel.cpp b/src/USER-INTEL/fix_nh_intel.cpp index 3f76e53c1f..6e44b38ef1 100644 --- a/src/USER-INTEL/fix_nh_intel.cpp +++ b/src/USER-INTEL/fix_nh_intel.cpp @@ -45,7 +45,7 @@ typedef struct { double x,y,z; } dbl3_t; NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion ---------------------------------------------------------------------- */ -FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : +FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : FixNH(lmp, narg, arg) { _dtfm = 0; @@ -118,12 +118,12 @@ void FixNHIntel::remap() #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { - const double d0 = x[i].x - b0; - const double d1 = x[i].y - b1; - const double d2 = x[i].z - b2; - x[i].x = hi0*d0 + hi5*d1 + hi4*d2; - x[i].y = hi1*d1 + hi3*d2; - x[i].z = hi2*d2; + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; } } } @@ -294,9 +294,9 @@ void FixNHIntel::remap() 
#endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { - x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; - x[i].y = h1*x[i].y + h3*x[i].z + nb1; - x[i].z = h2*x[i].z + nb2; + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; } } } @@ -318,7 +318,7 @@ void FixNHIntel::reset_dt() dto = dthalf; // If using respa, then remap is performed in innermost level - + if (strstr(update->integrate_style,"respa")) dto = 0.5*step_respa[0]; @@ -329,7 +329,7 @@ void FixNHIntel::reset_dt() tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -345,9 +345,9 @@ void FixNHIntel::reset_dt() const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } } else { const double * const mass = atom->mass; @@ -364,29 +364,29 @@ void FixNHIntel::reset_dt() const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } else { const double * const mass = atom->mass; const int * const type = atom->type; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; + 
if (mask[i] & groupbit) { + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } } } @@ -431,9 +431,9 @@ void FixNHIntel::nh_v_press() #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - v[i].x *= f0; - v[i].y *= f1; - v[i].z *= f2; + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; } } } @@ -506,7 +506,7 @@ void FixNHIntel::nh_v_temp() #pragma simd #endif for (int i = 0; i < _nlocal3; i++) - v[i] *= factor_eta; + v[i] *= factor_eta; } else { #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -514,12 +514,12 @@ void FixNHIntel::nh_v_temp() #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) - v[i] *= factor_eta; + v[i] *= factor_eta; } } } -double FixNHIntel::memory_usage() +double FixNHIntel::memory_usage() { return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); } diff --git a/src/USER-INTEL/fix_nh_intel.h b/src/USER-INTEL/fix_nh_intel.h index 32ed6c8534..cc6ba8c481 100644 --- a/src/USER-INTEL/fix_nh_intel.h +++ b/src/USER-INTEL/fix_nh_intel.h @@ -35,7 +35,7 @@ class FixNHIntel : public FixNH { int _nlocal3, _nlocal_max; virtual void remap(); - virtual void nve_x(); + virtual void nve_x(); virtual void nve_v(); virtual void nh_v_press(); virtual void nh_v_temp(); diff --git a/src/USER-INTEL/fix_nve_asphere_intel.cpp b/src/USER-INTEL/fix_nve_asphere_intel.cpp index 6563165454..8ad63f7326 100644 --- a/src/USER-INTEL/fix_nve_asphere_intel.cpp +++ b/src/USER-INTEL/fix_nve_asphere_intel.cpp @@ -36,7 +36,7 @@ using namespace FixConst; /* ---------------------------------------------------------------------- */ FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) : - FixNVE(lmp, narg, arg) + FixNVE(lmp, narg, arg) { _dtfm = 0; _nlocal3 = 0; @@ -129,9 +129,9 @@ void FixNVEAsphereIntel::initial_integrate(int vflag) 
#endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - double *quat = bonus[ellipsoid[i]].quat; - ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], - _inertia1[i], _inertia2[i]); + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], + _inertia1[i], _inertia2[i]); } } } @@ -168,7 +168,7 @@ void FixNVEAsphereIntel::reset_dt() { dtf = 0.5 * update->dt * force->ftm2v; const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -211,27 +211,27 @@ void FixNVEAsphereIntel::reset_dt() { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - double *shape = bonus[ellipsoid[i]].shape; - double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia0[i] = idot; - idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia1[i] = idot; - idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia2[i] = idot; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; } else { _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; } } } } -double FixNVEAsphereIntel::memory_usage() +double 
FixNVEAsphereIntel::memory_usage() { return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); } diff --git a/src/USER-INTEL/fix_nve_intel.cpp b/src/USER-INTEL/fix_nve_intel.cpp index 3fb290b3ab..c0f6da06ae 100644 --- a/src/USER-INTEL/fix_nve_intel.cpp +++ b/src/USER-INTEL/fix_nve_intel.cpp @@ -29,7 +29,7 @@ using namespace FixConst; /* ---------------------------------------------------------------------- */ FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) : - FixNVE(lmp, narg, arg) + FixNVE(lmp, narg, arg) { _dtfm = 0; _nlocal3 = 0; @@ -91,7 +91,7 @@ void FixNVEIntel::initial_integrate(int vflag) for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) { v[i] += _dtfm[i] * f[i]; - x[i] += dtv * v[i]; + x[i] += dtv * v[i]; } } } @@ -130,7 +130,7 @@ void FixNVEIntel::reset_dt() { dtf = 0.5 * update->dt * force->ftm2v; const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -146,9 +146,9 @@ void FixNVEIntel::reset_dt() { const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } } else { const double * const mass = atom->mass; @@ -165,34 +165,34 @@ void FixNVEIntel::reset_dt() { const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } else { const double * const mass = atom->mass; const int * const type = atom->type; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } } } -double FixNVEIntel::memory_usage() +double FixNVEIntel::memory_usage() { return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double); } diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp index 0fb02420b9..dc9765d913 100644 --- a/src/USER-INTEL/improper_cvff_intel.cpp +++ b/src/USER-INTEL/improper_cvff_intel.cpp @@ -42,7 +42,7 @@ typedef struct { int a,b,c,d,t; } int5_t; /* ---------------------------------------------------------------------- */ 
-ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : +ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : ImproperCvff(lmp) { suffix_flag |= Suffix::INTEL; @@ -80,23 +80,23 @@ void ImproperCvffIntel::compute(int eflag, int vflag) template void ImproperCvffIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -108,10 +108,10 @@ void ImproperCvffIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template -void ImproperCvffIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void ImproperCvffIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nimproperlist; if (inum == 0) return; @@ -131,12 +131,9 @@ void ImproperCvffIntel::eval(const int vflag, const int nthreads = tc; acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeimproper = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeimproper = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -145,17 +142,31 @@ void ImproperCvffIntel::eval(const int vflag, reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + 
IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const improperlist = + const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seimproper = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) for (int n = nfrom; n < nto; n++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = improperlist[n].a; const int i2 = improperlist[n].b; const int i3 = improperlist[n].c; @@ -216,28 +227,29 @@ void ImproperCvffIntel::eval(const int vflag, flt_t c = (c0 + c1mag*c2mag) * s12; // error check - + #ifndef LMP_INTEL_USE_SIMDOFF_FIX if (c > PTOLERANCE || c < MTOLERANCE) { int me; - MPI_Comm_rank(world,&me); - if (screen) { + MPI_Comm_rank(world,&me); + if (screen) { char str[128]; - sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, me,update->ntimestep, atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", + fprintf(screen," 2nd atom: %d %g %g %g\n", me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", + fprintf(screen," 3rd atom: %d %g %g %g\n", me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", + fprintf(screen," 4th atom: %d %g %g %g\n", me,x[i4].x,x[i4].y,x[i4].z); } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -250,36 
+262,41 @@ void ImproperCvffIntel::eval(const int vflag, const int m = fc.fc[type].multiplicity; flt_t p, pd; - if (m == 2) { - p = (flt_t)2.0*c*c; - pd = (flt_t)2.0*c; - } else if (m == 3) { - const flt_t rc2 = c*c; - p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; - pd = (flt_t)6.0*rc2 - (flt_t)1.5; - } else if (m == 4) { - const flt_t rc2 = c*c; - p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; - pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; - } else if (m == 6) { - const flt_t rc2 = c*c; - p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; - pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; - } else if (m == 1) { - p = c + (flt_t)1.0; - pd = (flt_t)0.5; - } else if (m == 5) { - const flt_t rc2 = c*c; - p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; - pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; - } else if (m == 0) { - p = (flt_t)2.0; - pd = (flt_t)0.0; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + #pragma simdoff + #endif + { + if (m == 2) { + p = (flt_t)2.0*c*c; + pd = (flt_t)2.0*c; + } else if (m == 3) { + const flt_t rc2 = c*c; + p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; + pd = (flt_t)6.0*rc2 - (flt_t)1.5; + } else if (m == 4) { + const flt_t rc2 = c*c; + p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; + pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; + } else if (m == 6) { + const flt_t rc2 = c*c; + p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; + pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; + } else if (m == 1) { + p = c + (flt_t)1.0; + pd = (flt_t)0.5; + } else if (m == 5) { + const flt_t rc2 = c*c; + p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; + pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; + } else if (m == 0) { + p = (flt_t)2.0; + pd = (flt_t)0.0; + } } if (fc.fc[type].sign == -1) { - p = (flt_t)2.0 - p; - pd = -pd; + p = (flt_t)2.0 - p; + pd = -pd; } flt_t eimproper; @@ -317,46 +334,63 @@ void ImproperCvffIntel::eval(const int vflag, // apply force to each of 4 atoms - if (NEWTON_BOND || 
i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; - } - - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n - } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeimproper; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + if (EFLAG) oeimproper += seimproper; + if (VFLAG && vflag) { + ov0 += sv0; ov1 
+= sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif + } // omp parallel + if (EFLAG) energy += oeimproper; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -394,7 +428,7 @@ void ImproperCvffIntel::init_style() template void ImproperCvffIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { const int bp1 = atom->nimpropertypes + 1; fc.set_ntypes(bp1,memory); @@ -410,11 +444,11 @@ void ImproperCvffIntel::pack_force_const(ForceConst &fc, template void ImproperCvffIntel::ForceConst::set_ntypes(const int nimproper, - Memory *memory) { + Memory *memory) { if (nimproper != _nimpropertypes) { if (_nimpropertypes > 0) _memory->destroy(fc); - + if (nimproper > 0) _memory->create(fc,nimproper,"improperharmonicintel.fc"); } diff --git a/src/USER-INTEL/improper_cvff_intel.h b/src/USER-INTEL/improper_cvff_intel.h index 95ccd8f9d2..cb5da25f99 100644 --- a/src/USER-INTEL/improper_cvff_intel.h +++ b/src/USER-INTEL/improper_cvff_intel.h @@ -45,8 +45,8 @@ class ImproperCvffIntel : public ImproperCvff { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp index 071ff548ea..fe0efca5ec 100644 --- a/src/USER-INTEL/improper_harmonic_intel.cpp +++ b/src/USER-INTEL/improper_harmonic_intel.cpp @@ -43,7 +43,7 @@ typedef struct { int a,b,c,d,t; } int5_t; /* ---------------------------------------------------------------------- */ -ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : +ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : 
ImproperHarmonic(lmp) { suffix_flag |= Suffix::INTEL; @@ -81,23 +81,23 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag) template void ImproperHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -109,10 +109,10 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template -void ImproperHarmonicIntel::eval(const int vflag, - IntelBuffers *buffers, - const ForceConst &fc) +template +void ImproperHarmonicIntel::eval(const int vflag, + IntelBuffers *buffers, + const ForceConst &fc) { const int inum = neighbor->nimproperlist; if (inum == 0) return; @@ -132,12 +132,9 @@ void ImproperHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeimproper = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeimproper = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -146,17 +143,31 @@ void ImproperHarmonicIntel::eval(const int vflag, reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * 
_noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const improperlist = + const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seimproper = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) for (int n = nfrom; n < nto; n++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = improperlist[n].a; const int i2 = improperlist[n].b; const int i3 = improperlist[n].c; @@ -207,28 +218,29 @@ void ImproperHarmonicIntel::eval(const int vflag, flt_t c = (c1*c2 + c0) * s12; // error check - + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { int me; - MPI_Comm_rank(world,&me); - if (screen) { + MPI_Comm_rank(world,&me); + if (screen) { char str[128]; - sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, me,update->ntimestep, atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", + fprintf(screen," 2nd atom: %d %g %g %g\n", me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", + fprintf(screen," 3rd atom: %d %g %g %g\n", me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", + fprintf(screen," 4th atom: %d %g %g %g\n", me,x[i4].x,x[i4].y,x[i4].z); } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -278,46 +290,63 @@ void ImproperHarmonicIntel::eval(const int vflag, // apply force to each of 4 
atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } + + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; - } - - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, - vb3z, oeimproper, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n - } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeimproper; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeimproper += seimproper; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += 
sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif + } // omp parallel + if (EFLAG) energy += oeimproper; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -355,7 +384,7 @@ void ImproperHarmonicIntel::init_style() template void ImproperHarmonicIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { const int bp1 = atom->nimpropertypes + 1; fc.set_ntypes(bp1,memory); @@ -370,11 +399,11 @@ void ImproperHarmonicIntel::pack_force_const(ForceConst &fc, template void ImproperHarmonicIntel::ForceConst::set_ntypes(const int nimproper, - Memory *memory) { + Memory *memory) { if (nimproper != _nimpropertypes) { if (_nimpropertypes > 0) _memory->destroy(fc); - + if (nimproper > 0) _memory->create(fc,nimproper,"improperharmonicintel.fc"); } diff --git a/src/USER-INTEL/improper_harmonic_intel.h b/src/USER-INTEL/improper_harmonic_intel.h index 4e38383863..0b759b4e43 100644 --- a/src/USER-INTEL/improper_harmonic_intel.h +++ b/src/USER-INTEL/improper_harmonic_intel.h @@ -45,8 +45,8 @@ class ImproperHarmonicIntel : public ImproperHarmonic { void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); template - void eval(const int vflag, IntelBuffers * buffers, - const ForceConst &fc); + void eval(const int vflag, IntelBuffers * buffers, + const ForceConst &fc); template void pack_force_const(ForceConst &fc, IntelBuffers *buffers); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index c81dffec83..3664bc248b 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -12,6 +12,7 @@ Contributing author: W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ +#include #include "intel_buffers.h" #include "force.h" #include "memory.h" @@ -28,6 +29,7 @@ IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : _ntypes = 0; _off_map_listlocal = 0; _ccachex = 0; + _ncache_alloc = 0; #ifdef _LMP_INTEL_OFFLOAD _separate_buffers = 0; _off_f = 0; @@ -36,6 +38,7 @@ IntelBuffers::IntelBuffers(class LAMMPS *lmp_in) : _off_list_alloc = false; _off_threads = 0; _off_ccache = 0; + _off_ncache = 0; _host_nmax = 0; #endif } @@ -68,8 +71,8 @@ void IntelBuffers::free_buffers() if (ev_global != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:alloc_if(0) free_if(1)) \ - nocopy(f_start:alloc_if(0) free_if(1)) \ - nocopy(ev_global:alloc_if(0) free_if(1)) + nocopy(f_start:alloc_if(0) free_if(1)) \ + nocopy(ev_global:alloc_if(0) free_if(1)) } if (q != 0) { @@ -102,8 +105,8 @@ void IntelBuffers::free_buffers() template void IntelBuffers::_grow(const int nall, const int nlocal, - const int nthreads, - const int offload_end) + const int nthreads, + const int offload_end) { free_buffers(); _buf_size = static_cast(nall) * 1.1 + 1; @@ -111,15 +114,20 @@ void IntelBuffers::_grow(const int nall, const int nlocal, _buf_local_size = _buf_size; else _buf_local_size = static_cast(nlocal) * 1.1 + 1; - if (lmp->atom->torque) - _buf_local_size *= 2; const int f_stride = get_stride(_buf_local_size); lmp->memory->create(_x, _buf_size,"intel_x"); if (lmp->atom->q != NULL) lmp->memory->create(_q, _buf_size, "intel_q"); if (lmp->atom->ellipsoid != NULL) lmp->memory->create(_quat, _buf_size, "intel_quat"); - lmp->memory->create(_f, f_stride * nthreads, "intel_f"); + #ifdef _LMP_INTEL_OFFLOAD + if (lmp->force->newton_pair) + #else + if (lmp->force->newton_pair || lmp->atom->molecular) + #endif + lmp->memory->create(_f, f_stride * nthreads, "intel_f"); + else + lmp->memory->create(_f, f_stride, "intel_f"); #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { @@ -131,7 
+139,10 @@ void IntelBuffers::_grow(const int nall, const int nlocal, } if (offload_end > 0) { - lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f"); + int fm; + if (lmp->force->newton_pair) fm = _off_threads; + else fm = 1; + lmp->memory->create(_off_f, f_stride * fm, "intel_off_f"); const atom_t * const x = get_x(); const flt_t * const q = get_q(); const vec3_acc_t * f_start = get_off_f(); @@ -140,15 +151,15 @@ void IntelBuffers::_grow(const int nall, const int nlocal, if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \ - nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\ - nocopy(ev_global:length(8) alloc_if(1) free_if(0)) + nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ + nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } else { if (x != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \ - nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\ - nocopy(ev_global:length(8) alloc_if(1) free_if(0)) + nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ + nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } if (lmp->atom->ellipsoid != NULL) { @@ -175,7 +186,7 @@ void IntelBuffers::free_nmax() if (tag != 0 && special != 0 && nspecial !=0) { #pragma offload_transfer target(mic:_cop) \ nocopy(tag:alloc_if(0) free_if(1)) \ - nocopy(special,nspecial:alloc_if(0) free_if(1)) + nocopy(special,nspecial:alloc_if(0) free_if(1)) } _off_map_nmax = 0; _host_nmax = 0; @@ -250,7 +261,7 @@ void IntelBuffers::free_list_local() template void IntelBuffers::_grow_list_local(NeighList *list, - const int offload_end) + const int offload_end) { free_list_local(); int size = list->get_maxlocal(); @@ -265,7 +276,7 @@ void IntelBuffers::_grow_list_local(NeighList *list, if (cnumneigh != 0) { #pragma 
offload_transfer target(mic:_cop) \ nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ - nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ + nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) } _off_map_ilist = ilist; @@ -298,14 +309,14 @@ void IntelBuffers::free_nbor_list() template void IntelBuffers::_grow_nbor_list(NeighList *list, const int nlocal, - const int nthreads, - const int offload_end, - const int pack_width) + const int nthreads, + const int offload_end, + const int pack_width) { free_nbor_list(); _list_alloc_atoms = 1.10 * nlocal; int nt = MAX(nthreads, _off_threads); - int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * + int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * get_max_nbors(); lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc"); #ifdef _LMP_INTEL_OFFLOAD @@ -369,8 +380,8 @@ void IntelBuffers::free_ccache() template void IntelBuffers::grow_ccache(const int off_flag, - const int nthreads, - const int width) + const int nthreads, + const int width) { #ifdef _LMP_INTEL_OFFLOAD if (_ccachex && off_flag && _off_ccache == 0) @@ -407,7 +418,7 @@ void IntelBuffers::grow_ccache(const int off_flag, int *ccachej = _ccachej; if (ccachex != NULL && ccachey !=NULL && ccachez != NULL && - ccachew != NULL && ccachei != NULL && ccachej !=NULL) { + ccachew != NULL && ccachei != NULL && ccachej !=NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ @@ -427,6 +438,115 @@ void IntelBuffers::grow_ccache(const int off_flag, /* ---------------------------------------------------------------------- */ +template +void IntelBuffers::free_ncache() +{ + if (_ncache_alloc) { + flt_t *ncachex = _ncachex; + flt_t *ncachey = _ncachey; + flt_t *ncachez = _ncachez; + int *ncachej = _ncachej; + int *ncachejtype = _ncachejtype; + + #ifdef 
_LMP_INTEL_OFFLOAD + if (_off_ncache) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \ + nocopy(ncachejtype:alloc_if(0) free_if(1)) + } + _off_ncache = 0; + #endif + + lmp->memory->destroy(ncachex); + lmp->memory->destroy(ncachey); + lmp->memory->destroy(ncachez); + lmp->memory->destroy(ncachej); + lmp->memory->destroy(ncachejtype); + + _ncache_alloc = 0; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void IntelBuffers::grow_ncache(const int off_flag, + const int nthreads) +{ + const int nsize = get_max_nbors() * 3; + int esize = MIN(sizeof(int), sizeof(flt_t)); + IP_PRE_get_stride(_ncache_stride, nsize, esize, 0); + int nt = MAX(nthreads, _off_threads); + const int vsize = _ncache_stride * nt; + + if (_ncache_alloc) { + if (vsize > _ncache_alloc) + free_ncache(); + #ifdef _LMP_INTEL_OFFLOAD + else if (off_flag && _off_ncache == 0) + free_ncache(); + #endif + else + return; + } + + lmp->memory->create(_ncachex, vsize, "_ncachex"); + lmp->memory->create(_ncachey, vsize, "_ncachey"); + lmp->memory->create(_ncachez, vsize, "_ncachez"); + lmp->memory->create(_ncachej, vsize, "_ncachej"); + lmp->memory->create(_ncachejtype, vsize, "_ncachejtype"); + + _ncache_alloc = vsize; + + #ifdef _LMP_INTEL_OFFLOAD + if (off_flag) { + flt_t *ncachex = _ncachex; + flt_t *ncachey = _ncachey; + flt_t *ncachez = _ncachez; + int *ncachej = _ncachej; + int *ncachejtype = _ncachejtype; + + if (ncachex != NULL && ncachey !=NULL && ncachez != NULL && + ncachej != NULL && ncachejtype != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \ + nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ + nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0)) + } + _off_ncache = 1; + } + #endif +} + +/* ---------------------------------------------------------------------- */ + +#ifndef _LMP_INTEL_OFFLOAD 
+template +void IntelBuffers::fdotr_reduce_l5(const int lf, const int lt, + const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) +{ + IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0, + ov1, ov2, ov3, ov4, ov5); +} +#endif + +/* ---------------------------------------------------------------------- */ + +#ifndef _LMP_INTEL_OFFLOAD +template +void IntelBuffers::fdotr_reduce(const int nall, + const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) +{ + int iifrom, iito, tid; + IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2, + ov0, ov1, ov2, ov3, ov4, ov5); +} +#endif + +/* ---------------------------------------------------------------------- */ + template void IntelBuffers::set_ntypes(const int ntypes) { diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 3462d013a1..135309fe44 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -62,7 +62,7 @@ class IntelBuffers { void free_buffers(); void free_nmax(); - inline void set_bininfo(int *atombin, int *binpacked) + inline void set_bininfo(int *atombin, int *binpacked) { _atombin = atombin; _binpacked = binpacked; } inline void grow(const int nall, const int nlocal, const int nthreads, const int offload_end) { @@ -78,6 +78,7 @@ class IntelBuffers { free_nbor_list(); free_nmax(); free_list_local(); + free_ncache(); } inline void grow_list(NeighList *list, const int nlocal, const int nthreads, @@ -106,6 +107,15 @@ class IntelBuffers { inline acc_t * get_ccachef() { return _ccachef; } #endif + void free_ncache(); + void grow_ncache(const int off_flag, const int nthreads); + inline int ncache_stride() { return _ncache_stride; } + inline flt_t * get_ncachex() { return _ncachex; } + inline flt_t * get_ncachey() { return _ncachey; } + inline flt_t * get_ncachez() { return _ncachez; } + inline int * get_ncachej() { 
return _ncachej; } + inline int * get_ncachejtype() { return _ncachejtype; } + inline int get_max_nbors() { int mn = lmp->neighbor->oneatom * sizeof(int) / (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN); @@ -116,7 +126,7 @@ class IntelBuffers { inline void grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, const int offload_end, - const int pack_width) { + const int pack_width) { if (nlocal > _list_alloc_atoms) _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } @@ -155,7 +165,7 @@ class IntelBuffers { inline int get_off_threads() { return _off_threads; } #ifdef _LMP_INTEL_OFFLOAD inline void set_off_params(const int n, const int cop, - const int separate_buffers) + const int separate_buffers) { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; } inline vec3_acc_t * get_off_f() { return _off_f; } #endif @@ -180,9 +190,18 @@ class IntelBuffers { } } + #ifndef _LMP_INTEL_OFFLOAD + void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, + const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5); + void fdotr_reduce(const int nall, const int nthreads, const int f_stride, + acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, + acc_t &ov4, acc_t &ov5); + #endif + #ifdef _LMP_INTEL_OFFLOAD inline void thr_pack_cop(const int ifrom, const int ito, - const int offset, const bool dotype = false) { + const int offset, const bool dotype = false) { double ** x = lmp->atom->x + offset; if (dotype == false) { #pragma vector nontemporal @@ -195,16 +214,16 @@ class IntelBuffers { int *type = lmp->atom->type + offset; #pragma vector nontemporal for (int i = ifrom; i < ito; i++) { - _x[i].x = x[i][0]; - _x[i].y = x[i][1]; - _x[i].z = x[i][2]; - _x[i].w = type[i]; + _x[i].x = x[i][0]; + _x[i].y = x[i][1]; + _x[i].z = x[i][2]; + _x[i].w = type[i]; } } } inline void thr_pack_host(const int ifrom, const int ito, - const int offset) { + const int offset) { double ** x = lmp->atom->x + offset; for 
(int i = ifrom; i < ito; i++) { _host_x[i].x = x[i][0]; @@ -214,13 +233,13 @@ class IntelBuffers { } inline void pack_sep_from_single(const int host_min_local, - const int used_local, - const int host_min_ghost, - const int used_ghost) { + const int used_local, + const int host_min_ghost, + const int used_ghost) { memcpy(_host_x + host_min_local, _x + host_min_local, - used_local * sizeof(atom_t)); + used_local * sizeof(atom_t)); memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost, - used_ghost * sizeof(atom_t)); + used_ghost * sizeof(atom_t)); int nall = used_local + used_ghost + host_min_local; _host_x[nall].x = INTEL_BIGP; _host_x[nall].y = INTEL_BIGP; @@ -228,9 +247,9 @@ class IntelBuffers { _host_x[nall].w = 1; if (lmp->atom->q != NULL) { memcpy(_host_q + host_min_local, _q + host_min_local, - used_local * sizeof(flt_t)); + used_local * sizeof(flt_t)); memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost, - used_ghost * sizeof(flt_t)); + used_ghost * sizeof(flt_t)); } } @@ -263,6 +282,10 @@ class IntelBuffers { int _ccache_stride; flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew; int *_ccachei, *_ccachej; + + int _ncache_stride, _ncache_alloc; + flt_t *_ncachex, *_ncachey, *_ncachez; + int *_ncachej, *_ncachejtype; #ifdef LMP_USE_AVXCD int _ccache_stride3; acc_t * _ccachef; @@ -274,7 +297,7 @@ class IntelBuffers { flt_t *_host_q; quat_t *_host_quat; vec3_acc_t *_off_f; - int _off_map_nmax, _cop, _off_ccache; + int _off_map_nmax, _cop, _off_ccache, _off_ncache; int *_off_map_ilist; int *_off_map_special, *_off_map_nspecial, *_off_map_tag; int *_off_map_numneigh; @@ -287,7 +310,7 @@ class IntelBuffers { _alignvar(acc_t _ev_global_host[8],64); void _grow(const int nall, const int nlocal, const int nthreads, - const int offload_end); + const int offload_end); void _grow_nmax(const int offload_end); void _grow_list_local(NeighList *list, const int offload_end); void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, 
diff --git a/src/USER-INTEL/intel_intrinsics.h b/src/USER-INTEL/intel_intrinsics.h index 44a9605961..069eb5bed5 100644 --- a/src/USER-INTEL/intel_intrinsics.h +++ b/src/USER-INTEL/intel_intrinsics.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* *- c++ -*- ----------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov @@ -46,23 +46,23 @@ struct lmp_intel_an_fvec { lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; } lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; } const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] += b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] += b.data[:]; return ret; } const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] -= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] -= b.data[:]; return ret; } const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] *= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] *= b.data[:]; return ret; } const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] /= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] /= b.data[:]; return ret; } lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) { @@ -103,18 +103,18 @@ struct lmp_intel_an_ivec { explicit lmp_intel_an_ivec(int i) { data[:] = i; } explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; } const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] &= b.data[:]; + lmp_intel_an_ivec ret = *this; + ret.data[:] &= b.data[:]; return ret; } const 
lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] |= b.data[:]; + lmp_intel_an_ivec ret = *this; + ret.data[:] |= b.data[:]; return ret; } const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] += b.data[:]; + lmp_intel_an_ivec ret = *this; + ret.data[:] += b.data[:]; return ret; } }; @@ -171,13 +171,13 @@ enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN }; // This is used in the selection logic template -struct vector_traits { - static const bool support_integer_and_gather_ops = true; +struct vector_traits { + static const bool support_integer_and_gather_ops = true; }; template<> -struct vector_traits { - static const bool support_integer_and_gather_ops = false; +struct vector_traits { + static const bool support_integer_and_gather_ops = false; }; // This is the base template for all the different architectures @@ -198,10 +198,10 @@ struct ivec32x16 { } explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); } operator __m512i() const { return vec; } - friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { + friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { return _mm512_and_epi32(a, b); } - friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { + friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { return _mm512_or_epi32(a, b); } friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) { @@ -326,7 +326,7 @@ struct vector_ops { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 8); @@ 
-337,7 +337,7 @@ struct vector_ops { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast(base) + 48); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast(base) + 56); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 8); @@ -464,7 +464,7 @@ struct vector_ops { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 4); @@ -475,7 +475,7 @@ struct vector_ops { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast(base) + 24); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast(base) + 28); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 4); @@ -519,10 +519,10 @@ struct ivec32x8 { } explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); } operator __m256i() const { return vec; } - friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { + friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } - friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { + friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { return 
_mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) { @@ -545,10 +545,10 @@ struct avx_bvec { operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator ivec32x8() const { return vec; } - friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { + friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } - friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { + friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); } @@ -582,8 +582,8 @@ struct vector_ops { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_pd(reinterpret_cast(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) - ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[2*i]) + result[i] = mask_test_at(mask, i) + ? 
*reinterpret_cast(reinterpret_cast(base) + scale * idxs[2*i]) : src[i]; } return _mm256_load_pd(reinterpret_cast(result)); @@ -605,18 +605,18 @@ struct vector_ops { __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0))); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { iarr i, m; _mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs); @@ -642,10 +642,10 @@ struct vector_ops { __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); - *r0 = blend(mask, *r0, c0); - *r1 = blend(mask, *r1, c1); - *r2 = blend(mask, *r2, c2); - *r3 = blend(mask, *r3, c3); + *r0 = blend(mask, *r0, c0); + *r1 = blend(mask, *r1, c1); + *r2 = blend(mask, *r2, c2); + *r3 = blend(mask, *r3, c3); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return (b & mask) | (a & ~ mask); @@ -809,8 +809,8 @@ struct vector_ops { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_ps(reinterpret_cast(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) 
- ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) + result[i] = mask_test_at(mask, i) + ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) : src[i]; } return _mm256_load_ps(reinterpret_cast(result)); @@ -842,18 +842,18 @@ struct vector_ops { __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm256_castps_si256(c3)); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 16, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { iarr i, m; int_store(i, idxs); @@ -880,10 +880,10 @@ struct vector_ops { __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); - *r0 = blend(mask, *r0, c0); - *r1 = blend(mask, *r1, c1); - *r2 = blend(mask, *r2, c2); - *r3 = blend(mask, *r3, c3); + *r0 = blend(mask, *r0, c0); + *r1 = blend(mask, *r1, c1); + *r2 = blend(mask, *r2, c2); + *r3 = blend(mask, *r3, c3); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return (b & mask) | (a & ~ mask); @@ -961,8 +961,8 @@ struct vector_ops { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_si256(reinterpret_cast<__m256i*>(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) - 
? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) + result[i] = mask_test_at(mask, i) + ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) : src[i]; } return _mm256_load_si256(reinterpret_cast<__m256i*>(result)); @@ -1038,10 +1038,10 @@ struct avx2_ivec32 { } explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); } operator __m256i() const { return vec; } - friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) { + friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) { return _mm256_and_si256(a, b); } - friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) { + friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) { return _mm256_or_si256(a, b); } friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) { @@ -1060,14 +1060,14 @@ struct avx2_bvec { operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator avx2_ivec32() const { return vec; } - friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) { + friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) { return _mm256_and_si256(a, b); } - friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) { + friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) { return _mm256_or_si256(a, b); } friend avx2_bvec operator ~(const avx2_bvec &a) { - return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF)); + return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF)); } avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); } }; @@ -1106,13 +1106,13 @@ struct vector_ops { *z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1); *w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void 
*base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idx, const bvec &mask, const void *base, + static void gather_4(const ivec &idx, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120 ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8); @@ -1253,7 +1253,7 @@ struct vector_ops { *z = _mm256_mask_i32gather_ps(*z, reinterpret_cast(base) + 2, idx, mask, 1); *w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast(base) + 3, idx, mask, 1); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 4); @@ -1264,7 +1264,7 @@ struct vector_ops { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast(base) + 24); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast(base) + 28); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 4); @@ -1401,10 +1401,10 @@ struct ivec32x4 { } explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); } operator __m128i() const { return vec; } - friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) { + friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) { return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } - friend ivec32x4 operator 
|(const ivec32x4 &a, const ivec32x4 &b) { + friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) { return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) { @@ -1420,10 +1420,10 @@ struct sse_bvecx4 { operator __m128i() const { return vec; } operator F64vec2() const { return _mm_castsi128_pd(vec); } operator ivec32x4() const { return vec; } - friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) { + friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) { return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } - friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) { + friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) { return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); } @@ -1477,18 +1477,18 @@ struct vector_ops { __m128d c1 = _mm_unpackhi_pd(a0lo, a1lo); __m128d c2 = _mm_unpacklo_pd(a0hi, a1hi); __m128d c3 = _mm_unpackhi_pd(a0hi, a1hi); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0)); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, 
const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 8); @@ -1634,8 +1634,8 @@ struct vector_ops { _mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx); _mm_store_ps(reinterpret_cast(src), from); for (int i = 0; i < VL; i++) { - result[i] = m[i] - ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) + result[i] = m[i] + ? *reinterpret_cast(reinterpret_cast(base) + scale * idxs[i]) : src[i]; } return _mm_load_ps(reinterpret_cast(result)); @@ -1647,13 +1647,13 @@ struct vector_ops { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 16, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 4); @@ -1816,13 +1816,13 @@ struct vector_ops { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 4 * sizeof(fscal), r4, r5, r6, 
r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 1 * sizeof(fscal)); @@ -1946,13 +1946,13 @@ struct vector_ops { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast(base) + 4 * sizeof(fscal), r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast(base) + 0 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast(base) + 1 * sizeof(fscal)); @@ -2113,7 +2113,7 @@ struct AccumulatorTwiceMixin { typedef avec_t avec; typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN))); - + static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) { typename HIGH::fvec blo = BASE::cvtup_lo(b); typename HIGH::fvec bhi = BASE::cvtup_hi(b); @@ -2121,7 +2121,7 @@ struct AccumulatorTwiceMixin { BASE::mask_cvtup(m, &mlo, &mhi); return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi)); } - + static typename HIGH::fscal acc_reduce_add(const avec &a) { return HIGH::reduce_add(a.lo + a.hi); } @@ -2143,13 +2143,13 @@ template struct AccumulatorTwiceMixinNone { typedef 
vector_ops BASE; typedef vector_ops HIGH; - + typedef typename HIGH::fvec avec; typedef typename HIGH::fscal aarr[BASE::VL]; - + static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) { return HIGH::mask_add(src, m, a, static_cast(b)); - } + } static typename HIGH::fscal acc_reduce_add(const avec &a) { return HIGH::reduce_add(a); } diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index ad07dfd7c2..d5cf6f5be2 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -17,6 +17,9 @@ #ifdef __INTEL_COMPILER #define LMP_SIMD_COMPILER +#if (__INTEL_COMPILER_BUILD_DATE > 20160720) +#define LMP_INTEL_USE_SIMDOFF +#endif #endif #ifdef __INTEL_OFFLOAD @@ -65,7 +68,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_MAX_STENCIL 256 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL) #define INTEL_MAX_STENCIL_CHECK 4096 -#define INTEL_P3M_MAXORDER 5 +#define INTEL_P3M_MAXORDER 7 +#define INTEL_P3M_ALIGNED_MAXORDER 8 +// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY) +#define INTEL_P3M_TABLE 1 #ifdef __INTEL_COMPILER #ifdef __AVX__ @@ -87,24 +93,36 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #ifdef __MIC__ #define INTEL_V512 1 #define INTEL_VMASK 1 +#define INTEL_HTHREADS 4 #endif #endif +#ifdef __AVX512ER__ +#define INTEL_HTHREADS 4 +#endif + #ifdef __AVX512CD__ #ifndef _LMP_INTEL_OFFLOAD #define LMP_USE_AVXCD #endif #endif +#ifdef __MIC__ +#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH +#else +#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH +#endif + #else #undef INTEL_VECTOR_WIDTH #define INTEL_VECTOR_WIDTH 1 +#define INTEL_COMPILE_WIDTH 1 #endif #define INTEL_DATA_ALIGN 64 -#define INTEL_ONEATOM_FACTOR 2 +#define INTEL_ONEATOM_FACTOR 1 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH #define INTEL_LB_MEAN_WEIGHT 0.1 @@ 
-112,104 +130,380 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_MAX_HOST_CORE_COUNT 512 #define INTEL_MAX_COI_CORES 36 -#define IP_PRE_get_stride(stride, n, datasize, torque) \ - { \ - int blength = n; \ - if (torque) blength *= 2; \ - const int bytes = blength * datasize; \ +#ifndef INTEL_HTHREADS +#define INTEL_HTHREADS 2 +#endif + +#define IP_PRE_get_stride(stride, n, datasize, torque) \ + { \ + int blength = n; \ + if (torque) blength *= 2; \ + const int bytes = blength * datasize; \ stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN); \ - stride = blength + stride / datasize; \ + stride = blength + stride / datasize; \ } #if defined(_OPENMP) -#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ - { \ - const int idelta = 1 + inum/nthreads; \ - ifrom = tid * idelta; \ - ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; \ +#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ + { \ + int idelta = inum/nthreads; \ + const int imod = inum % nthreads; \ + ifrom = tid * idelta; \ + ito = ifrom + idelta; \ + if (tid < imod) { \ + ito+=tid+1; \ + ifrom+=tid; \ + } else { \ + ito+=imod; \ + ifrom+=imod; \ + } \ } -#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ - { \ - tid = omp_get_thread_num(); \ - IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \ +#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ + { \ + tid = omp_get_thread_num(); \ + IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \ + } + +#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \ + { \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid; \ + ito = inum; \ + ip = nthr; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ + IP_PRE_omp_range(ifrom, ito, td, inum, nd); \ + ifrom += tm; \ + ip = INTEL_HTHREADS; \ + } else { \ + IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \ + ip = 1; \ + } \ + } + 
+#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + tid = omp_get_thread_num(); \ + IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \ } #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ int chunk_size = INTEL_DATA_ALIGN / datasize; \ - int idelta = static_cast(static_cast(inum) \ - /chunk_size/nthreads) + 1; \ - idelta *= chunk_size; \ + int idelta = static_cast(ceil(static_cast(inum) \ + /chunk_size/nthreads)); \ + idelta *= chunk_size; \ ifrom = tid*idelta; \ ito = ifrom + idelta; \ if (ito > inum) ito = inum; \ } #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \ - nthreads, datasize) \ - { \ - tid = omp_get_thread_num(); \ + nthreads, datasize) \ + { \ + tid = omp_get_thread_num(); \ IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ - datasize); \ + datasize); \ } #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \ - nthreads, vecsize) \ - { \ - tid = omp_get_thread_num(); \ - int idelta = static_cast(ceil(static_cast(inum) \ - /vecsize/nthreads)); \ - idelta *= vecsize; \ - ifrom = tid*idelta; \ - ito = ifrom + idelta; \ - if (ito > inum) ito = inum; \ + nthreads, vecsize) \ + { \ + tid = omp_get_thread_num(); \ + int idelta = static_cast(ceil(static_cast(inum) \ + /vecsize/nthreads)); \ + idelta *= vecsize; \ + ifrom = tid*idelta; \ + ito = ifrom + idelta; \ + if (ito > inum) ito = inum; \ + } + +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = omp_get_thread_num(); \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid*vecsize; \ + ito = inum; \ + ip = nthr*vecsize; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ + IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd, \ + vecsize); \ + ifrom += tm * vecsize; \ + ip = INTEL_HTHREADS * vecsize; \ + } else { \ + IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr, \ + vecsize); \ + ip = vecsize; 
\ + } \ } #else -#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ - { \ - ifrom = 0; \ - ito = inum; \ +#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ } -#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ - { \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ +#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + } + +#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ + } + +#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ } #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ - ifrom = 0; \ - ito = inum; \ + ifrom = 0; \ + ito = inum; \ } #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \ - nthreads, datasize) \ -{ \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ + nthreads, datasize) \ +{ \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ } #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \ - nthreads, vecsize) \ - { \ - tid = 0; \ - int idelta = static_cast(ceil(static_cast(inum) \ - /vecsize)); \ - ifrom = 0; \ - ito = inum; \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + } + +#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = vecsize; \ } #endif +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \ + int vwidth; \ + if (sizeof(acc_t) == sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("vector aligned") \ + 
_use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * 
x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if (nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * 
x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ +} + +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == 2) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == 2) { \ + int nt_min = MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ +} + #ifdef 
_LMP_INTEL_OFFLOAD #include @@ -223,120 +517,131 @@ inline double MIC_Wtime() { return time; } -#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ - nlocal, nall) \ -{ \ - if (fix->separate_buffers() && ago != 0) { \ - fix->start_watch(TIME_PACK); \ - if (offload) { \ - _use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \ - { \ - int ifrom, ito, tid; \ - int nthreads = comm->nthreads; \ - IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \ - nthreads, sizeof(flt_t)); \ - buffers->thr_pack_cop(ifrom, ito, 0); \ - int nghost = nall - nlocal; \ - if (nghost) { \ - IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \ - nthreads, sizeof(flt_t)); \ - buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \ - fix->offload_min_ghost() - nlocal, \ - ago == 1); \ - } \ - } \ - } else { \ - buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \ - buffers->thr_pack_host(nlocal, nall, \ - fix->host_min_ghost()-nlocal); \ - } \ - fix->stop_watch(TIME_PACK); \ - } \ +#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ + nlocal, nall) \ +{ \ + if (fix->separate_buffers() && ago != 0) { \ + fix->start_watch(TIME_PACK); \ + if (offload) { \ + int packthreads; \ + if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\ + else packthreads = 1; \ + _use_omp_pragma("omp parallel if(packthreads > 1)") \ + { \ + int ifrom, ito, tid; \ + IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \ + packthreads, sizeof(flt_t)); \ + buffers->thr_pack_cop(ifrom, ito, 0); \ + int nghost = nall - nlocal; \ + if (nghost) { \ + IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \ + packthreads, sizeof(flt_t)); \ + buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \ + fix->offload_min_ghost() - nlocal, \ + ago == 1); \ + } \ + } \ + } else { \ + buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \ + buffers->thr_pack_host(nlocal, nall, \ + fix->host_min_ghost()-nlocal); \ + } \ + fix->stop_watch(TIME_PACK); \ + } \ } 
-#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \ - buffers, offload, fix, separate_flag, \ - x_size, q_size, ev_size, f_stride) \ -{ \ - separate_flag = 0; \ - if (ago == 0) { \ - x_size = 0; \ - q_size = nall; \ - if (offload) { \ - if (fix->separate_buffers()) { \ - if (lmp->atom->torque) \ - separate_flag = 2; \ - else \ - separate_flag = 1; \ - } else \ - separate_flag = 3; \ - } \ - } else { \ - x_size = nall; \ - q_size = 0; \ - } \ - ev_size = 0; \ - if (evflag) { \ - if (eflag) ev_size = 2; \ - if (vflag) ev_size = 8; \ - } \ - int f_length; \ - if (newton) \ - f_length = nall; \ - else \ - f_length = nlocal; \ - f_length -= minlocal; \ - f_stride = buffers->get_stride(f_length); \ +#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ + buffers, offload, fix, separate_flag, \ + x_size, q_size, ev_size, f_stride) \ +{ \ + separate_flag = 0; \ + if (ago == 0) { \ + x_size = 0; \ + q_size = nall; \ + if (offload) { \ + if (fix->separate_buffers()) { \ + if (lmp->atom->torque) \ + separate_flag = 2; \ + else \ + separate_flag = 1; \ + } else \ + separate_flag = 3; \ + } \ + } else { \ + x_size = nall; \ + q_size = 0; \ + } \ + ev_size = 0; \ + if (eflag) ev_size = 2; \ + if (vflag) ev_size = 8; \ + if (newton) \ + f_stride = buffers->get_stride(nall); \ + else \ + f_stride = buffers->get_stride(inum); \ } -#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ - ev_global) \ -{ \ - if (offload) { \ - tc = buffers->get_off_threads(); \ - f_start = buffers->get_off_f(); \ - ev_global = buffers->get_ev_global(); \ - } else { \ - tc = comm->nthreads; \ - f_start = buffers->get_f(); \ - fix->start_watch(TIME_HOST_PAIR); \ - ev_global = buffers->get_ev_global_host(); \ - } \ +#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ + ev_global) \ +{ \ + if (offload) { \ + tc = buffers->get_off_threads(); \ + f_start = buffers->get_off_f(); \ + ev_global = buffers->get_ev_global(); \ + } else { \ + tc = comm->nthreads; \ + 
f_start = buffers->get_f(); \ + fix->start_watch(TIME_HOST_PAIR); \ + ev_global = buffers->get_ev_global_host(); \ + } \ } -#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ - f_stride, x, q) \ -{ \ - if (separate_flag) { \ - if (separate_flag < 3) { \ - int all_local = nlocal; \ - int ghost_min = overflow[LMP_GHOST_MIN]; \ - nlocal = overflow[LMP_LOCAL_MAX] + 1; \ - int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \ - if (nghost < 0) nghost = 0; \ - nall = nlocal + nghost; \ - separate_flag--; \ - int flength; \ - if (newton) flength = nall; \ - else flength = nlocal; \ - IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \ - separate_flag); \ - if (nghost) { \ - if (nlocal < all_local || ghost_min > all_local) { \ - memmove(x + nlocal, x + ghost_min, \ - (nall - nlocal) * sizeof(ATOM_T)); \ - if (q != 0) \ - memmove((void *)(q + nlocal), (void *)(q + ghost_min), \ - (nall - nlocal) * sizeof(flt_t)); \ - } \ - } \ - } \ - x[nall].x = INTEL_BIGP; \ - x[nall].y = INTEL_BIGP; \ - x[nall].z = INTEL_BIGP; \ - } \ +#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ + f_stride, x, q) \ +{ \ + if (separate_flag) { \ + if (separate_flag < 3) { \ + int all_local = nlocal; \ + int ghost_min = overflow[LMP_GHOST_MIN]; \ + nlocal = overflow[LMP_LOCAL_MAX] + 1; \ + int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \ + if (nghost < 0) nghost = 0; \ + nall = nlocal + nghost; \ + separate_flag--; \ + int flength; \ + if (newton) flength = nall; \ + else flength = nlocal; \ + IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \ + separate_flag); \ + if (nghost) { \ + if (nlocal < all_local || ghost_min > all_local) { \ + memmove(x + nlocal, x + ghost_min, \ + (nall - nlocal) * sizeof(ATOM_T)); \ + if (q != 0) \ + memmove((void *)(q + nlocal), (void *)(q + ghost_min), \ + (nall - nlocal) * sizeof(flt_t)); \ + } \ + } \ + } \ + x[nall].x = INTEL_BIGP; \ + x[nall].y = INTEL_BIGP; \ + x[nall].z = INTEL_BIGP; \ + } \ } +#define 
IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + _use_omp_pragma("omp barrier"); \ + IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, x, offload, vflag, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ +} + +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) #else @@ -344,265 +649,242 @@ inline double MIC_Wtime() { #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ nlocal, nall) -#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \ - buffers, offload, fix, separate_flag, \ - x_size, q_size, ev_size, f_stride) \ +#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ + buffers, offload, fix, separate_flag, \ + x_size, q_size, ev_size, f_stride) \ { \ - separate_flag = 0; \ + separate_flag = 0; \ int f_length; \ if (newton) \ f_length = nall; \ else \ f_length = nlocal; \ - f_stride = buffers->get_stride(f_length); \ + f_stride = buffers->get_stride(f_length); \ } -#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ - ev_global) \ -{ \ - tc = comm->nthreads; \ - f_start = buffers->get_f(); \ - fix->start_watch(TIME_HOST_PAIR); \ - ev_global = buffers->get_ev_global_host(); \ -} - -#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ - f_stride, x, q) - - -#endif - -#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz) \ +#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ + ev_global) \ { \ - if (vflag == 1) { \ - sv0 += ev_pre * delx * delx * fpair; \ - sv1 += ev_pre * dely * dely * fpair; \ - sv2 += ev_pre * delz * delz * fpair; \ - sv3 += ev_pre * delx * dely * fpair; \ - sv4 += ev_pre * delx * delz * fpair; \ - sv5 += ev_pre * dely * delz * fpair; \ + tc = comm->nthreads; \ + f_start = buffers->get_f(); \ + fix->start_watch(TIME_HOST_PAIR); \ + ev_global = buffers->get_ev_global_host(); \ +} + 
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ + f_stride, x, q) + +#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads > INTEL_HTHREADS) { \ + _use_omp_pragma("omp barrier"); \ + buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ } \ } -#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2) \ +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \ + int lt = nall * 4; \ + buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \ + ov2, ov3, ov4, ov5); \ + } \ + } \ +} + +#endif + +#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \ +{ \ + if (vflag == 1) { \ + sv0 += delx * delx * fpair; \ + sv1 += dely * dely * fpair; \ + sv2 += delz * delz * fpair; \ + sv3 += delx * dely * fpair; \ + sv4 += delx * delz * fpair; \ + sv5 += dely * delz * fpair; \ + } \ +} + +#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \ +{ \ + if (vflag == 1) { \ + sv0 += dx * fpx; \ + sv1 += dy * fpy; \ + sv2 += dz * fpz; \ + sv3 += dx * fpy; \ + sv4 += dx * fpz; \ + sv5 += dy * fpz; \ + } \ +} + +#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2) \ { \ if (vflag == 1) { \ sv0 += delx * fj[0] + delr2[0] * fk[0]; \ - sv1 += dely * fj[1] + delr2[1] * fk[1]; \ - sv2 += delz * fj[2] + delr2[2] * fk[2]; \ - sv3 += delx * fj[1] + delr2[0] * fk[1]; \ - sv4 += delx * fj[2] + delr2[0] * fk[2]; \ - sv5 += dely * fj[2] + delr2[1] * fk[2]; \ + sv1 += dely * fj[1] + delr2[1] * fk[1]; \ + sv2 += delz * fj[2] + delr2[2] * fk[2]; \ + sv3 += delx * fj[1] + delr2[0] * fk[1]; \ + sv4 += delx * fj[2] + delr2[0] * fk[2]; \ + sv5 += dely * fj[2] + delr2[1] * fk[2]; \ } \ } #define IP_PRE_ev_tally_nbor3v(vflag, fj0, 
fj1, fj2, delx, dely, delz) \ { \ if (vflag == 1) { \ - sv0 += delx * fj0; \ - sv1 += dely * fj1; \ - sv2 += delz * fj2; \ - sv3 += delx * fj1; \ - sv4 += delx * fj2; \ - sv5 += dely * fj2; \ + sv0 += delx * fj0; \ + sv1 += dely * fj1; \ + sv2 += delz * fj2; \ + sv3 += delx * fj1; \ + sv4 += delx * fj2; \ + sv5 += dely * fj2; \ } \ } -#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \ - delx, dely, delz, obond, force, newton, \ - nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \ +#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \ + fbond, delx, dely, delz, obond, force, \ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.5; \ - if (i2 < nlocal) ev_pre += (flt_t)0.5; \ - } \ - \ - if (eflag) { \ - oebond += ev_pre * ebond; \ - if (eatom) { \ - flt_t halfeng = ebond * (flt_t)0.5; \ - if (newton || i1 < nlocal) f[i1].w += halfeng; \ - if (newton || i2 < nlocal) f[i2].w += halfeng; \ - } \ - } \ - \ - if (vflag) { \ - ov0 += ev_pre * (delx * delx * fbond); \ - ov1 += ev_pre * (dely * dely * fbond); \ - ov2 += ev_pre * (delz * delz * fbond); \ - ov3 += ev_pre * (delx * dely * fbond); \ - ov4 += ev_pre * (delx * delz * fbond); \ - ov5 += ev_pre * (dely * delz * fbond); \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.5; \ + if (i2 < nlocal) ev_pre += (flt_t)0.5; \ + } \ + \ + if (eflag) { \ + obond += ev_pre * ebond; \ + if (eatom) { \ + flt_t halfeng = ebond * (flt_t)0.5; \ + if (newton || i1 < nlocal) f[i1].w += halfeng; \ + if (newton || i2 < nlocal) f[i2].w += halfeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ + ov0 += ev_pre * (delx * delx * fbond); \ + ov1 += ev_pre * (dely * dely * fbond); \ + ov2 += ev_pre * (delz * delz * fbond); \ + ov3 += ev_pre * (delx * dely * fbond); \ + ov4 += ev_pre * (delx * delz * 
fbond); \ + ov5 += ev_pre * (dely * delz * fbond); \ } \ } -#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \ - f1x, f1y, f1z, f3x, f3y, f3z, delx1, \ - dely1, delz1, delx2, dely2, delz2, \ - oeangle, force, newton, nlocal, ov0, ov1, \ - ov2, ov3, ov4, ov5) \ +#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \ + i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \ + delx1, dely1, delz1, delx2, dely2, delz2, \ + oeangle, force, newton, nlocal, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - } \ - \ - if (eflag) { \ - oeangle += ev_pre * eangle; \ - if (eatom) { \ - flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \ - if (newton || i1 < nlocal) f[i1].w += thirdeng; \ - if (newton || i2 < nlocal) f[i2].w += thirdeng; \ - if (newton || i3 < nlocal) f[i3].w += thirdeng; \ - } \ - } \ - \ - if (vflag) { \ - ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ - ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \ - ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ - ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \ - ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \ - ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + } \ + \ + if (eflag) { \ + oeangle += ev_pre * eangle; \ + if (eatom) { \ + flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \ + if (newton || i1 < nlocal) f[i1].w += thirdeng; \ + if (newton || i2 < nlocal) f[i2].w += thirdeng; \ + if (newton || i3 < nlocal) f[i3].w += thirdeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ + 
ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ + ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \ + ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ + ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \ + ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \ + ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \ } \ } -#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\ - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \ - f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \ - vb3x, vb3y, vb3z,oedihedral, force, \ - newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ - ov5) \ +#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \ + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\ + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \ + vb2z, vb3x, vb3y, vb3z, oedihedral, force,\ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.25; \ - if (i2 < nlocal) ev_pre += (flt_t)0.25; \ - if (i3 < nlocal) ev_pre += (flt_t)0.25; \ - if (i4 < nlocal) ev_pre += (flt_t)0.25; \ - } \ - \ - if (eflag) { \ - oedihedral += ev_pre * deng; \ - if (eatom) { \ - flt_t qdeng = deng * (flt_t)0.25; \ - if (newton || i1 < nlocal) f[i1].w += qdeng; \ - if (newton || i2 < nlocal) f[i2].w += qdeng; \ - if (newton || i3 < nlocal) f[i3].w += qdeng; \ - if (newton || i4 < nlocal) f[i4].w += qdeng; \ - } \ - } \ - \ - if (vflag) { \ - ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ - ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ - ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ - ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ - ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ - ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.25; \ + if (i2 < nlocal) ev_pre += (flt_t)0.25; \ + if (i3 < nlocal) 
ev_pre += (flt_t)0.25; \ + if (i4 < nlocal) ev_pre += (flt_t)0.25; \ + } \ + \ + if (eflag) { \ + oedihedral += ev_pre * deng; \ + if (eatom) { \ + flt_t qdeng = deng * (flt_t)0.25; \ + if (newton || i1 < nlocal) f[i1].w += qdeng; \ + if (newton || i2 < nlocal) f[i2].w += qdeng; \ + if (newton || i3 < nlocal) f[i3].w += qdeng; \ + if (newton || i4 < nlocal) f[i4].w += qdeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ + ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ + ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ + ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ + ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ + ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ + ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ } \ } -#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \ -{ \ - if (evflag) { \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - } \ - if (vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ - } \ +#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \ +{ \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ + } \ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ + } \ } -#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp) \ -{ \ - if (evflag) { \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - oecoul += secoul; \ - } \ - if (vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ - } \ -} - -#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \ - nall, nlocal, minlocal, nthreads, \ - f_start, f_stride, x, offload) \ -{ \ - int o_range; \ - if (newton) \ - o_range = nall; \ - else \ - o_range = nlocal; \ - if (offload == 0) o_range -= minlocal; \ - IP_PRE_omp_range_align(iifrom, iito, tid, o_range, 
nthreads, \ - sizeof(acc_t)); \ - \ - int t_off = f_stride; \ - if (eflag && eatom) { \ - for (int t = 1; t < nthreads; t++) { \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - f_start[n].x += f_start[n + t_off].x; \ - f_start[n].y += f_start[n + t_off].y; \ - f_start[n].z += f_start[n + t_off].z; \ - f_start[n].w += f_start[n + t_off].w; \ - } \ - t_off += f_stride; \ - } \ - } else { \ - for (int t = 1; t < nthreads; t++) { \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - f_start[n].x += f_start[n + t_off].x; \ - f_start[n].y += f_start[n + t_off].y; \ - f_start[n].z += f_start[n + t_off].z; \ - } \ - t_off += f_stride; \ - } \ - } \ - \ - if (evflag) { \ - if (vflag == 2) { \ - const ATOM_T * _noalias const xo = x + minlocal; \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - ov0 += f_start[n].x * xo[n].x; \ - ov1 += f_start[n].y * xo[n].y; \ - ov2 += f_start[n].z * xo[n].z; \ - ov3 += f_start[n].y * xo[n].x; \ - ov4 += f_start[n].z * xo[n].x; \ - ov5 += f_start[n].z * xo[n].y; \ - } \ - } \ - } \ +#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \ +{ \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ + oecoul += secoul; \ + } \ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ + } \ } } diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h index ac13f1edfd..4616f628e7 100644 --- a/src/USER-INTEL/intel_simd.h +++ b/src/USER-INTEL/intel_simd.h @@ -42,25 +42,25 @@ namespace ip_simd { struct SIMD_int { __m512i v; SIMD_int() {} - SIMD_int(const __m512i in) : v(in) {} + SIMD_int(const __m512i in) : v(in) {} operator __m512i() const { return v;} }; struct SIMD_float { __m512 v; SIMD_float() {} - SIMD_float(const __m512 in) : v(in) {} 
+ SIMD_float(const __m512 in) : v(in) {} operator __m512() const { return v;} }; struct SIMD_double { __m512d v; SIMD_double() {} - SIMD_double(const __m512d in) : v(in) {} + SIMD_double(const __m512d in) : v(in) {} operator __m512d() const { return v;} }; - template + template class SIMD_type { }; @@ -92,20 +92,20 @@ namespace ip_simd { // ------- Set Operations - inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, - const int l3, const int l4, const int l5, - const int l6, const int l7, const int l8, - const int l9, const int l10, const int l11, - const int l12, const int l13, const int l14, - const int l15) { + inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, + const int l3, const int l4, const int l5, + const int l6, const int l7, const int l8, + const int l9, const int l10, const int l11, + const int l12, const int l13, const int l14, + const int l15) { return _mm512_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7, - l8,l9,l10,l11,l12,l13,l14,l15); + l8,l9,l10,l11,l12,l13,l14,l15); } inline SIMD_int SIMD_set(const int l) { return _mm512_set1_epi32(l); } - + inline SIMD_float SIMD_set(const float l) { return _mm512_set1_ps(l); } @@ -113,28 +113,28 @@ namespace ip_simd { inline SIMD_double SIMD_set(const double l) { return _mm512_set1_pd(l); } - + inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) { return _mm512_maskz_mov_epi32(m, one); } - inline SIMD_float SIMD_zero_masked(const SIMD_mask &m, - const SIMD_float &one) { + inline SIMD_float SIMD_zero_masked(const SIMD_mask &m, + const SIMD_float &one) { return _mm512_maskz_mov_ps(m, one); } - inline SIMD_double SIMD_zero_masked(const SIMD_mask &m, - const SIMD_double &one) { + inline SIMD_double SIMD_zero_masked(const SIMD_mask &m, + const SIMD_double &one) { return _mm512_maskz_mov_pd(m, one); } - inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, - const SIMD_float &one) { + inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, + 
const SIMD_float &one) { return _mm512_mask_mov_ps(src,m,one); } - inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, - const SIMD_double &one) { + inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, + const SIMD_double &one) { return _mm512_mask_mov_pd(src,m,one); } @@ -147,11 +147,11 @@ namespace ip_simd { inline SIMD_float SIMD_load(const float *p) { return _mm512_load_ps(p); } - + inline SIMD_double SIMD_load(const double *p) { return _mm512_load_pd(p); } - + inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) { return _mm512_maskz_load_epi32(m, p); } @@ -159,7 +159,7 @@ namespace ip_simd { inline SIMD_float SIMD_loadz(const SIMD_mask &m, const float *p) { return _mm512_maskz_load_ps(m, p); } - + inline SIMD_double SIMD_loadz(const SIMD_mask &m, const double *p) { return _mm512_maskz_load_pd(m, p); } @@ -168,7 +168,7 @@ namespace ip_simd { return _mm512_i32gather_epi32(i, p, _MM_SCALE_4); } - inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) { + inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) { return _mm512_i32gather_ps(i, p, _MM_SCALE_4); } @@ -177,56 +177,56 @@ namespace ip_simd { } inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_float SIMD_gather(const SIMD_mask &m, const float *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } template inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { } template <> inline SIMD_int 
SIMD_gatherz_offset(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } template <> inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } // ------- Store Operations - + inline void SIMD_store(int *p, const SIMD_int &one) { return _mm512_store_epi32(p,one); } @@ -240,17 +240,17 @@ namespace ip_simd { } inline void SIMD_scatter(const SIMD_mask &m, int *p, - const SIMD_int &i, const SIMD_int &vec) { + const SIMD_int &i, const SIMD_int &vec) { _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4); } inline void SIMD_scatter(const SIMD_mask &m, float *p, - const SIMD_int &i, const SIMD_float &vec) { + const SIMD_int &i, const SIMD_float &vec) { _mm512_mask_i32scatter_ps(p, m, i, vec, _MM_SCALE_4); } inline void SIMD_scatter(const SIMD_mask &m, double *p, - const SIMD_int &i, const SIMD_double &vec) { + const SIMD_int &i, const SIMD_double &vec) { _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8); } @@ -263,76 +263,76 @@ namespace ip_simd { inline SIMD_float operator+(const SIMD_float &one, const SIMD_float &two) { return _mm512_add_ps(one,two); } - + inline SIMD_double operator+(const SIMD_double &one, const SIMD_double &two) { return _mm512_add_pd(one,two); } - + inline SIMD_int operator+(const SIMD_int &one, const int two) { return 
_mm512_add_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator+(const SIMD_float &one, const float two) { return _mm512_add_ps(one,SIMD_set(two)); } - + inline SIMD_double operator+(const SIMD_double &one, const double two) { return _mm512_add_pd(one,SIMD_set(two)); } inline SIMD_int SIMD_add(const SIMD_mask &m, - const SIMD_int &one, const int two) { + const SIMD_int &one, const int two) { return _mm512_mask_add_epi32(one,m,one,SIMD_set(two)); } inline SIMD_float SIMD_add(const SIMD_mask &m, - const SIMD_float &one, const float two) { + const SIMD_float &one, const float two) { return _mm512_mask_add_ps(one,m,one,SIMD_set(two)); } inline SIMD_double SIMD_add(const SIMD_mask &m, - const SIMD_double &one, const double two) { + const SIMD_double &one, const double two) { return _mm512_mask_add_pd(one,m,one,SIMD_set(two)); } inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m, - const SIMD_int &one, const SIMD_int &two) { + const SIMD_int &one, const SIMD_int &two) { return _mm512_mask_add_epi32(s,m,one,two); } inline SIMD_float SIMD_add(const SIMD_float &s, const SIMD_mask &m, - const SIMD_float &one, const SIMD_float &two) { + const SIMD_float &one, const SIMD_float &two) { return _mm512_mask_add_ps(s,m,one,two); } inline SIMD_double SIMD_add(const SIMD_double &s, const SIMD_mask &m, - const SIMD_double &one, const SIMD_double &two) { + const SIMD_double &one, const SIMD_double &two) { return _mm512_mask_add_pd(s,m,one,two); } inline SIMD_int SIMD_sub(const SIMD_int &s, const SIMD_mask &m, - const SIMD_int &one, const SIMD_int &two) { + const SIMD_int &one, const SIMD_int &two) { return _mm512_mask_sub_epi32(s,m,one,two); } inline SIMD_float SIMD_sub(const SIMD_float &s, const SIMD_mask &m, - const SIMD_float &one, const SIMD_float &two) { + const SIMD_float &one, const SIMD_float &two) { return _mm512_mask_sub_ps(s,m,one,two); } inline SIMD_double SIMD_sub(const SIMD_double &s, const SIMD_mask &m, - const SIMD_double &one, const SIMD_double &two) 
{ + const SIMD_double &one, const SIMD_double &two) { return _mm512_mask_sub_pd(s,m,one,two); } inline SIMD_int operator-(const SIMD_int &one) { return _mm512_sub_epi32(SIMD_set((int)0),one); } - + inline SIMD_float operator-(const SIMD_float &one) { return _mm512_sub_ps(SIMD_set((float)0),one); } - + inline SIMD_double operator-(const SIMD_double &one) { return _mm512_sub_pd(SIMD_set((double)0),one); } @@ -340,80 +340,80 @@ namespace ip_simd { inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) { return _mm512_sub_epi32(one,two); } - + inline SIMD_float operator-(const SIMD_float &one, const SIMD_float &two) { return _mm512_sub_ps(one,two); } - + inline SIMD_double operator-(const SIMD_double &one, const SIMD_double &two) { return _mm512_sub_pd(one,two); } - + inline SIMD_int operator-(const SIMD_int &one, const int two) { return _mm512_sub_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator-(const SIMD_float &one, const float two) { return _mm512_sub_ps(one,SIMD_set(two)); } - + inline SIMD_double operator-(const SIMD_double &one, const double two) { return _mm512_sub_pd(one,SIMD_set(two)); } - + inline SIMD_int operator*(const SIMD_int &one, const SIMD_int &two) { return _mm512_mullo_epi32(one,two); } - + inline SIMD_float operator*(const SIMD_float &one, const SIMD_float &two) { return _mm512_mul_ps(one,two); } - + inline SIMD_double operator*(const SIMD_double &one, const SIMD_double &two) { return _mm512_mul_pd(one,two); } - + inline SIMD_int operator*(const SIMD_int &one, const int two) { return _mm512_mullo_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator*(const SIMD_float &one, const float two) { return _mm512_mul_ps(one,SIMD_set(two)); } - + inline SIMD_double operator*(const SIMD_double &one, const double two) { return _mm512_mul_pd(one,SIMD_set(two)); } - + inline SIMD_float operator/(const SIMD_float &one, const SIMD_float &two) { return _mm512_div_ps(one,two); } - + inline SIMD_double operator/(const SIMD_double &one, 
const SIMD_double &two) { return _mm512_div_pd(one,two); } - + inline SIMD_float SIMD_fma(const SIMD_float &one, const SIMD_float &two, - const SIMD_float &three) { + const SIMD_float &three) { return _mm512_fmadd_ps(one,two,three); } inline SIMD_double SIMD_fma(const SIMD_double &one, const SIMD_double &two, - const SIMD_double &three) { + const SIMD_double &three) { return _mm512_fmadd_pd(one,two,three); } inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two, - const SIMD_float &three) { + const SIMD_float &three) { return _mm512_fmsub_ps(one,two,three); } inline SIMD_double SIMD_fms(const SIMD_double &one, const SIMD_double &two, - const SIMD_double &three) { + const SIMD_double &three) { return _mm512_fmsub_pd(one,two,three); } - - // ------- SVML operations + + // ------- SVML operations inline SIMD_float SIMD_rcp(const SIMD_float &one) { #ifdef __AVX512ER__ @@ -489,33 +489,33 @@ namespace ip_simd { // ------- Comparison operations - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, - const SIMD_int &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, + const SIMD_int &two) { return _mm512_mask_cmplt_epi32_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one, - const SIMD_float &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one, + const SIMD_float &two) { return _mm512_mask_cmplt_ps_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one, - const SIMD_double &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one, + const SIMD_double &two) { return _mm512_mask_cmplt_pd_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const int one, - const SIMD_int &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const int one, + const SIMD_int &two) { return _mm512_mask_cmplt_epi32_mask(m, SIMD_set(one), two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const float one, - const SIMD_float &two) { + inline SIMD_mask SIMD_lt(SIMD_mask 
m, const float one, + const SIMD_float &two) { return _mm512_mask_cmplt_ps_mask(m, SIMD_set(one), two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const double one, - const SIMD_double &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const double one, + const SIMD_double &two) { return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two); } @@ -629,112 +629,112 @@ namespace ip_simd { // i indices should be positive inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, - SIMD_float &v1) { + SIMD_float &v1) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); - + _mm512_lzcnt_epi32(cd)); + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_float am_perm; - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v1); - v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_float am_perm; + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v1); + v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, - SIMD_double &v1) { + SIMD_double &v1) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = 
_mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); + _mm512_lzcnt_epi32(cd)); lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid)); - + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_double am_perm; - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v1); - v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_double am_perm; + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v1); + v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i, - SIMD_float &v1, SIMD_float &v2, - SIMD_float &v3) { + SIMD_float &v1, SIMD_float &v2, + SIMD_float &v3) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); - + _mm512_lzcnt_epi32(cd)); + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_float am_perm; - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v1); - v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v2); - v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm); - am_perm = 
_mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v3); - v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_float am_perm; + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v1); + v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v2); + v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm); + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v3); + v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i, - SIMD_double &v1, SIMD_double &v2, - SIMD_double &v3) { + SIMD_double &v1, SIMD_double &v2, + SIMD_double &v3) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); + _mm512_lzcnt_epi32(cd)); lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid)); - + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_double am_perm; - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v1); - v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v2); - v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm); - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v3); - v3 = 
_mm512_mask_add_pd(v3, now_mask, v3, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_double am_perm; + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v1); + v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v2); + v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm); + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v3); + v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } @@ -744,7 +744,7 @@ namespace ip_simd { inline SIMD_int operator&(const SIMD_int &one, const SIMD_int &two) { return _mm512_and_epi32(one,two); } - + inline SIMD_int operator>>(const SIMD_int &one, const SIMD_int &two) { return _mm512_srlv_epi32(one,two); } @@ -752,21 +752,21 @@ namespace ip_simd { inline SIMD_int operator<<(const SIMD_int &one, const unsigned two) { return _mm512_slli_epi32(one,two); } - + // -------- I/O operations inline void SIMD_print(const __m512i &vec) { - for (int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) printf("%d ",(*((int*)&(vec) + (i)))); } inline void SIMD_print(const __m512 &vec) { - for (int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) printf("%f ",(*((float*)&(vec) + (i)))); } inline void SIMD_print(const __m512d &vec) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < 8; i++) printf("%f ",(*((double*)&(vec) + (i)))); } @@ -801,280 +801,280 @@ namespace ip_simd { // ---------- LAMMPS operations #ifndef SW_GATHER_TEST inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z) { - x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, - _MM_SCALE_1); + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z) 
{ + x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, + _MM_SCALE_1); y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1, - _MM_SCALE_1); + _MM_SCALE_1); z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2, - _MM_SCALE_1); + _MM_SCALE_1); } inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z, SIMD_int &type) { - x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, - _MM_SCALE_1); + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z, SIMD_int &type) { + x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, + _MM_SCALE_1); y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1, - _MM_SCALE_1); + _MM_SCALE_1); z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2, - _MM_SCALE_1); + _MM_SCALE_1); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, - _MM_SCALE_1); + _MM_SCALE_1); } #endif inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, - const SIMD_int &i, SIMD_double &x, - SIMD_double &y, SIMD_double &z) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); + const SIMD_int &i, SIMD_double &x, + SIMD_double &y, SIMD_double &z) { + x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, + _MM_SCALE_2); y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); + _MM_SCALE_2); z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + _MM_SCALE_2); } inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, - const SIMD_int &i, SIMD_double &x, - SIMD_double &y, SIMD_double &z, SIMD_int &type) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); + const SIMD_int &i, SIMD_double &x, + SIMD_double &y, SIMD_double &z, SIMD_int &type) { + x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, + _MM_SCALE_2); y 
= _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); + _MM_SCALE_2); z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + _MM_SCALE_2); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, - _MM_SCALE_2); + _MM_SCALE_2); } - inline SIMD_float SIMD_ev_add(const SIMD_float &one, - const SIMD_float &two) { + inline SIMD_float SIMD_ev_add(const SIMD_float &one, + const SIMD_float &two) { return _mm512_add_ps(one,two); } - inline SIMD_double SIMD_ev_add(const SIMD_double &one, - const SIMD_double &two) { + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_double &two) { return _mm512_add_pd(one,two); } - inline SIMD_double SIMD_ev_add(const SIMD_double &one, - const SIMD_float &two) { + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_float &two) { SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two)); SIMD_double ans = _mm512_add_pd(one,twod); twod = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(two,two,238))); + _mm512_shuffle_f32x4(two,two,238))); return _mm512_add_pd(ans,twod); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, - const SIMD_int &joffset, SIMD_float &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, + const SIMD_int &joffset, SIMD_float &eng) { SIMD_float jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, - force, _MM_SCALE_1); + jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, + force, _MM_SCALE_1); jeng = jeng + eng; _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_double &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_double &eng) { SIMD_double jeng; 
SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + eng; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_float &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &eng) { SIMD_double engd, jeng; engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); SIMD_conflict_pi_reduce1(rmask, joffset, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + engd; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; engd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(eng,eng,238))); + _mm512_shuffle_f32x4(eng,eng,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); jeng = jeng + engd; _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); } - inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, - const SIMD_int &joffset1, SIMD_float &eng) { + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, + const SIMD_int &joffset1, SIMD_float &eng) { } - inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, - const SIMD_int &joffset1, SIMD_double &eng) { + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, + const SIMD_int 
&joffset1, SIMD_double &eng) { SIMD_mask rmask = mask >> 8; SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238); SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + eng; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, - const SIMD_int &i, SIMD_float &fx, - SIMD_float &fy, SIMD_float &fz) { + const SIMD_int &i, SIMD_float &fx, + SIMD_float &fy, SIMD_float &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_float jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc + fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc + fy; _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); } inline void SIMD_safe_jforce(const SIMD_mask &m, double *force, - const SIMD_int &i, SIMD_double &fx, - SIMD_double &fy, SIMD_double &fz) { + const SIMD_int &i, SIMD_double &fx, + SIMD_double &fy, SIMD_double &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc + fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = 
_mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); } - inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_float &amx, - SIMD_float &amy, SIMD_float &amz) { + inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &amx, + SIMD_float &amy, SIMD_float &amz) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy)); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jfrc = jfrc + amxd; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 1, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 2, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - 
_mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amz,amz,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); jfrc = jfrc + amxd; _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 1, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 2, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &m, float *force, - const SIMD_int &i, const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz) { + const SIMD_int &i, const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { SIMD_float jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc - fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc - fy; 
_mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc - fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); } template inline void SIMD_scalar_update(const int jj, const int* ejnum, ft *force, - const int* i, const double *fx, - const double *fy, const double *fz, - const double *fx2, const double *fy2, - const double *fz2) { + const int* i, const double *fx, + const double *fy, const double *fz, + const double *fx2, const double *fy2, + const double *fz2) { #pragma novector for (int k=0; k<8; k++) { if (jj < ejnum[k]) { - const int j = i[k]; - force[j].x -= fx[k]; - force[j].y -= fy[k]; - force[j].z -= fz[k]; + const int j = i[k]; + force[j].x -= fx[k]; + force[j].y -= fy[k]; + force[j].z -= fz[k]; } } - + #pragma novector for (int k=8; k<16; k++) { if (jj < ejnum[k]) { - const int j = i[k]; - force[j].x -= fx2[k-8]; - force[j].y -= fy2[k-8]; - force[j].z -= fz2[k-8]; + const int j = i[k]; + force[j].x -= fx2[k-8]; + force[j].y -= fy2[k-8]; + force[j].z -= fz2[k-8]; } } } inline void SIMD_jforce_update(const SIMD_mask &m, double *force, - const SIMD_int &i, const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz) { + const SIMD_int &i, const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc - fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc - fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - 
_MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc - fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); } - inline void SIMD_jforce_update(const SIMD_mask &rmask, + inline void SIMD_jforce_update(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_float &amx, - SIMD_float &amy, SIMD_float &amz) { + SIMD_float &amy, SIMD_float &amz) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy)); @@ -1084,7 +1084,7 @@ namespace ip_simd { SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1095,8 +1095,8 @@ namespace ip_simd { } inline void SIMD_cache3(float *pr, const int offset, - const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz) { + const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { float *p = pr; SIMD_float t; t = SIMD_load(p); @@ -1113,8 +1113,8 @@ namespace ip_simd { } inline void SIMD_cache3(double *pr, const int offset, - const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz) { + const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz) { double *p = pr; SIMD_double t; t = SIMD_load(p); @@ -1131,8 +1131,8 @@ namespace ip_simd { } inline void SIMD_cache3(double *pr, const int foffset, - const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz) { + const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { const int offset = foffset >> 1; double *p = pr; SIMD_double t, fd; @@ -1142,7 +1142,7 @@ namespace ip_simd { t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fx,fx,238))); + _mm512_shuffle_f32x4(fx,fx,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1154,7 +1154,7 @@ namespace ip_simd { 
t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fy,fy,238))); + _mm512_shuffle_f32x4(fy,fy,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1166,7 +1166,7 @@ namespace ip_simd { t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fz,fz,238))); + _mm512_shuffle_f32x4(fz,fz,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1174,15 +1174,15 @@ namespace ip_simd { } inline void SIMD_cache3(float *pr, const int offset, - const SIMD_float &fx, const SIMD_float &fy, - const SIMD_float &fz, const SIMD_float &fx2, - const SIMD_float &fy2, const SIMD_float &fz2) { + const SIMD_float &fx, const SIMD_float &fy, + const SIMD_float &fz, const SIMD_float &fx2, + const SIMD_float &fy2, const SIMD_float &fz2) { } inline void SIMD_cache3(double *pr, const int foffset, - const SIMD_double &fx, const SIMD_double &fy, - const SIMD_double &fz, const SIMD_double &fx2, - const SIMD_double &fy2, const SIMD_double &fz2) { + const SIMD_double &fx, const SIMD_double &fy, + const SIMD_double &fz, const SIMD_double &fx2, + const SIMD_double &fy2, const SIMD_double &fz2) { const int offset = foffset >> 1; double *p = pr; SIMD_double t; @@ -1214,14 +1214,14 @@ namespace ip_simd { SIMD_store(p,t); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - SIMD_float &fxtmp, SIMD_float &fytmp, - SIMD_float &fztmp, SIMD_float &fjxtmp, - SIMD_float &fjytmp, SIMD_float &fjztmp, - SIMD_float &fxtmp2, SIMD_float &fytmp2, - SIMD_float &fztmp2, SIMD_float &fjxtmp2, - SIMD_float &fjytmp2, SIMD_float &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + SIMD_float &fxtmp, SIMD_float &fytmp, + SIMD_float &fztmp, SIMD_float &fjxtmp, + SIMD_float &fjytmp, SIMD_float &fjztmp, + SIMD_float &fxtmp2, SIMD_float &fytmp2, + SIMD_float &fztmp2, 
SIMD_float &fjxtmp2, + SIMD_float &fjytmp2, SIMD_float &fjztmp2) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy); @@ -1230,14 +1230,14 @@ namespace ip_simd { fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx, - const SIMD_double &fjy, const SIMD_double &fjz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx, + const SIMD_double &fjy, const SIMD_double &fjz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy); @@ -1246,20 +1246,20 @@ namespace ip_simd { fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double 
&fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2) { SIMD_mask kmask2 = kmask >> 8; SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx)); fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjx,fjx,238))); + _mm512_shuffle_f32x4(fjx,fjx,238))); fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd); fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd); @@ -1267,7 +1267,7 @@ namespace ip_simd { fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd); fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjy,fjy,238))); + _mm512_shuffle_f32x4(fjy,fjy,238))); fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd); fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd); @@ -1275,22 +1275,22 @@ namespace ip_simd { fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd); fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjz,fjz,238))); + _mm512_shuffle_f32x4(fjz,fjz,238))); fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd); fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - const SIMD_float &fkx, const SIMD_float &fky, - const SIMD_float &fkz, - SIMD_float &fxtmp, SIMD_float &fytmp, - SIMD_float &fztmp, SIMD_float &fjxtmp, - SIMD_float &fjytmp, SIMD_float &fjztmp, - SIMD_float &fxtmp2, SIMD_float &fytmp2, - SIMD_float &fztmp2, SIMD_float &fjxtmp2, - SIMD_float &fjytmp2, SIMD_float &fjztmp2, - float *pr, const int offset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + const SIMD_float &fkx, const SIMD_float &fky, + const SIMD_float &fkz, + SIMD_float &fxtmp, SIMD_float &fytmp, + SIMD_float 
&fztmp, SIMD_float &fjxtmp, + SIMD_float &fjytmp, SIMD_float &fjztmp, + SIMD_float &fxtmp2, SIMD_float &fytmp2, + SIMD_float &fztmp2, SIMD_float &fjxtmp2, + SIMD_float &fjytmp2, SIMD_float &fjztmp2, + float *pr, const int offset) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky); @@ -1312,17 +1312,17 @@ namespace ip_simd { SIMD_store(p, t); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx, - const SIMD_double &fjy, const SIMD_double &fjz, - const SIMD_double &fkx, const SIMD_double &fky, - const SIMD_double &fkz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2, - double *pr, const int offset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx, + const SIMD_double &fjy, const SIMD_double &fjz, + const SIMD_double &fkx, const SIMD_double &fky, + const SIMD_double &fkz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2, + double *pr, const int offset) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky); @@ -1344,17 +1344,17 @@ namespace ip_simd { SIMD_store(p, t); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - const SIMD_float &fkx, const SIMD_float &fky, - const SIMD_float &fkz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double 
&fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2, - double *pr, const int foffset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + const SIMD_float &fkx, const SIMD_float &fky, + const SIMD_float &fkz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2, + double *pr, const int foffset) { SIMD_mask kmask2 = kmask >> 8; const int offset = foffset >> 1; double *p = pr; @@ -1368,9 +1368,9 @@ namespace ip_simd { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd - delfdk); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjx,fjx,238))); + _mm512_shuffle_f32x4(fjx,fjx,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fkx,fkx,238))); + _mm512_shuffle_f32x4(fkx,fkx,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1387,9 +1387,9 @@ namespace ip_simd { fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd - delfdk); fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjy,fjy,238))); + _mm512_shuffle_f32x4(fjy,fjy,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fky,fky,238))); + _mm512_shuffle_f32x4(fky,fky,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1406,9 +1406,9 @@ namespace ip_simd { fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd - delfdk); fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjz,fjz,238))); + _mm512_shuffle_f32x4(fjz,fjz,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fkz,fkz,238))); + 
_mm512_shuffle_f32x4(fkz,fkz,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1417,11 +1417,11 @@ namespace ip_simd { fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd); } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_float &evdwl, const int eatom, - SIMD_float &sevdwl, SIMD_float &fwtmp, - SIMD_float &fjtmp, SIMD_float &fwtmp2, - SIMD_float &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_float &evdwl, const int eatom, + SIMD_float &sevdwl, SIMD_float &fwtmp, + SIMD_float &fjtmp, SIMD_float &fwtmp2, + SIMD_float &fjtmp2) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl); if (eatom) { const SIMD_float hevdwl = evdwl * (float)0.5; @@ -1430,11 +1430,11 @@ namespace ip_simd { } } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_double &evdwl, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_double &evdwl, const int eatom, + SIMD_double &sevdwl, SIMD_double &fwtmp, + SIMD_double &fjtmp, SIMD_double &fwtmp2, + SIMD_double &fjtmp2) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl); if (eatom) { const SIMD_double hevdwl = evdwl * (double)0.5; @@ -1443,11 +1443,11 @@ namespace ip_simd { } } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_float &evdwl, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_float &evdwl, const int eatom, + SIMD_double &sevdwl, SIMD_double &fwtmp, + SIMD_double &fjtmp, SIMD_double &fwtmp2, + SIMD_double &fjtmp2) { SIMD_double evdwld; evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(evdwl)); sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwld); @@ -1458,7 +1458,7 @@ namespace ip_simd { } SIMD_mask hmask2 = hmask >> 8; evdwld = 
_mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(evdwl,evdwl,238))); + _mm512_shuffle_f32x4(evdwl,evdwl,238))); sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, evdwld); if (eatom) { const SIMD_double hevdwl = evdwld * (double)0.5; @@ -1467,48 +1467,48 @@ namespace ip_simd { } } - inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, - const int eatom, SIMD_float &sevdwl, - SIMD_float &fwtmp, SIMD_float &fjtmp, - SIMD_float &fwtmp2, SIMD_float &fjtmp2, - const SIMD_int &k, float *force) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_float &sevdwl, + SIMD_float &fwtmp, SIMD_float &fjtmp, + SIMD_float &fwtmp2, SIMD_float &fjtmp2, + const SIMD_int &k, float *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, - k, force + 3, _MM_SCALE_1); + SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, + k, force + 3, _MM_SCALE_1); keng = keng + hevdwl; _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1); } } inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad, - const int eatom, SIMD_double &sevdwl, - SIMD_double &fwtmp, SIMD_double &fjtmp, - SIMD_double &fwtmp2, SIMD_double &fjtmp2, - const SIMD_int &k, double *force) { + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - 
SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } } - inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, - const int eatom, SIMD_double &sevdwl, - SIMD_double &fwtmp, SIMD_double &fjtmp, - SIMD_double &fwtmp2, SIMD_double &fjtmp2, - const SIMD_int &k, double *force) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { SIMD_double facradd; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad)); sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd); @@ -1517,15 +1517,15 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(facrad,facrad,238))); + _mm512_shuffle_f32x4(facrad,facrad,238))); sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, facradd); if (eatom) { SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333); @@ -1533,20 +1533,20 @@ namespace ip_simd { fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask2, k2, 
force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask2, k2, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const float ev_pre, - const SIMD_float &fpair, const SIMD_float &delx, - const SIMD_float &dely, const SIMD_float &delz, - SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, - SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const float ev_pre, + const SIMD_float &fpair, const SIMD_float &delx, + const SIMD_float &dely, const SIMD_float &delz, + SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, + SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { if (vflag == 1) { const SIMD_float prefpair = SIMD_set(ev_pre) * fpair; sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair); @@ -1558,12 +1558,12 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const double ev_pre, - const SIMD_double &fpair, const SIMD_double &delx, - const SIMD_double &dely, const SIMD_double &delz, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const double ev_pre, + const SIMD_double &fpair, const SIMD_double &delx, + const SIMD_double &dely, const SIMD_double &delz, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_double prefpair = SIMD_set(ev_pre) * fpair; sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair); @@ -1575,12 +1575,12 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const float ev_pre, - const SIMD_float &fpair, const SIMD_float &delx, - const SIMD_float &dely, const SIMD_float 
&delz, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const float ev_pre, + const SIMD_float &fpair, const SIMD_float &delx, + const SIMD_float &dely, const SIMD_float &delz, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_mask m2 = m >> 8; const SIMD_float prefpair = SIMD_set(ev_pre) * fpair; @@ -1588,55 +1588,55 @@ namespace ip_simd { SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv0 = SIMD_add(sv0, m, sv0, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv0 = SIMD_add(sv0, m2, sv0, dpaird); dpair = dely * dely * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv1 = SIMD_add(sv1, m, sv1, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv1 = SIMD_add(sv1, m2, sv1, dpaird); dpair = delz * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv2 = SIMD_add(sv2, m, sv2, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv2 = SIMD_add(sv2, m2, sv2, dpaird); dpair = delx * dely * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv3 = SIMD_add(sv3, m, sv3, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv3 = SIMD_add(sv3, m2, sv3, dpaird); dpair = delx * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv4 = SIMD_add(sv4, m, sv4, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + 
_mm512_shuffle_f32x4(dpair,dpair,238))); sv4 = SIMD_add(sv4, m2, sv4, dpaird); dpair = dely * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv5 = SIMD_add(sv5, m, sv5, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv5 = SIMD_add(sv5, m2, sv5, dpaird); } } - inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_float &fj0, const SIMD_float &fj1, - const SIMD_float &fj2, const SIMD_float &fk0, - const SIMD_float &fk1, const SIMD_float &fk2, + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_float &fj0, const SIMD_float &fj1, + const SIMD_float &fj2, const SIMD_float &fk0, + const SIMD_float &fk1, const SIMD_float &fk2, const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz, const SIMD_float &delr2x, const SIMD_float &delr2y, const SIMD_float &delr2z, - SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, - SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { + SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, + SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { if (vflag == 1) { sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0); sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1); @@ -1647,15 +1647,15 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_double &fj0, const SIMD_double &fj1, - const SIMD_double &fj2, const SIMD_double &fk0, - const SIMD_double &fk1, const SIMD_double &fk2, - const SIMD_double &delx, const SIMD_double &dely, - const SIMD_double &delz, const SIMD_double &delr2x, - const SIMD_double &delr2y, const SIMD_double &delr2z, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_double &fj0, const SIMD_double &fj1, + const SIMD_double 
&fj2, const SIMD_double &fk0, + const SIMD_double &fk1, const SIMD_double &fk2, + const SIMD_double &delx, const SIMD_double &dely, + const SIMD_double &delz, const SIMD_double &delr2x, + const SIMD_double &delr2y, const SIMD_double &delr2z, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0); sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1); @@ -1666,62 +1666,62 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_float &fj0, const SIMD_float &fj1, - const SIMD_float &fj2, const SIMD_float &fk0, - const SIMD_float &fk1, const SIMD_float &fk2, + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_float &fj0, const SIMD_float &fj1, + const SIMD_float &fj2, const SIMD_float &fk0, + const SIMD_float &fk1, const SIMD_float &fk2, const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz, const SIMD_float &delr2x, const SIMD_float &delr2y, const SIMD_float &delr2z, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_mask m2 = m >> 8; SIMD_float dpair = delx * fj0 + delr2x * fk0; SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv0 = SIMD_add(sv0, m, sv0, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv0 = SIMD_add(sv0, m2, sv0, dpaird); dpair = dely * fj1 + delr2y * fk1; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv1 = SIMD_add(sv1, m, sv1, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv1 = SIMD_add(sv1, 
m2, sv1, dpaird); dpair = delz * fj2 + delr2z * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv2 = SIMD_add(sv2, m, sv2, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv2 = SIMD_add(sv2, m2, sv2, dpaird); dpair = delx * fj1 + delr2x * fk1; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv3 = SIMD_add(sv3, m, sv3, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv3 = SIMD_add(sv3, m2, sv3, dpaird); dpair = delx * fj2 + delr2x * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv4 = SIMD_add(sv4, m, sv4, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv4 = SIMD_add(sv4, m2, sv4, dpaird); dpair = dely * fj2 + delr2y * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv5 = SIMD_add(sv5, m, sv5, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv5 = SIMD_add(sv5, m2, sv5, dpaird); } } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, float *force, const SIMD_int &joffset, SIMD_float &amx, SIMD_float &amy, SIMD_float &amz, SIMD_float &fxtmp, SIMD_float &fytmp, SIMD_float &fztmp, SIMD_float &fxtmp2, @@ -1733,10 +1733,10 @@ namespace ip_simd { SIMD_jforce_update(rmask, force, joffset, amx, amy, amz); } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_double &amx, SIMD_double &amy, SIMD_double &amz, SIMD_double &fxtmp, - SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, + SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double 
&fxtmp2, SIMD_double &fytmp2, SIMD_double &fztmp2) { fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx); fytmp = SIMD_add(fytmp, rmask, fytmp, amy); @@ -1745,10 +1745,10 @@ namespace ip_simd { SIMD_jforce_update(rmask, force, joffset, amx, amy, amz); } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_float &amx, SIMD_float &amy, SIMD_float &amz, SIMD_double &fxtmp, - SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, + SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, SIMD_double &fytmp2, SIMD_double &fztmp2) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); @@ -1762,7 +1762,7 @@ namespace ip_simd { SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); fxtmp2 = SIMD_add(fxtmp2, rmask2, fxtmp2, amxd); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); @@ -1776,57 +1776,57 @@ namespace ip_simd { } inline void SIMD_iforce_update(const SIMD_mask &m, float *force, - const SIMD_int &i, const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz, - const int EVFLAG, const int eatom, - const SIMD_float &fwtmp) { + const SIMD_int &i, const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz, + const int EFLAG, const int eatom, + const SIMD_float &fwtmp) { SIMD_float jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc + fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc + fy; _mm512_mask_i32scatter_ps(force+1, m, i, 
jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); - if (EVFLAG) { + if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, - _MM_SCALE_1); - jfrc = jfrc + fwtmp; - _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, + _MM_SCALE_1); + jfrc = jfrc + fwtmp; + _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1); } } } inline void SIMD_iforce_update(const SIMD_mask &m, double *force, - const SIMD_int &i, const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz, - const int EVFLAG, const int eatom, - const SIMD_double &fwtmp) { + const SIMD_int &i, const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz, + const int EFLAG, const int eatom, + const SIMD_double &fwtmp) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc + fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); - if (EVFLAG) { + if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, - force + 3, _MM_SCALE_2); - jfrc = jfrc + fwtmp; - _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, 
i, + force + 3, _MM_SCALE_2); + jfrc = jfrc + fwtmp; + _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); } } } @@ -1834,8 +1834,8 @@ namespace ip_simd { #ifdef SW_GATHER_TEST template inline void SIMD_atom_gather(const SIMD_mask &m, const atom_t *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z, SIMD_int &type) { + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z, SIMD_int &type) { int jv_scalar[16] __attribute__((aligned(64))); int jm_scalar[16] __attribute__((aligned(64))); _mm512_store_epi32(jv_scalar, i); @@ -1846,65 +1846,65 @@ namespace ip_simd { pl1 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[1]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[2]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[3]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[4]; pl2 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[5]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[6]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[7]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[8]; pl3 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[9]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[10]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[11]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[12]; pl4 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[13]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js 
= jv_scalar[14]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[15]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + SIMD_int c0 = _mm512_setr_epi32(0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c, - 0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d); + 0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d); SIMD_int c1 = _mm512_setr_epi32(0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d, - 0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c); + 0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c); SIMD_int c2 = _mm512_setr_epi32(0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e, - 0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f); + 0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f); SIMD_int c3 = _mm512_setr_epi32(0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f, - 0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e); + 0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e); SIMD_mask k_1 = _mm512_int2mask(65280); SIMD_float sl1 = _mm512_permutex2var_ps(pl3, c0, pl4); SIMD_float sl2 = _mm512_permutex2var_ps(pl1, c1, pl2); SIMD_float sl3 = _mm512_permutex2var_ps(pl3, c2, pl4); SIMD_float sl4 = _mm512_permutex2var_ps(pl1, c3, pl2); - + x = _mm512_shuffle_f32x4(sl2, sl1, 78); z = _mm512_shuffle_f32x4(sl4, sl3, 78); y = _mm512_mask_blend_ps(k_1, sl2, sl1); diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h index 403b74d8fe..547fadb6e9 100644 --- a/src/USER-INTEL/math_extra_intel.h +++ b/src/USER-INTEL/math_extra_intel.h @@ -18,110 +18,110 @@ #ifndef LMP_MATH_EXTRA_INTEL_H #define LMP_MATH_EXTRA_INTEL_H -#define ME_quat_to_mat_trans(quat, mat) \ -{ \ - flt_t quat_w = quat.w; \ - flt_t quat_i = quat.i; \ - flt_t quat_j = quat.j; \ - flt_t quat_k = quat.k; \ - flt_t w2 = quat_w * quat_w; \ - flt_t i2 = quat_i * quat_i; \ - flt_t j2 = quat_j * quat_j; \ - flt_t k2 = quat_k * quat_k; \ - flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \ - flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \ - flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \ - flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \ - flt_t twojw = (flt_t)2.0 * quat_j 
* quat_w; \ - flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \ - \ - mat##_0 = w2 + i2 - j2 - k2; \ - mat##_3 = twoij - twokw; \ - mat##_6 = twojw + twoik; \ - \ - mat##_1 = twoij + twokw; \ - mat##_4 = w2 - i2 + j2 - k2; \ - mat##_7 = twojk - twoiw; \ - \ - mat##_2 = twoik - twojw; \ - mat##_5 = twojk + twoiw; \ - mat##_8 = w2 - i2 - j2 + k2; \ +#define ME_quat_to_mat_trans(quat, mat) \ +{ \ + flt_t quat_w = quat.w; \ + flt_t quat_i = quat.i; \ + flt_t quat_j = quat.j; \ + flt_t quat_k = quat.k; \ + flt_t w2 = quat_w * quat_w; \ + flt_t i2 = quat_i * quat_i; \ + flt_t j2 = quat_j * quat_j; \ + flt_t k2 = quat_k * quat_k; \ + flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \ + flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \ + flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \ + flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \ + flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \ + flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \ + \ + mat##_0 = w2 + i2 - j2 - k2; \ + mat##_3 = twoij - twokw; \ + mat##_6 = twojw + twoik; \ + \ + mat##_1 = twoij + twokw; \ + mat##_4 = w2 - i2 + j2 - k2; \ + mat##_7 = twojk - twoiw; \ + \ + mat##_2 = twoik - twojw; \ + mat##_5 = twojk + twoiw; \ + mat##_8 = w2 - i2 - j2 + k2; \ } /* ---------------------------------------------------------------------- diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -#define ME_diag_times3(d, m, ans) \ - { \ - ans##_0 = d[0] * m##_0; \ - ans##_1 = d[0] * m##_1; \ - ans##_2 = d[0] * m##_2; \ - ans##_3 = d[1] * m##_3; \ - ans##_4 = d[1] * m##_4; \ - ans##_5 = d[1] * m##_5; \ - ans##_6 = d[2] * m##_6; \ - ans##_7 = d[2] * m##_7; \ - ans##_8 = d[2] * m##_8; \ +#define ME_diag_times3(d, m, ans) \ + { \ + ans##_0 = d[0] * m##_0; \ + ans##_1 = d[0] * m##_1; \ + ans##_2 = d[0] * m##_2; \ + ans##_3 = d[1] * m##_3; \ + ans##_4 = d[1] * m##_4; \ + ans##_5 = d[1] * m##_5; \ + ans##_6 = d[2] * m##_6; \ + ans##_7 = d[2] * m##_7; \ + ans##_8 = d[2] * m##_8; \ } -#define 
ME_diag_times3a(d, m, ans) \ - { \ - ans##_0 = d##_0 * m##_0; \ - ans##_1 = d##_0 * m##_1; \ - ans##_2 = d##_0 * m##_2; \ - ans##_3 = d##_1 * m##_3; \ - ans##_4 = d##_1 * m##_4; \ - ans##_5 = d##_1 * m##_5; \ - ans##_6 = d##_2 * m##_6; \ - ans##_7 = d##_2 * m##_7; \ - ans##_8 = d##_2 * m##_8; \ +#define ME_diag_times3a(d, m, ans) \ + { \ + ans##_0 = d##_0 * m##_0; \ + ans##_1 = d##_0 * m##_1; \ + ans##_2 = d##_0 * m##_2; \ + ans##_3 = d##_1 * m##_3; \ + ans##_4 = d##_1 * m##_4; \ + ans##_5 = d##_1 * m##_5; \ + ans##_6 = d##_2 * m##_6; \ + ans##_7 = d##_2 * m##_7; \ + ans##_8 = d##_2 * m##_8; \ } /* ---------------------------------------------------------------------- multiply the transpose of mat1 times mat2 ------------------------------------------------------------------------- */ -#define ME_transpose_times3(m1, m2, ans) \ -{ \ - ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \ - ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \ - ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \ - ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \ - ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \ - ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \ - ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \ - ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \ - ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \ +#define ME_transpose_times3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \ + ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \ + ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \ + ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \ + ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \ + ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \ + ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \ + ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \ + ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \ } /* 
---------------------------------------------------------------------- normalize a vector, return in ans ------------------------------------------------------------------------- */ -#define ME_normalize3(v0, v1, v2, ans) \ -{ \ - flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \ - ans##_0 = v0 * scale; \ - ans##_1 = v1 * scale; \ - ans##_2 = v2 * scale; \ +#define ME_normalize3(v0, v1, v2, ans) \ +{ \ + flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \ + ans##_0 = v0 * scale; \ + ans##_1 = v1 * scale; \ + ans##_2 = v2 * scale; \ } /* ---------------------------------------------------------------------- add two matrices ------------------------------------------------------------------------- */ -#define ME_plus3(m1, m2, ans) \ -{ \ - ans##_0 = m1##_0 + m2##_0; \ - ans##_1 = m1##_1 + m2##_1; \ - ans##_2 = m1##_2 + m2##_2; \ - ans##_3 = m1##_3 + m2##_3; \ - ans##_4 = m1##_4 + m2##_4; \ - ans##_5 = m1##_5 + m2##_5; \ - ans##_6 = m1##_6 + m2##_6; \ - ans##_7 = m1##_7 + m2##_7; \ - ans##_8 = m1##_8 + m2##_8; \ +#define ME_plus3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0 + m2##_0; \ + ans##_1 = m1##_1 + m2##_1; \ + ans##_2 = m1##_2 + m2##_2; \ + ans##_3 = m1##_3 + m2##_3; \ + ans##_4 = m1##_4 + m2##_4; \ + ans##_5 = m1##_5 + m2##_5; \ + ans##_6 = m1##_6 + m2##_6; \ + ans##_7 = m1##_7 + m2##_7; \ + ans##_8 = m1##_8 + m2##_8; \ } /* ---------------------------------------------------------------------- @@ -135,7 +135,7 @@ determinant of a matrix ------------------------------------------------------------------------- */ -#define ME_det3(m) \ +#define ME_det3(m) \ ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \ m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \ m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 ) @@ -144,8 +144,8 @@ row vector times matrix ------------------------------------------------------------------------- */ -#define ME_vecmat(v, m, ans) \ -{ \ +#define ME_vecmat(v, m, ans) \ +{ \ ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \ 
ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \ ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \ @@ -155,214 +155,214 @@ cross product of 2 vectors ------------------------------------------------------------------------- */ -#define ME_cross3(v1, v2, ans) \ -{ \ - ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \ - ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \ - ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \ +#define ME_cross3(v1, v2, ans) \ +{ \ + ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \ + ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \ + ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \ } /* ---------------------------------------------------------------------- cross product of 2 vectors ------------------------------------------------------------------------- */ -#define ME_mv0_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \ - ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \ - ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \ +#define ME_mv0_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \ + ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \ + ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \ } -#define ME_mv1_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \ - ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \ - ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \ +#define ME_mv1_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \ + ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \ + ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \ } -#define ME_mv2_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \ - ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \ - ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \ +#define ME_mv2_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \ + ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \ + ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \ } #define ME_compute_eta_torque(m1, m2, s1, ans) \ -{ \ - 
flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ - m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ - m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ - den = (flt_t)1.0 / den; \ - \ +{ \ + flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ + m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ + m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ + den = (flt_t)1.0 / den; \ + \ ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \ - m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ - m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ - m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ - m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ - \ - ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ - (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ - (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ - m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ - m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ - \ + m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ + m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ + m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ + \ + ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ + (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ + (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ + m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ + m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ + \ ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \ - m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ - m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ - (flt_t)2.0*m1##_4*m1##_0*m2##_2- \ - (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ - m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ - \ + m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ + m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_2- \ + (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ + m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ + \ ans##_3 = 
s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \ - m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ - m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ - m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ - m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ - \ - ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ - (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ - (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ - m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ - m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ - \ - ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ - m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ - (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ - m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ - (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ - den; \ - \ - ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ - (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ - m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ - m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ - m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ - \ - ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ - (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ - (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ - m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ - m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ - \ - ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ - m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ - m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ - (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ + m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ + m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ + m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ + \ + ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ + (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ + 
(flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ + m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ + m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ + \ + ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ + m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ + m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ + (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ + den; \ + \ + ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ + (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ + m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ + m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ + m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ + \ + ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ + (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ + (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ + m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ + m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ + \ + ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ + m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ + m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ + (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \ - den; \ + den; \ } -#define ME_vcopy4(dst,src) \ - dst##_0 = src##_0; \ - dst##_1 = src##_1; \ - dst##_2 = src##_2; \ +#define ME_vcopy4(dst,src) \ + dst##_0 = src##_0; \ + dst##_1 = src##_1; \ + dst##_2 = src##_2; \ dst##_3 = src##_3; -#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \ -{ \ - flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \ - flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \ - \ - aug_3 = v_0; \ - aug_0 = m1##_0; \ - aug_1 = m1##_1; \ - aug_2 = m1##_2; \ - aug_7 = v_1; \ - aug_4 = m1##_3; \ - aug_5 = m1##_4; \ - aug_6 = m1##_5; \ - aug_11 = v_2; \ - aug_8 = m1##_6; \ - aug_9 = m1##_7; \ - aug_10 = m1##_8; \ - \ - if (fabs(aug_4) > fabs(aug_0)) { \ - flt_t swapt; \ - 
swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ - swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ - swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ - swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ - } \ - if (fabs(aug_8) > fabs(aug_0)) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ +#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \ +{ \ + flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \ + flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \ + \ + aug_3 = v_0; \ + aug_0 = m1##_0; \ + aug_1 = m1##_1; \ + aug_2 = m1##_2; \ + aug_7 = v_1; \ + aug_4 = m1##_3; \ + aug_5 = m1##_4; \ + aug_6 = m1##_5; \ + aug_11 = v_2; \ + aug_8 = m1##_6; \ + aug_9 = m1##_7; \ + aug_10 = m1##_8; \ + \ + if (fabs(aug_4) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } \ + if (fabs(aug_8) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ - } \ - \ - if (aug_0 != (flt_t)0.0) { \ - } else if (aug_4 != (flt_t)0.0) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ - swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ - swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ - swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ - } else if (aug_8 != (flt_t)0.0) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ - swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ - swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ - swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ - } else \ - error = 1; \ - \ - t = aug_4 / aug_0; \ - aug_5 -= t * aug_1; \ - aug_6 -= t * aug_2; \ - aug_7 -= t * aug_3; \ - t = aug_8 / aug_0; \ - aug_9 -= t * aug_1; \ - aug_10 -= t * aug_2; \ - aug_11 -= t * aug_3; \ - \ - if 
(fabs(aug_9) > fabs(aug_5)) { \ - flt_t swapt; \ - swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ - swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ - swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ - swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ - } \ - \ - if (aug_5 != (flt_t)0.0) { \ - } else if (aug_9 != (flt_t)0.0) { \ - flt_t swapt; \ + } \ + \ + if (aug_0 != (flt_t)0.0) { \ + } else if (aug_4 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } else if (aug_8 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ + swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ + swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ + swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ + } else \ + error = 1; \ + \ + t = aug_4 / aug_0; \ + aug_5 -= t * aug_1; \ + aug_6 -= t * aug_2; \ + aug_7 -= t * aug_3; \ + t = aug_8 / aug_0; \ + aug_9 -= t * aug_1; \ + aug_10 -= t * aug_2; \ + aug_11 -= t * aug_3; \ + \ + if (fabs(aug_9) > fabs(aug_5)) { \ + flt_t swapt; \ swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ - swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ - swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ - swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ - } \ - \ - t = aug_9 / aug_5; \ - aug_10 -= t * aug_6; \ - aug_11 -= t * aug_7; \ - \ - if (aug_10 == (flt_t)0.0) \ - error = 1; \ - \ - ans##_2 = aug_11/aug_10; \ - t = (flt_t)0.0; \ - t += aug_6 * ans##_2; \ - ans##_1 = (aug_7-t) / aug_5; \ - t = (flt_t)0.0; \ - t += aug_1 * ans##_1; \ - t += aug_2 * ans##_2; \ - ans##_0 = (aug_3 - t) / aug_0; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + if (aug_5 != (flt_t)0.0) { \ + } else if (aug_9 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_4; aug_4 = aug_8; 
aug_8 = swapt; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + t = aug_9 / aug_5; \ + aug_10 -= t * aug_6; \ + aug_11 -= t * aug_7; \ + \ + if (aug_10 == (flt_t)0.0) \ + error = 1; \ + \ + ans##_2 = aug_11/aug_10; \ + t = (flt_t)0.0; \ + t += aug_6 * ans##_2; \ + ans##_1 = (aug_7-t) / aug_5; \ + t = (flt_t)0.0; \ + t += aug_1 * ans##_1; \ + t += aug_2 * ans##_2; \ + ans##_0 = (aug_3 - t) / aug_0; \ } /* ---------------------------------------------------------------------- normalize a quaternion ------------------------------------------------------------------------- */ -#define ME_qnormalize(q) \ -{ \ - double norm = 1.0 / \ - sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ - q##_w *= norm; \ - q##_i *= norm; \ - q##_j *= norm; \ - q##_k *= norm; \ +#define ME_qnormalize(q) \ +{ \ + double norm = 1.0 / \ + sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ + q##_w *= norm; \ + q##_i *= norm; \ + q##_j *= norm; \ + q##_k *= norm; \ } /* ---------------------------------------------------------------------- @@ -373,106 +373,106 @@ and divide by principal moments ------------------------------------------------------------------------- */ -#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ -{ \ - double wbody_0, wbody_1, wbody_2; \ - double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ - \ - double w2 = quat##_w * quat##_w; \ - double i2 = quat##_i * quat##_i; \ - double j2 = quat##_j * quat##_j; \ - double k2 = quat##_k * quat##_k; \ - double twoij = 2.0 * quat##_i * quat##_j; \ - double twoik = 2.0 * quat##_i * quat##_k; \ - double twojk = 2.0 * quat##_j * quat##_k; \ - double twoiw = 2.0 * quat##_i * quat##_w; \ - double twojw = 2.0 * quat##_j * quat##_w; \ - double twokw = 2.0 * quat##_k * quat##_w; \ - \ - rot##_0 = w2 + i2 - j2 - k2; \ - rot##_1 = twoij - twokw; \ - rot##_2 = twojw 
+ twoik; \ - \ - rot##_3 = twoij + twokw; \ - rot##_4 = w2 - i2 + j2 - k2; \ - rot##_5 = twojk - twoiw; \ - \ - rot##_6 = twoik - twojw; \ - rot##_7 = twojk + twoiw; \ - rot##_8 = w2 - i2 - j2 + k2; \ - \ +#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ +{ \ + double wbody_0, wbody_1, wbody_2; \ + double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ + \ + double w2 = quat##_w * quat##_w; \ + double i2 = quat##_i * quat##_i; \ + double j2 = quat##_j * quat##_j; \ + double k2 = quat##_k * quat##_k; \ + double twoij = 2.0 * quat##_i * quat##_j; \ + double twoik = 2.0 * quat##_i * quat##_k; \ + double twojk = 2.0 * quat##_j * quat##_k; \ + double twoiw = 2.0 * quat##_i * quat##_w; \ + double twojw = 2.0 * quat##_j * quat##_w; \ + double twokw = 2.0 * quat##_k * quat##_w; \ + \ + rot##_0 = w2 + i2 - j2 - k2; \ + rot##_1 = twoij - twokw; \ + rot##_2 = twojw + twoik; \ + \ + rot##_3 = twoij + twokw; \ + rot##_4 = w2 - i2 + j2 - k2; \ + rot##_5 = twojk - twoiw; \ + \ + rot##_6 = twoik - twojw; \ + rot##_7 = twojk + twoiw; \ + rot##_8 = w2 - i2 - j2 + k2; \ + \ wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \ wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \ wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \ - \ - wbody_0 *= moments_0; \ - wbody_1 *= moments_1; \ - wbody_2 *= moments_2; \ - \ + \ + wbody_0 *= moments_0; \ + wbody_1 *= moments_1; \ + wbody_2 *= moments_2; \ + \ w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \ w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \ w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \ } -#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ -{ \ - angmomin[0] += dtf * torque[0]; \ - double angmom_0 = angmomin[0]; \ - angmomin[1] += dtf * torque[1]; \ - double angmom_1 = angmomin[1]; \ - angmomin[2] += dtf * torque[2]; \ - double angmom_2 = angmomin[2]; \ - \ - double quat_w = quatin[0]; \ - double quat_i = 
quatin[1]; \ - double quat_j = quatin[2]; \ - double quat_k = quatin[3]; \ - \ - double omega_0, omega_1, omega_2; \ - ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ - \ - double wq_0, wq_1, wq_2, wq_3; \ - wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ - wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ - wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ - wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ - \ - double qfull_w, qfull_i, qfull_j, qfull_k; \ - qfull_w = quat_w + dtq * wq_0; \ - qfull_i = quat_i + dtq * wq_1; \ - qfull_j = quat_j + dtq * wq_2; \ - qfull_k = quat_k + dtq * wq_3; \ - ME_qnormalize(qfull); \ - \ - double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ - qhalf_w = quat_w + 0.5*dtq * wq_0; \ - qhalf_i = quat_i + 0.5*dtq * wq_1; \ - qhalf_j = quat_j + 0.5*dtq * wq_2; \ - qhalf_k = quat_k + 0.5*dtq * wq_3; \ - ME_qnormalize(qhalf); \ - \ - ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ - wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ - wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ - wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ - wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ - \ - qhalf_w += 0.5*dtq * wq_0; \ - qhalf_i += 0.5*dtq * wq_1; \ - qhalf_j += 0.5*dtq * wq_2; \ - qhalf_k += 0.5*dtq * wq_3; \ - ME_qnormalize(qhalf); \ - \ - quat_w = 2.0*qhalf_w - qfull_w; \ - quat_i = 2.0*qhalf_i - qfull_i; \ - quat_j = 2.0*qhalf_j - qfull_j; \ - quat_k = 2.0*qhalf_k - qfull_k; \ - ME_qnormalize(quat); \ - \ - quatin[0] = quat_w; \ - quatin[1] = quat_i; \ - quatin[2] = quat_j; \ - quatin[3] = quat_k; \ +#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ +{ \ + angmomin[0] += dtf * torque[0]; \ + double angmom_0 = angmomin[0]; \ + angmomin[1] += dtf * torque[1]; \ + double angmom_1 = angmomin[1]; \ + angmomin[2] += dtf * torque[2]; \ + double angmom_2 = angmomin[2]; \ + \ + double quat_w = quatin[0]; \ + double quat_i = quatin[1]; \ + 
double quat_j = quatin[2]; \ + double quat_k = quatin[3]; \ + \ + double omega_0, omega_1, omega_2; \ + ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ + \ + double wq_0, wq_1, wq_2, wq_3; \ + wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ + wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ + wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ + wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ + \ + double qfull_w, qfull_i, qfull_j, qfull_k; \ + qfull_w = quat_w + dtq * wq_0; \ + qfull_i = quat_i + dtq * wq_1; \ + qfull_j = quat_j + dtq * wq_2; \ + qfull_k = quat_k + dtq * wq_3; \ + ME_qnormalize(qfull); \ + \ + double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ + qhalf_w = quat_w + 0.5*dtq * wq_0; \ + qhalf_i = quat_i + 0.5*dtq * wq_1; \ + qhalf_j = quat_j + 0.5*dtq * wq_2; \ + qhalf_k = quat_k + 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ + wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ + wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ + wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ + wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ + \ + qhalf_w += 0.5*dtq * wq_0; \ + qhalf_i += 0.5*dtq * wq_1; \ + qhalf_j += 0.5*dtq * wq_2; \ + qhalf_k += 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + quat_w = 2.0*qhalf_w - qfull_w; \ + quat_i = 2.0*qhalf_i - qfull_i; \ + quat_j = 2.0*qhalf_j - qfull_j; \ + quat_k = 2.0*qhalf_k - qfull_k; \ + ME_qnormalize(quat); \ + \ + quatin[0] = quat_w; \ + quatin[1] = quat_i; \ + quatin[2] = quat_j; \ + quatin[3] = quat_k; \ } #endif diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index c3335b2c26..c5574a78c7 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -51,7 +51,7 @@ NBinIntel::~NBinIntel() { const int * bins = this->bins; const int * _atombin = this->_atombin; const int * _binpacked = this->_binpacked; - #pragma offload_transfer 
target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif @@ -70,7 +70,7 @@ void NBinIntel::bin_atoms_setup(int nall) #ifdef _LMP_INTEL_OFFLOAD if (_offload_alloc) { const int * binhead = this->binhead; - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(binhead:alloc_if(0) free_if(1)) } #endif @@ -98,7 +98,7 @@ void NBinIntel::bin_atoms_setup(int nall) const int * bins = this->bins; const int * _atombin = this->_atombin; const int * _binpacked = this->_binpacked; - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif @@ -174,9 +174,11 @@ void NBinIntel::bin_atoms(IntelBuffers * buffers) { biga.w = 1; buffers->get_x()[nall] = biga; - const int nthreads = comm->nthreads; + int nthreads; + if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads; + else nthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(buffers) + #pragma omp parallel if(nthreads > INTEL_HTHREADS) #endif { int ifrom, ito, tid; diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp index 7e0d2abdcb..06c10c080f 100644 --- a/src/USER-INTEL/npair_full_bin_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_intel.cpp @@ -78,475 +78,54 @@ fbi(NeighList *list, IntelBuffers *buffers) { neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD - if (need_ic) { - if (offload_noghost) { - fbi(1, list, buffers, 0, off_end); - fbi(0, list, buffers, host_start, nlocal, off_end); + if (_fix->three_body_neighbor()) { + if (need_ic) { + if (offload_noghost) { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); + } } else { - fbi(1, list, buffers, 0, off_end); - fbi(0, list, 
buffers, host_start, nlocal); + if (offload_noghost) { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); + } } } else { - if (offload_noghost) { - fbi(1, list, buffers, 0, off_end); - fbi(0, list, buffers, host_start, nlocal, off_end); + if (need_ic) { + if (offload_noghost) { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); + } } else { - fbi(1, list, buffers, 0, off_end); - fbi(0, list, buffers, host_start, nlocal); + if (offload_noghost) { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); + } } } #else - if (need_ic) - fbi(0, list, buffers, host_start, nlocal); - else - fbi(0, list, buffers, host_start, nlocal); - #endif -} - -template -void NPairFullBinIntel:: -fbi(const int offload, NeighList *list, IntelBuffers *buffers, - const int astart, const int aend, const int offload_end) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - int nall_t = nall; - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - #endif - - const int pack_width = _fix->nbor_pack_width(); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = 
&buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - 
in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \ - in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - - IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, 
pack_width); - ifrom += astart; - ito += astart; - int e_ito = ito; - if (ito == num) { - int imod = ito % pack_width; - if (imod) e_ito += pack_width - imod; - } - const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - int which; - int pack_offset = maxnbors * pack_width; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = pack_offset + maxnbors * 2; - - int max_chunk = 0; - int lane = 0; - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - const int itype = x[i].w; - const tagint itag = tag[i]; - const int ioffset = ntypes * itype; - - const int ibin = atombin[i]; - int raw_count = pack_offset; - - // loop over all atoms in surrounding bins in stencil including self - // skip i = j - if (exclude) { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - int j = binpacked[jj]; - - if (i == j) j=e_nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - #ifndef _LMP_INTEL_OFFLOAD - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - neighptr[raw_count++] = j; - } - } - } else { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - int j = binpacked[jj]; - - if (i == j) j=e_nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - } - - if 
(raw_count > obound) *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = pack_offset; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) { - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - #ifdef _LMP_INTEL_OFFLOAD - lmin = MIN(lmin,vlmin); - gmin = MIN(gmin,vgmin); - lmax = MAX(lmax,vlmax); - gmax = MAX(gmax,vgmax); - #endif - - int n = lane, n2 = pack_offset; - for (int u = pack_offset; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - const int jtag = tag[pj]; - int flist = 0; - if (itag > jtag) { - if ((itag+jtag) % 2 == 0) flist = 1; - } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) flist = 1; - } else { - if (x[pj].z < ztmp) flist = 1; - else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1; - else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp) - flist = 1; - } - if (flist) { - neighptr[n2++] = j; - } else { - neighptr[n] = j; - n += pack_width; - } - } - } - int ns = (n - lane) / pack_width; - atombin[i] = ns; - for (int u = pack_offset; u < n2; u++) { - neighptr[n] = neighptr[u]; - n += pack_width; - } - - ilist[i] = i; - 
cnumneigh[i] = ct + lane; - ns += n2 - pack_offset; - numneigh[i] = ns; - - if (ns > max_chunk) max_chunk = ns; - lane++; - if (lane == pack_width) { - ct += max_chunk * pack_width; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - max_chunk = 0; - pack_offset = maxnbors * pack_width; - lane = 0; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - const int trip = jnum * pack_width; - for (int jj = 0; jj < trip; jj+=pack_width) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << 
SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) { - if (jlist[jj] >= nlocal) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - } - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; + if (_fix->three_body_neighbor()) { + if (need_ic) + bin_newton(0, list, buffers, host_start, nlocal); + else + bin_newton(0, list, buffers, host_start, nlocal); + } else { + if (need_ic) + bin_newton(0, list, buffers, host_start, nlocal); + else + bin_newton(0, list, buffers, host_start, nlocal); + } #endif } diff --git a/src/USER-INTEL/npair_full_bin_intel.h b/src/USER-INTEL/npair_full_bin_intel.h index f1be71abbc..0f8a27b3b4 100644 --- a/src/USER-INTEL/npair_full_bin_intel.h +++ b/src/USER-INTEL/npair_full_bin_intel.h @@ -36,9 +36,6 @@ class NPairFullBinIntel : public NPairIntel { private: template void fbi(NeighList *, IntelBuffers *); - template - void fbi(const int, NeighList *, 
IntelBuffers *, const int, - const int, const int offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp b/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp deleted file mode 100644 index 9a40e2a07c..0000000000 --- a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp +++ /dev/null @@ -1,451 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: W. 
Michael Brown (Intel) -------------------------------------------------------------------------- */ - -#include "npair_half_bin_newtoff_intel.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "atom.h" -#include "comm.h" -#include "group.h" - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -NPairHalfBinNewtoffIntel::NPairHalfBinNewtoffIntel(LAMMPS *lmp) : - NPairIntel(lmp) {} - -/* ---------------------------------------------------------------------- - binned neighbor list construction with partial Newton's 3rd law - each owned atom i checks own bin and other bins in stencil - pair stored once if i,j are both owned and i < j - pair stored by me if j is ghost (also stored by proc owning j) -------------------------------------------------------------------------- */ - -void NPairHalfBinNewtoffIntel::build(NeighList *list) -{ - if (nstencil > INTEL_MAX_STENCIL_CHECK) - error->all(FLERR, "Too many neighbor bins for USER-INTEL package."); - - #ifdef _LMP_INTEL_OFFLOAD - if (exclude) - error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); - #endif - - if (_fix->precision() == FixIntel::PREC_MODE_MIXED) - hbnni(list, _fix->get_mixed_buffers()); - else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) - hbnni(list, _fix->get_double_buffers()); - else - hbnni(list, _fix->get_single_buffers()); - - _fix->stop_watch(TIME_HOST_NEIGHBOR); -} - -template -void NPairHalfBinNewtoffIntel:: -hbnni(NeighList *list, IntelBuffers *buffers) { - const int nlocal = (includegroup) ? 
atom->nfirst : atom->nlocal; - list->inum = nlocal; - - const int off_end = _fix->offload_end_neighbor(); - int host_start = off_end;; - - #ifdef _LMP_INTEL_OFFLOAD - if (off_end) grow_stencil(); - if (_fix->full_host_list()) host_start = 0; - #endif - - buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end); - - int need_ic = 0; - if (atom->molecular) - dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); - - #ifdef _LMP_INTEL_OFFLOAD - if (need_ic) { - hbnni(1, list, buffers, 0, off_end); - hbnni(0, list, buffers, host_start, nlocal); - } else { - hbnni(1, list, buffers, 0, off_end); - hbnni(0, list, buffers, host_start, nlocal); - } - #else - if (need_ic) - hbnni(0, list, buffers, host_start, nlocal); - else - hbnni(0, list, buffers, host_start, nlocal); - #endif -} - -template -void NPairHalfBinNewtoffIntel:: -hbnni(const int offload, NeighList *list, IntelBuffers *buffers, - const int astart, const int aend) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = 
atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) 
alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall) \ - in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = nall, lmax = -1, gmin = nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - - int which; - - const int list_size = (ito + tid + 1) * maxnbors; - int ct = (ifrom + tid) * maxnbors; - int *neighptr = firstneigh + ct; - - for (int i = ifrom; i < ito; i++) { - int j, k, n, n2, itype, jtype, ibin; - double xtmp, ytmp, ztmp, delx, dely, delz, 
rsq; - - n = 0; - n2 = maxnbors; - - xtmp = x[i].x; - ytmp = x[i].y; - ztmp = x[i].z; - itype = x[i].w; - const int ioffset = ntypes*itype; - - // loop over all atoms in other bins in stencil including self - // only store pair if i < j - // stores own/own pairs only once - // stores own/ghost pairs on both procs - - ibin = atombin[i]; - - for (k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - if (j <= i) continue; - - jtype = x[j].w; - #ifndef _LMP_INTEL_OFFLOAD - if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - delx = xtmp - x[j].x; - dely = ytmp - x[j].y; - delz = ztmp - x[j].z; - rsq = delx * delx + dely * dely + delz * delz; - if (rsq <= cutneighsq[ioffset + jtype]) { - if (j < nlocal) { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[n++] = -j - 1; - else - neighptr[n++] = j; - } else - neighptr[n++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < lmin) lmin = j; - if (j > lmax) lmax = j; - #endif - } else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[n2++] = -j - 1; - else - neighptr[n2++] = j; - } else - neighptr[n2++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < gmin) gmin = j; - if (j > gmax) gmax = j; - #endif - } - } - } - } - ilist[i] = i; - - cnumneigh[i] = ct; - if (n > maxnbors) *overflow = 1; - for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; - - const int edge = (n % pad_width); - if (edge) { - const int pad_end = n + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; n < pad_end; n++) - neighptr[n] = nall; - } - numneigh[i] = n; - while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; - ct += n; - neighptr += n; - if (ct + n + maxnbors > list_size) { - 
*overflow = 1; - ct = (ifrom + tid) * maxnbors; - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end 
omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - #endif -} diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.h b/src/USER-INTEL/npair_half_bin_newtoff_intel.h deleted file mode 100644 index 49482f8b3e..0000000000 --- a/src/USER-INTEL/npair_half_bin_newtoff_intel.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -#ifdef NPAIR_CLASS - -NPairStyle(half/bin/newtoff/intel, - NPairHalfBinNewtoffIntel, - NP_HALF | NP_BIN | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL) - -#else - -#ifndef LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H -#define LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H - -#include "npair_intel.h" -#include "fix_intel.h" - -namespace LAMMPS_NS { - -class NPairHalfBinNewtoffIntel : public NPairIntel { - public: - NPairHalfBinNewtoffIntel(class LAMMPS *); - ~NPairHalfBinNewtoffIntel() {} - void build(class NeighList *); - - private: - template - void hbnni(NeighList *, IntelBuffers *); - template - void hbnni(const int, NeighList *, IntelBuffers *, const int, - const int); -}; - -} - -#endif -#endif - -/* ERROR/WARNING messages: - - -*/ diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp index 6313ab944f..c761557097 100644 --- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp @@ -80,531 +80,27 @@ hbni(NeighList *list, IntelBuffers *buffers) { #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { - hbni(1, list, buffers, 0, off_end); - hbni(0, list, buffers, host_start, nlocal, off_end); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, + off_end); } else { - hbni(1, list, buffers, 0, off_end); - hbni(0, list, buffers, host_start, nlocal); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); } } else { if (offload_noghost) { - hbni(1, list, buffers, 0, off_end); - hbni(0, list, buffers, host_start, nlocal, off_end); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, + off_end); } else { - hbni(1, list, buffers, 0, off_end); - hbni(0, list, buffers, host_start, nlocal); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); } 
} #else if (need_ic) - hbni(0, list, buffers, host_start, nlocal); + bin_newton(0, list, buffers, host_start, nlocal); else - hbni(0, list, buffers, host_start, nlocal); - #endif -} - -template -void NPairHalfBinNewtonIntel:: -hbni(const int offload, NeighList *list, IntelBuffers *buffers, - const int astart, const int aend, const int offload_end) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - int nall_t = nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int 
tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \ - in(offload_end,separate_buffers,astart, aend, nlocal, molecular, 
ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - - #ifdef OUTER_CHUNK - const int swidth = ip_simd::SIMD_type::width(); - IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth); - ifrom += astart; - ito += astart; - int e_ito = ito; - if (ito == num) { - int imod = ito % swidth; - if (imod) e_ito += swidth - imod; - } - const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - #else - const int swidth = 1; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - const int list_size = (ito + tid * 2 + 2) * maxnbors; - #endif - - int which; - - int pack_offset = maxnbors * swidth; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = pack_offset + maxnbors * 2; - - int max_chunk = 0; - int lane = 0; - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp 
= x[i].z; - const int itype = x[i].w; - const int ioffset = ntypes * itype; - - // loop over rest of atoms in i's bin, ghosts are at end of linked list - // if j is owned atom, store it, since j is beyond i in linked list - // if j is ghost, only store if j coords are "above/to the right" of i - - int raw_count = pack_offset; - for (int j = bins[i]; j >= 0; j = bins[j]) { - if (j >= nlocal) { - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) continue; - #endif - if (x[j].z < ztmp) continue; - if (x[j].z == ztmp) { - if (x[j].y < ytmp) continue; - if (x[j].y == ytmp && x[j].x < xtmp) continue; - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (offload_noghost && i < offload_end) continue; - #endif - - #ifndef _LMP_INTEL_OFFLOAD - if (exclude) { - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - } - #endif - - neighptr[raw_count++] = j; - } - - // loop over all atoms in other bins in stencil, store every pair - - const int ibin = atombin[i]; - if (exclude) { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - #ifndef _LMP_INTEL_OFFLOAD - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - neighptr[raw_count++] = j; - } - } - } else { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < 
nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - } - - if (raw_count > obound) *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = pack_offset; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) { - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - #ifdef _LMP_INTEL_OFFLOAD - lmin = MIN(lmin,vlmin); - gmin = MIN(gmin,vgmin); - lmax = MAX(lmax,vlmax); - gmax = MAX(gmax,vgmax); - #endif - - int n = lane, n2 = pack_offset; - for (int u = pack_offset; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - if (pj < nlocal) { - neighptr[n] = j; - n += swidth; - } else - neighptr[n2++] = j; - } - } - int ns = (n - lane) / swidth; - for (int u = pack_offset; u < n2; u++) { - neighptr[n] = neighptr[u]; - n += swidth; - } - - ilist[i] = i; - cnumneigh[i] = ct + lane; - ns += n2 - pack_offset; - #ifndef OUTER_CHUNK - int edge = (ns % pad_width); - if (edge) { - const int pad_end = ns + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - 
for ( ; ns < pad_end; ns++) - neighptr[ns] = e_nall; - } - #endif - numneigh[i] = ns; - - #ifdef OUTER_CHUNK - if (ns > max_chunk) max_chunk = ns; - lane++; - if (lane == swidth) { - ct += max_chunk * swidth; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - max_chunk = 0; - pack_offset = maxnbors * swidth; - lane = 0; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - #else - ct += ns; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - #endif - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - #ifndef OUTER_CHUNK - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma 
simd - #endif - for (int jj = 0; jj < jnum; jj++) { - #else - const int trip = jnum * swidth; - for (int jj = 0; jj < trip; jj+= swidth) { - #endif - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; + bin_newton(0, list, buffers, host_start, nlocal); #endif } diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.h b/src/USER-INTEL/npair_half_bin_newton_intel.h index 9b5d0780a1..54a8e24135 100644 --- 
a/src/USER-INTEL/npair_half_bin_newton_intel.h +++ b/src/USER-INTEL/npair_half_bin_newton_intel.h @@ -36,9 +36,6 @@ class NPairHalfBinNewtonIntel : public NPairIntel { private: template void hbni(NeighList *, IntelBuffers *); - template - void hbni(const int, NeighList *, IntelBuffers *, const int, - const int, const int offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp index 5f191e0797..d70f1ec589 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp @@ -80,434 +80,27 @@ hbnti(NeighList *list, IntelBuffers *buffers) { #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { - hbnti(1, list, buffers, 0, off_end); - hbnti(0, list, buffers, host_start, nlocal, off_end); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, + off_end); } else { - hbnti(1, list, buffers, 0, off_end); - hbnti(0, list, buffers, host_start, nlocal); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); } } else { if (offload_noghost) { - hbnti(1, list, buffers, 0, off_end); - hbnti(0, list, buffers, host_start, nlocal, off_end); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal, + off_end); } else { - hbnti(1, list, buffers, 0, off_end); - hbnti(0, list, buffers, host_start, nlocal); + bin_newton(1, list, buffers, 0, off_end); + bin_newton(0, list, buffers, host_start, nlocal); } } #else if (need_ic) - hbnti(0, list, buffers, host_start, nlocal); + bin_newton(0, list, buffers, host_start, nlocal); else - hbnti(0, list, buffers, host_start, nlocal); - #endif -} - -template -void NPairHalfBinNewtonTriIntel:: -hbnti(const int offload, NeighList *list, IntelBuffers *buffers, - const int astart, const int aend, const int offload_end) { - if (aend-astart == 0) return; - - const int nall = atom->nlocal + 
atom->nghost; - int pad = 1; - int nall_t = nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - 
const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \ - in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - 
overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - - int which; - - const int list_size = (ito + tid * 2 + 2) * maxnbors; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = maxnbors * 3; - - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - const int itype = x[i].w; - const int ioffset = ntypes * itype; - - // loop over all atoms in bins in stencil - // pairs for atoms j "below" i are excluded - // below = lower z or (equal z and lower y) or (equal zy and lower x) - // (equal zyx and j <= i) - // latter excludes self-self interaction but allows superposed atoms - - const int ibin = atombin[i]; - - int raw_count = maxnbors; - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - if (x[j].z < ztmp) continue; - if (x[j].z == ztmp) { - if (x[j].y < ytmp) continue; - if (x[j].y == ytmp) { - if 
(x[j].x < xtmp) continue; - if (x[j].x == xtmp && j <= i) continue; - } - } - - #ifndef _LMP_INTEL_OFFLOAD - if (exclude) { - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - if (raw_count > obound) - *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = maxnbors; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) { - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - - int n = 0, n2 = maxnbors; - for (int u = maxnbors; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - if (pj < nlocal) - neighptr[n++] = j; - else - neighptr[n2++] = j; - } - } - int ns = n; - for (int u = maxnbors; u < n2; u++) - neighptr[n++] = neighptr[u]; - - ilist[i] = i; - cnumneigh[i] = ct; - ns += n2 - maxnbors; - - int edge = (ns % pad_width); - if (edge) { - const int pad_end = ns + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; ns < pad_end; ns++) - neighptr[ns] = e_nall; - } - numneigh[i] = ns; - - ct += ns; - const int alignb 
= (INTEL_DATA_ALIGN / sizeof(int)); - edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + 
cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; + bin_newton(0, list, buffers, host_start, nlocal); #endif } diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h index d144c2fc52..7a7f4c8030 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h @@ -36,9 +36,6 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel { private: template void hbnti(NeighList *, IntelBuffers *); - template - void hbnti(const int, NeighList *, IntelBuffers *, const int, - const int, const int offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index c92ed88774..b20b1dcd08 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -40,7 +40,7 @@ NPairIntel::~NPairIntel() { #ifdef _LMP_INTEL_OFFLOAD if (_off_map_stencil) { const int * stencil = this->stencil; - #pragma offload_transfer 
target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(stencil:alloc_if(0) free_if(1)) } #endif @@ -48,6 +48,678 @@ NPairIntel::~NPairIntel() { /* ---------------------------------------------------------------------- */ +template +void NPairIntel::bin_newton(const int offload, NeighList *list, + IntelBuffers *buffers, + const int astart, const int aend, + const int offload_end) { + + if (aend-astart == 0) return; + + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + int nall_t = nall; + + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost && offload) nall_t = atom->nlocal; + if (THREE == 0 && offload) { + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else + #endif + if (THREE == 0 && INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + const int pad_width = pad; + const int pack_width = _fix->nbor_pack_width(); + + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL; + tagint *s = NULL; + int tag_size = 0, special_size; + if (buffers->need_tag()) tag_size = e_nall; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + special_size = aend; + } else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + special_size = 0; + } + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; + const int maxspecial = atom->maxspecial; + const tagint * _noalias const tag = atom->tag; + + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int nstencil = this->nstencil; + const int * _noalias const stencil = this->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; + const int ntypes = atom->ntypes + 1; + const int nlocal = atom->nlocal; 
+ + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + tagint * const molecule = atom->molecule; + #endif + + int tnum; + int *overflow; + double *timer_compute; + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + timer_compute = _fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + overflow = _fix->get_off_overflow_flag(); + _fix->stop_watch(TIME_HOST_NEIGHBOR); + _fix->start_watch(TIME_OFFLOAD_LATENCY); + } else + #endif + { + tnum = comm->nthreads; + overflow = _fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); + const int * _noalias const binpacked = buffers->get_binpacked(); + + const int xperiodic = domain->xperiodic; + const int yperiodic = domain->yperiodic; + const int zperiodic = domain->zperiodic; + const flt_t xprd_half = domain->xprd_half; + const flt_t yprd_half = domain->yprd_half; + const flt_t zprd_half = domain->zprd_half; + + flt_t * _noalias const ncachex = buffers->get_ncachex(); + flt_t * _noalias const ncachey = buffers->get_ncachey(); + flt_t * _noalias const ncachez = buffers->get_ncachez(); + int * _noalias const ncachej = buffers->get_ncachej(); + int * _noalias const ncachejtype = buffers->get_ncachejtype(); + const int ncache_stride = buffers->ncache_stride(); + + #ifdef _LMP_INTEL_OFFLOAD + const int * _noalias const binhead = this->binhead; + const int * _noalias const bins = this->bins; + const int cop = _fix->coprocessor_number(); + const int separate_buffers = _fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ + 
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + out(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype:length(0) alloc_if(0) free_if(0)) \ + in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ + in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \ + in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ + in(pack_width) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(tag) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = astart; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + int nstencilp = 0; + int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; + for (int k = 0; k < nstencil; k++) { + binstart[nstencilp] = stencil[k]; + int end = stencil[k] + 1; + for (int kk = k + 1; kk < nstencil; kk++) { + if (stencil[kk-1]+1 == stencil[kk]) { + end++; + k++; + } else break; + } + binend[nstencilp] = end; + nstencilp++; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(numneigh, overflow, nstencilp, binstart, binend) + #endif + { + #ifdef _LMP_INTEL_OFFLOAD + int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; + #endif + + const int num = aend - astart; + int tid, ifrom, ito; + + if (THREE) { + IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); + } else { + IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); + } + ifrom += astart; + ito += astart; + int e_ito = ito; + 
if (THREE && ito == num) { + int imod = ito % pack_width; + if (imod) e_ito += pack_width - imod; + } + const int list_size = (e_ito + tid * 2 + 2) * maxnbors; + + int which; + + int pack_offset = maxnbors; + if (THREE) pack_offset *= pack_width; + int ct = (ifrom + tid * 2) * maxnbors; + int *neighptr = firstneigh + ct; + const int obound = pack_offset + maxnbors * 2; + + const int toffs = tid * ncache_stride; + flt_t * _noalias const tx = ncachex + toffs; + flt_t * _noalias const ty = ncachey + toffs; + flt_t * _noalias const tz = ncachez + toffs; + int * _noalias const tj = ncachej + toffs; + int * _noalias const tjtype = ncachejtype + toffs; + + flt_t * _noalias itx; + flt_t * _noalias ity; + flt_t * _noalias itz; + int * _noalias itj; + int * _noalias itjtype; + + // loop over all atoms in other bins in stencil, store every pair + int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; + if (THREE) { + lane = 0; + max_chunk = 0; + } + for (int i = ifrom; i < ito; i++) { + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + const int itype = x[i].w; + tagint itag; + if (THREE) itag = tag[i]; + const int ioffset = ntypes * itype; + + const int ibin = atombin[i]; + if (ibin != oldbin) { + oldbin = ibin; + ncount = 0; + for (int k = 0; k < nstencilp; k++) { + const int bstart = binhead[ibin + binstart[k]]; + const int bend = binhead[ibin + binend[k]]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int u = 0; u < ncount; u++) { + const int j = tj[u]; + tx[u] = x[j].x; + ty[u] = x[j].y; + tz[u] = x[j].z; + tjtype[u] = x[j].w; + } + + if (FULL == 0 || TRI == 1) { + icount = 0; + istart = ncount; + const int alignb = INTEL_DATA_ALIGN / sizeof(int); + int nedge = istart % alignb; + if (nedge) istart + (alignb - nedge); + itx = tx 
+ istart; + ity = ty + istart; + itz = tz + istart; + itj = tj + istart; + itjtype = tjtype + istart; + + const int bstart = binhead[ibin]; + const int bend = binhead[ibin + 1]; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) { + const int j = binpacked[jj]; + itj[icount] = j; + itx[icount] = x[j].x; + ity[icount] = x[j].y; + itz[icount] = x[j].z; + itjtype[icount] = x[j].w; + icount++; + } + if (icount + istart > obound) *overflow = 1; + } else + if (ncount > obound) *overflow = 1; + } + + // ---------------------- Loop over i bin + + int n = 0; + if (FULL == 0 || TRI == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = 0; u < icount; u++) { + int addme = 1; + int j = itj[u]; + + // Cutoff Check + const flt_t delx = xtmp - itx[u]; + const flt_t dely = ytmp - ity[u]; + const flt_t delz = ztmp - itz[u]; + const int jtype = itjtype[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutneighsq[ioffset + jtype]) addme = 0; + + // i bin (half) check and offload ghost check + if (j < nlocal) { + const int ijmod = (i + j) % 2; + if (i > j) { + if (ijmod == 0) addme = 0; + } else if (i < j) { + if (ijmod == 1) addme = 0; + } else + addme = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost && i < offload_end) addme = 0; + #endif + } else { + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost && offload) addme = 0; + #endif + if (itz[u] < ztmp) addme = 0; + if (itz[u] == ztmp) { + if (ity[u] < ytmp) addme = 0; + if (ity[u] == ytmp && itx[u] < xtmp) addme = 0; + } + } + + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (addme) + neighptr[n++] = j; + } + } // if FULL==0 + + // ---------------------- Loop over other bins + + int n2, *neighptr2; + if (THREE) { + n = pack_offset; + n2 = pack_offset + maxnbors; + neighptr2 = neighptr; + } + #if 
defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = 0; u < ncount; u++) { + int addme = 1; + int j = tj[u]; + + if (FULL) + if (i == j) addme = 0; + + // Cutoff Check + const flt_t delx = xtmp - tx[u]; + const flt_t dely = ytmp - ty[u]; + const flt_t delz = ztmp - tz[u]; + const int jtype = tjtype[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutneighsq[ioffset + jtype]) addme = 0; + + // Triclinic + if (TRI) { + if (tz[u] < ztmp) addme = 0; + if (tz[u] == ztmp) { + if (ty[u] < ytmp) addme = 0; + if (ty[u] == ytmp) { + if (tx[u] < xtmp) addme = 0; + if (tx[u] == xtmp && j <= i) addme = 0; + } + } + } + + // offload ghost check + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost) { + if (j < nlocal) { + if (i < offload_end) addme = 0; + } else if (offload) addme = 0; + } + #endif + + int pj; + if (THREE) pj = j; + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (THREE) { + const int jtag = tag[pj]; + int flist = 0; + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) flist = 1; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) flist = 1; + } else { + if (tz[u] < ztmp) flist = 1; + else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; + else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) + flist = 1; + } + if (addme) { + if (flist) + neighptr2[n2++] = j; + else + neighptr[n++] = j; + } + } else { + if (addme) + neighptr[n++] = j; + } + } // for u + + #ifndef _LMP_INTEL_OFFLOAD + if (exclude) { + int alln = n; + if (THREE) n = pack_offset; + else n = 0; + for (int u = pack_offset; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n++] = j; + } + if (THREE) { + alln = n2; + n2 = pack_offset + maxnbors; + for (int u = pack_offset + maxnbors; u < alln; u++) { + const int 
j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n2++] = j; + } + } + } + #endif + int ns; + if (THREE) { + int alln = n; + ns = n - pack_offset; + atombin[i] = ns; + n = lane; + for (int u = pack_offset; u < alln; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + ns += n2 - pack_offset - maxnbors; + for (int u = pack_offset + maxnbors; u < n2; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + if (ns > maxnbors) *overflow = 1; + } else + if (n > maxnbors) *overflow = 1; + + ilist[i] = i; + cnumneigh[i] = ct; + if (THREE) { + cnumneigh[i] += lane; + numneigh[i] = ns; + } else { + int edge = (n % pad_width); + if (edge) { + const int pad_end = n + (pad_width - edge); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; n++) + neighptr[n] = e_nall; + } + numneigh[i] = n; + } + + if (THREE) { + if (ns > max_chunk) max_chunk = ns; + lane++; + if (lane == pack_width) { + ct += max_chunk * pack_width; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + max_chunk = 0; + pack_offset = maxnbors * pack_width; + lane = 0; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + } else { + ct += n; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; + int 
ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + #if __INTEL_COMPILER+0 > 1499 + #pragma vector aligned + #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) + #endif + for (int jj = 0; jj < jnum; jj++) { + int j = jlist[jj]; + if (need_ic && j < 0) j = -j - 1; + if (j < nlocal) { + if (j < vlmin) vlmin = j; + if (j > vlmax) vlmax = j; + } else { + if (j < vgmin) vgmin = j; + if (j > vgmax) vgmax = j; + } + } + } + lmin = MIN(lmin,vlmin); + gmin = MIN(gmin,vgmin); + lmax = MAX(lmax,vlmax); + gmax = MAX(gmax,vgmax); + + #if defined(_OPENMP) + #pragma omp critical + #endif + { + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } // if separate_buffers + #endif + + if (molecular) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + if (THREE) { + const int trip = jnum * pack_width; + for (int jj = 0; jj < trip; jj+=pack_width) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else 
jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } + } // for i + } // if molecular + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + #pragma vector aligned + #pragma simd + for (jj = 0; jj < jnum; jj++) { + if (jlist[jj] >= nlocal) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + } + } + } + } + #endif + } // end omp + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _fix->stop_watch(TIME_OFFLOAD_LATENCY); + _fix->start_watch(TIME_HOST_NEIGHBOR); + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + } else { + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + if (separate_buffers) { + _fix->start_watch(TIME_PACK); + _fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(_fix->host_min_local(), + _fix->host_used_local(), + _fix->host_min_ghost(), + _fix->host_used_ghost()); + _fix->stop_watch(TIME_PACK); + } + } + #else + #pragma vector aligned + #pragma simd + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + #endif +} + +/* 
---------------------------------------------------------------------- */ + #ifdef _LMP_INTEL_OFFLOAD void NPairIntel::grow_stencil() { @@ -60,8 +732,206 @@ void NPairIntel::grow_stencil() _off_map_stencil = stencil; const int * stencil = _off_map_stencil; const int maxstencil = ns->get_maxstencil(); - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ in(stencil:length(maxstencil) alloc_if(1) free_if(0)) } } #endif + +/* ---------------------------------------------------------------------- */ + +// ---- Half, no IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Half, IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Tri, no IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Tri, IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, 
const int, const int, + const int); + +// ---- Full, no IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Full, IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- 3-body, no IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- 3-body, IC + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +#ifdef _LMP_INTEL_OFFLOAD + +// ---- Half, no IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Half, IC, no ghost + +template void 
NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Tri, no IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Tri, IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Full, no IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- Full, IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- 3-body, no IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const 
int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +// ---- 3-body, IC, no ghost + +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); +template void NPairIntel::bin_newton + (const int, NeighList *, IntelBuffers *, const int, const int, + const int); + +#endif diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h index 06d5d79cac..55a529b2cb 100644 --- a/src/USER-INTEL/npair_intel.h +++ b/src/USER-INTEL/npair_intel.h @@ -25,10 +25,6 @@ #include "intel_simd.h" #endif -#ifdef OUTER_CHUNK -#include "intel_simd.h" -#endif - #ifdef _LMP_INTEL_OFFLOAD #pragma offload_attribute(push,target(mic)) #endif @@ -87,6 +83,10 @@ class NPairIntel : public NPair { protected: FixIntel *_fix; + template + void bin_newton(const int, NeighList *, IntelBuffers *, + const int, const int, const int offload_end = 0); + #ifdef _LMP_INTEL_OFFLOAD int _cop; int *_off_map_stencil; diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index 4f34a484cb..07beae1e41 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -55,7 +55,7 @@ PairBuckCoulCutIntel::~PairBuckCoulCutIntel() void PairBuckCoulCutIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute(eflag, vflag, fix->get_mixed_buffers(), + compute(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute(eflag, vflag, fix->get_double_buffers(), @@ -70,8 +70,8 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag) 
template void PairBuckCoulCutIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -85,57 +85,51 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, 
offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template +template void PairBuckCoulCutIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -165,9 +159,9 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -204,31 +198,30 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - 
reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -240,21 +233,20 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; - + const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; @@ -270,19 +262,19 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t r = sqrt(rsq); const flt_t r2inv = (flt_t)1.0 / rsq; - - #ifdef INTEL_VMASK + + #ifdef INTEL_VMASK if (rsq < c_cuti[jtype].cut_coulsq) { #endif forcecoul = 
qqrd2e * qtmp*q[j]/r; - if (EFLAG) + if (EFLAG) ecoul = forcecoul; if (sbindex){ const flt_t factor_coul = special_coul[sbindex]; forcecoul *= factor_coul; if(EFLAG) ecoul *= factor_coul; - + } #ifdef INTEL_VMASK } @@ -290,7 +282,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, if (rsq >= c_cuti[jtype].cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } #endif - + #ifdef INTEL_VMASK if (rsq < c_cuti[jtype].cut_ljsq) { #endif @@ -298,14 +290,14 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, flt_t rexp = exp(-r * c_forcei[jtype].rhoinv); forcebuck = r * rexp * c_forcei[jtype].buck1 - r6inv * c_forcei[jtype].buck2; - if (EFLAG) + if (EFLAG) evdwl = rexp * c_energyi[jtype].a - r6inv * c_energyi[jtype].c - c_energyi[jtype].offset; if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcebuck *= factor_lj; - if (EFLAG) + if (EFLAG) evdwl *= factor_lj; } #ifdef INTEL_VMASK @@ -319,71 +311,72 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, if (rsq < c_cuti[jtype].cutsq) { #endif const flt_t fpair = (forcecoul + forcebuck) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + + 
if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - 
*timer_compute; @@ -395,7 +388,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -406,6 +399,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, void PairBuckCoulCutIntel::init_style() { PairBuckCoulCut::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -413,7 +410,7 @@ void PairBuckCoulCutIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast(modify->fix[ifix]); - + fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -495,9 +492,9 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst &fc, template void PairBuckCoulCutIntel::ForceConst::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -508,12 +505,12 @@ void PairBuckCoulCutIntel::ForceConst::set_ntypes(const int ntypes, c_cut_t * oc_cut = c_cut[0]; if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && - oc_energy != NULL && ospecial_coul != NULL && + oc_energy != NULL && ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(oc_cut: alloc_if(0) free_if(1)) + nocopy(oc_cut: alloc_if(0) free_if(1)) } #endif @@ -537,7 +534,7 @@ void PairBuckCoulCutIntel::ForceConst::set_ntypes(const int ntypes, c_cut_t * oc_cut = c_cut[0]; int 
tp1sq = ntypes*ntypes; if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && - oc_energy != NULL && ospecial_coul != NULL && + oc_energy != NULL && ospecial_coul != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.h b/src/USER-INTEL/pair_buck_coul_cut_intel.h index 6590cd9c16..7204323903 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.h +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.h @@ -49,10 +49,10 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut { void compute(int eflag, int vflag, IntelBuffers * buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, @@ -75,7 +75,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index 9319f531e1..995e2e8583 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -55,7 +55,7 @@ PairBuckCoulLongIntel::~PairBuckCoulLongIntel() void PairBuckCoulLongIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute(eflag, vflag, fix->get_mixed_buffers(), + compute(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute(eflag, vflag, fix->get_double_buffers(), @@ -70,8 +70,8 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag) template void PairBuckCoulLongIntel::compute(int eflag, 
int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -85,57 +85,51 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, 
host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template +template void PairBuckCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -170,11 +164,19 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -208,8 +210,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) 
free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(f_stride,nlocal,minlocal,separate_flag,offload) \ + in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -220,129 +224,149 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + 
toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const flt_t * _noalias const rho_invi = rho_inv + ptr_off; + const flt_t * _noalias const rho_invi = rho_inv + ptr_off; const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - flt_t forcecoul, forcebuck, evdwl, ecoul; - forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; - - const int sbindex = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq < c_forcei[jtype].cutsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + 
#pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcebuck, evdwl, ecoul; + forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r = (flt_t)1.0 / sqrt(r2inv); - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { #endif - #ifdef INTEL_ALLOW_TABLE - if (!ncoultablebits || rsq <= tabinnersq) { - #endif - const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) 
ecoul -= adjust; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; - #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; } - #endif - #ifdef INTEL_VMASK } - #endif + #endif - #ifdef INTEL_VMASK + #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cut_ljsq) { #endif flt_t r6inv = r2inv * r2inv * r2inv; @@ -361,80 +385,74 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, #ifdef INTEL_VMASK } #else - if (rsq > c_forcei[jtype].cutsq) - { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } if (rsq > c_forcei[jtype].cut_ljsq) { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; } #endif - #ifdef 
INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { - #endif - const flt_t fpair = (forcecoul + forcebuck) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } + const flt_t fpair = (forcecoul + forcebuck) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - #ifdef INTEL_VMASK } - #endif + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, 
minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -446,7 +464,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -457,6 +475,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, void PairBuckCoulLongIntel::init_style() { PairBuckCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -464,7 +486,7 @@ void PairBuckCoulLongIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast(modify->fix[ifix]); - + fix->pair_init_check(); #ifdef 
_LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -484,6 +506,13 @@ template void PairBuckCoulLongIntel::pack_force_const(ForceConst &fc, IntelBuffers *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; int ntable = 1; if (ncoultablebits) @@ -518,6 +547,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst &fc, for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { + if (cutsq[i][j] < cut_ljsq[i][j]) + error->all(FLERR, + "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].buck1 = buck1[i][j]; @@ -571,9 +603,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst &fc, template void PairBuckCoulLongIntel::ForceConst::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -593,10 +625,10 @@ void PairBuckCoulLongIntel::ForceConst::set_ntypes(const int ntypes, ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(orho_inv: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ + nocopy(orho_inv: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) } #endif diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.h b/src/USER-INTEL/pair_buck_coul_long_intel.h index 57e4517404..ec37c699c8 100644 --- 
a/src/USER-INTEL/pair_buck_coul_long_intel.h +++ b/src/USER-INTEL/pair_buck_coul_long_intel.h @@ -40,7 +40,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { private: FixIntel *fix; - int _cop, _lrt; + int _cop, _lrt, _ccache_stride; template class ForceConst; @@ -48,10 +48,10 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { void compute(int eflag, int vflag, IntelBuffers * buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, @@ -76,7 +76,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index 4815d1e025..8c63d2e62d 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -48,7 +48,7 @@ PairBuckIntel::~PairBuckIntel() void PairBuckIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute(eflag, vflag, fix->get_mixed_buffers(), + compute(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute(eflag, vflag, fix->get_double_buffers(), @@ -63,8 +63,8 @@ void PairBuckIntel::compute(int eflag, int vflag) template void PairBuckIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -78,57 +78,51 @@ void PairBuckIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if 
(nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* 
---------------------------------------------------------------------- */ -template +template void PairBuckIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -152,9 +146,9 @@ void PairBuckIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -166,7 +160,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, int *overflow = fix->get_off_overflow_flag(); double *timer_compute = fix->off_watch_pair(); // Redeclare as local variables for offload - + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); #pragma offload target(mic:_cop) if(offload) \ in(special_lj:length(0) alloc_if(0) free_if(0)) \ @@ -188,31 +182,30 @@ void PairBuckIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel 
reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -222,24 +215,23 @@ void PairBuckIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { - + flt_t forcebuck, evdwl; forcebuck = evdwl = (flt_t)0.0; @@ -253,7 +245,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t r = sqrt(rsq); const flt_t r2inv = (flt_t)1.0 / rsq; - + #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cutsq) { #endif @@ -265,7 +257,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, #ifndef 
INTEL_VMASK if (rsq > c_forcei[jtype].cutsq) forcebuck =(flt_t)0.0; - #endif + #endif if (EFLAG) { evdwl = rexp * c_energyi[jtype].a - r6inv * c_energyi[jtype].c - @@ -280,73 +272,74 @@ void PairBuckIntel::eval(const int offload, const int vflag, if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcebuck *= factor_lj; - if (EFLAG) + if (EFLAG) evdwl *= factor_lj; } const flt_t fpair = forcebuck * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl; - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - 
IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -358,7 +351,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -367,6 +360,10 @@ void PairBuckIntel::eval(const int offload, const int vflag, void PairBuckIntel::init_style() { PairBuck::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -374,7 +371,7 @@ void PairBuckIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast(modify->fix[ifix]); - + 
fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -445,7 +442,7 @@ void PairBuckIntel::pack_force_const(ForceConst &fc, /* ---------------------------------------------------------------------- */ template -void PairBuckIntel::ForceConst::set_ntypes(const int ntypes, +void PairBuckIntel::ForceConst::set_ntypes(const int ntypes, Memory *memory, const int cop) { if ( (ntypes != _ntypes ) ) { @@ -455,8 +452,8 @@ void PairBuckIntel::ForceConst::set_ntypes(const int ntypes, c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; - if (ospecial_lj != NULL && oc_force != NULL && - oc_energy != NULL && + if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: alloc_if(0) free_if(1)) \ @@ -479,8 +476,8 @@ void PairBuckIntel::ForceConst::set_ntypes(const int ntypes, c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; int tp1sq = ntypes*ntypes; - if (ospecial_lj != NULL && oc_force != NULL && - oc_energy != NULL && + if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_buck_intel.h b/src/USER-INTEL/pair_buck_intel.h index 4f039c3f97..ab5e135262 100644 --- a/src/USER-INTEL/pair_buck_intel.h +++ b/src/USER-INTEL/pair_buck_intel.h @@ -48,10 +48,10 @@ private: void compute(int eflag, int vflag, IntelBuffers * buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, @@ -59,7 +59,7 @@ private: template class ForceConst { - + public: typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t; typedef 
struct { flt_t a, c, offset, pad; } c_energy_t; @@ -78,7 +78,7 @@ private: int _ntypes, _cop; Memory *_memory; }; - + ForceConst force_const_single; ForceConst force_const_double; }; diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp index f8c972ab8b..b97128bf9f 100644 --- a/src/USER-INTEL/pair_eam_intel.cpp +++ b/src/USER-INTEL/pair_eam_intel.cpp @@ -74,8 +74,8 @@ void PairEAMIntel::compute(int eflag, int vflag) template void PairEAMIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag, vflag); @@ -90,78 +90,58 @@ void PairEAMIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if (_onetype) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,1,0,1>(1, ovflag, buffers, fc, 0, 
offload_end); - eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, 
host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } @@ -169,11 +149,10 @@ void PairEAMIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template +template void PairEAMIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, + IntelBuffers *buffers, + const ForceConst &fc, const int astart, const int aend) { const int inum = aend - astart; @@ -186,7 +165,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, nmax = atom->nmax; int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN; if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t); - memory->create(rho,nmax*comm->nthreads,"pair:rho"); + if (NEWTON_PAIR) + memory->create(rho,nmax*comm->nthreads,"pair:rho"); + else + memory->create(rho,nmax,"pair:rho"); memory->create(fp,nmax,"pair:fp"); // Use single precision allocation for single/mixed mode // Keep double version for single and swap_eam @@ -222,9 +204,17 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, 
f_stride); @@ -252,123 +242,146 @@ void PairEAMIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(fp_f, f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iito, tid; - IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, - INTEL_VECTOR_WIDTH); + IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, + INTEL_VECTOR_WIDTH); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - double * _noalias const trho = rho + tid*nmax; - if (NEWTON_PAIR) - memset(trho, 0, nall * sizeof(double)); - else - memset(trho, 0, nlocal * sizeof(double)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) foff = tid * nmax; + else foff = 0; + double * _noalias const trho = rho + foff; + if (NEWTON_PAIR) { + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + memset(trho, 0, nall * sizeof(double)); + } + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; flt_t oscale; int rhor_joff, frho_ioff; if (ONETYPE) { const int ptr_off=_onetype * ntypes + _onetype; - oscale = scale_f[ptr_off]; - int rhor_ioff = istride * _onetype; - rhor_joff = rhor_ioff + _onetype * jstride; - frho_ioff 
= fstride * _onetype; + oscale = scale_f[ptr_off]; + int rhor_ioff = istride * _onetype; + rhor_joff = rhor_ioff + _onetype * jstride; + frho_ioff = fstride * _onetype; } for (int i = iifrom; i < iito; ++i) { int itype, rhor_ioff; - if (!ONETYPE) { + if (!ONETYPE) { itype = x[i].w; - rhor_ioff = istride * itype; - } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + rhor_ioff = istride * itype; + } + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - - acc_t rhoi = (acc_t)0.0; - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:rhoi) - #endif - for (int jj = 0; jj < jnum; jj++) { - int j, jtype; - j = jlist[jj] & NEIGHMASK; + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + acc_t rhoi = (acc_t)0.0; + int ej = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const flt_t rsq = delx*delx + dely*dely + delz*delz; + const flt_t rsq = delx*delx + dely*dely + delz*delz; - if (rsq < fcutforcesq) { - if (!ONETYPE) jtype = x[j].w; - flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; - int m = static_cast (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - flt_t ra; - ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + - rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; - rhoi += ra; - if (NEWTON_PAIR || j < nlocal) { - if (!ONETYPE) { - const int ioff = jtype * istride + itype * jstride + m; - ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + - rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; - } - 
trho[j] += ra; + if (rsq < fcutforcesq) { + trsq[ej]=rsq; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:rhoi) + #endif + for (int jj = 0; jj < ej; jj++) { + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; + int m = static_cast (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + flt_t ra; + ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + + rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; + rhoi += ra; + if (NEWTON_PAIR) { + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + + rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; } + trho[j] += ra; } } // for jj - trho[i] += rhoi; + if (NEWTON_PAIR) + trho[i] += rhoi; + else + trho[i] = rhoi; } // for i #if defined(_OPENMP) - if (nthreads > 1) { + if (NEWTON_PAIR && nthreads > 1) { #pragma omp barrier - if (tid == 0) { - int rcount; - if (NEWTON_PAIR) rcount = nall; - else rcount = nlocal; - if (nthreads == 2) { + if (tid == 0) { + const int rcount = nall; + if (nthreads == 2) { double *trho2 = rho + nmax; - #pragma vector aligned + #pragma vector aligned #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trho2[n]; + for (int n = 0; n < rcount; n++) + rho[n] += trho2[n]; } else if (nthreads == 4) { double *trho2 = rho + nmax; - double *trho3 = trho2 + nmax; - double *trho4 = trho3 + nmax; - #pragma vector aligned - #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trho2[n] + trho3[n] + trho4[n]; + double *trho3 = trho2 + nmax; + double *trho4 = trho3 + nmax; + #pragma vector aligned + #pragma simd + for (int n = 0; n < rcount; n++) + rho[n] += trho2[n] + trho3[n] + trho4[n]; } else { - 
double *trhon = rho + nmax; - for (int t = 1; t < nthreads; t++) { - #pragma vector aligned - #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trhon[n]; - trhon += nmax; + double *trhon = rho + nmax; + for (int t = 1; t < nthreads; t++) { + #pragma vector aligned + #pragma simd + for (int n = 0; n < rcount; n++) + rho[n] += trhon[n]; + trhon += nmax; } } } @@ -398,32 +411,32 @@ void PairEAMIntel::eval(const int offload, const int vflag, #pragma simd reduction(+:tevdwl) #endif for (int i = iifrom; i < iito; ++i) { - int itype; - if (!ONETYPE) itype = x[i].w; - flt_t p = rho[i]*frdrho + (flt_t)1.0; - int m = static_cast (p); - m = MAX(1,MIN(m,nrho-1)); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) frho_ioff = itype * fstride; - const int ioff = frho_ioff + m; - fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + - frho_spline_f[ioff].c; - if (EFLAG) { - flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + - frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d; - if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax); - if (!ONETYPE) { - const int ptr_off=itype*ntypes + itype; - oscale = scale_f[ptr_off]; - } - phi *= oscale; - tevdwl += phi; - if (eatom) f[i].w += phi; - } + int itype; + if (!ONETYPE) itype = x[i].w; + flt_t p = rho[i]*frdrho + (flt_t)1.0; + int m = static_cast (p); + m = MAX(1,MIN(m,nrho-1)); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) frho_ioff = itype * fstride; + const int ioff = frho_ioff + m; + fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + + frho_spline_f[ioff].c; + if (EFLAG) { + flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + + frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d; + if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax); + if (!ONETYPE) { + const int ptr_off=itype*ntypes + itype; + oscale = scale_f[ptr_off]; + } + phi *= oscale; + tevdwl += phi; + if (eatom) f[i].w += phi; + } } if (EFLAG) oevdwl += tevdwl; - + // communicate derivative of 
embedding function @@ -431,11 +444,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, #pragma omp barrier #endif - if (tid == 0) { + if (tid == 0) comm->forward_comm_pair(this); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - } else - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + if (NEWTON_PAIR) + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); #if defined(_OPENMP) #pragma omp barrier @@ -446,140 +458,158 @@ void PairEAMIntel::eval(const int offload, const int vflag, for (int i = iifrom; i < iito; ++i) { int itype, rhor_ioff; - const flt_t * _noalias scale_fi; - if (!ONETYPE) { - itype = x[i].w; - rhor_ioff = istride * itype; - scale_fi = scale_f + itype*ntypes; - } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - acc_t fxtmp, fytmp, fztmp, fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + const flt_t * _noalias scale_fi; + if (!ONETYPE) { + itype = x[i].w; + rhor_ioff = istride * itype; + scale_fi = scale_f + itype*ntypes; } + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) - #endif - for (int jj = 0; jj < jnum; jj++) { - int j, jtype; - j = jlist[jj] & NEIGHMASK; + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + int ej = 0; + #if 
defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const flt_t rsq = delx*delx + dely*dely + delz*delz; + const flt_t rsq = delx*delx + dely*dely + delz*delz; + if (rsq < fcutforcesq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } - if (rsq < fcutforcesq) { - if (!ONETYPE) jtype = x[j].w; - const flt_t r = sqrt(rsq); - flt_t p = r*frdr + (flt_t)1.0; - int m = static_cast (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - const flt_t rhojp = (rhor_spline_f[joff].a*p + - rhor_spline_f[joff].b)*p + - rhor_spline_f[joff].c; - flt_t rhoip; - if (!ONETYPE) { - const int ioff = jtype * istride + itype * jstride + m; - rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + - rhor_spline_f[ioff].c; - } else - rhoip = rhojp; - const flt_t z2p = (z2r_spline_t[joff].a*p + - z2r_spline_t[joff].b)*p + - z2r_spline_t[joff].c; - const flt_t z2 = ((z2r_spline_t[joff].d*p + - z2r_spline_t[joff].e)*p + - z2r_spline_t[joff].f)*p + - z2r_spline_t[joff].g; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + const flt_t r = sqrt(rsq); + flt_t p = r*frdr + (flt_t)1.0; + int m = static_cast (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + const flt_t rhojp = (rhor_spline_f[joff].a*p + + rhor_spline_f[joff].b)*p + + 
rhor_spline_f[joff].c; + flt_t rhoip; + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + + rhor_spline_f[ioff].c; + } else + rhoip = rhojp; + const flt_t z2p = (z2r_spline_t[joff].a*p + + z2r_spline_t[joff].b)*p + + z2r_spline_t[joff].c; + const flt_t z2 = ((z2r_spline_t[joff].d*p + + z2r_spline_t[joff].e)*p + + z2r_spline_t[joff].f)*p + + z2r_spline_t[joff].g; - const flt_t recip = (flt_t)1.0/r; - const flt_t phi = z2*recip; - const flt_t phip = z2p*recip - phi*recip; - const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; - if (!ONETYPE) - oscale = scale_fi[jtype]; - const flt_t fpair = -oscale*psip*recip; + const flt_t recip = (flt_t)1.0/r; + const flt_t phi = z2*recip; + const flt_t phip = z2p*recip - phi*recip; + const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; + if (!ONETYPE) + oscale = scale_fi[jtype]; + const flt_t fpair = -oscale*psip*recip; - fxtmp += delx*fpair; - fytmp += dely*fpair; - fztmp += delz*fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx*fpair; - f[j].y -= dely*fpair; - f[j].z -= delz*fpair; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + const flt_t evdwl = oscale*phi; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || istop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -604,6 +634,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, void PairEAMIntel::init_style() { PairEAM::init_style(); + if (force->newton_pair == 0) { + 
neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -631,8 +665,15 @@ void PairEAMIntel::init_style() template void PairEAMIntel::pack_force_const(ForceConst &fc, - IntelBuffers *buffers) + IntelBuffers *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,nr,nrho,memory,_cop); buffers->set_ntypes(tp1); @@ -643,14 +684,14 @@ void PairEAMIntel::pack_force_const(ForceConst &fc, for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { - cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; } } } - + _onetype=-1; double oldscale=-1; for (int i = 1; i < tp1; i++) { @@ -668,32 +709,32 @@ void PairEAMIntel::pack_force_const(ForceConst &fc, for (int j = 1; j < tp1; j++) { fc.scale_f[i][j] = scale[i][j]; if (type2rhor[i][j] >= 0) { - const int joff = ioff + j * fc.rhor_jstride(); - for (int k = 0; k < nr + 1; k++) { - if (type2rhor[j][i] != type2rhor[i][j]) - _onetype = 0; + const int joff = ioff + j * fc.rhor_jstride(); + for (int k = 0; k < nr + 1; k++) { + if (type2rhor[j][i] != type2rhor[i][j]) + _onetype = 0; else if (_onetype < 0) - _onetype = i; + _onetype = i; if (oldscale < 0) oldscale = scale[i][j]; else - if (oldscale != scale[i][j]) - _onetype = 0; - fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0]; - fc.rhor_spline_f[joff + 
k].b=rhor_spline[type2rhor[j][i]][k][1]; - fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2]; - fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3]; - fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; - fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; - fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; - fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; - fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; - fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; - fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; - fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; - fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; - fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; - } + if (oldscale != scale[i][j]) + _onetype = 0; + fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0]; + fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1]; + fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2]; + fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3]; + fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; + fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; + fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; + fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; + fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; + fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; + fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; + fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; + fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; + fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; + } } } } @@ -704,9 +745,9 @@ void PairEAMIntel::pack_force_const(ForceConst &fc, template void PairEAMIntel::ForceConst::set_ntypes(const int ntypes, - const int nr, const int nrho, - Memory 
*memory, - const int cop) { + const int nr, const int nrho, + Memory *memory, + const int cop) { if (ntypes != _ntypes || nr > _nr || nrho > _nrho) { if (_ntypes > 0) { _memory->destroy(rhor_spline_f); @@ -739,7 +780,7 @@ void PairEAMIntel::ForceConst::set_ntypes(const int ntypes, /* ---------------------------------------------------------------------- */ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) + int pbc_flag, int *pbc) { if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) return pack_forward_comm(n, list, buf, fp); @@ -761,7 +802,7 @@ void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf) template int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, - flt_t *fp_f) + flt_t *fp_f) { int i,j,m; @@ -776,8 +817,8 @@ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, /* ---------------------------------------------------------------------- */ template -void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, - flt_t *fp_f) +void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, + flt_t *fp_f) { int i,m,last; diff --git a/src/USER-INTEL/pair_eam_intel.h b/src/USER-INTEL/pair_eam_intel.h index f7fb71ad2c..f34e740bda 100644 --- a/src/USER-INTEL/pair_eam_intel.h +++ b/src/USER-INTEL/pair_eam_intel.h @@ -41,7 +41,7 @@ class PairEAMIntel : public PairEAM { protected: FixIntel *fix; - int _cop, _onetype; + int _cop, _onetype, _ccache_stride; float *fp_float; template @@ -53,8 +53,8 @@ class PairEAMIntel : public PairEAM { template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, IntelBuffers * buffers, const ForceConst &fc, const int astart, const int aend); @@ -79,8 +79,8 @@ class PairEAMIntel : public PairEAM { ForceConst() : _ntypes(0), _nr(0) {} ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } - void set_ntypes(const int ntypes, const int nr, const 
int nrho, - Memory *memory, const int cop); + void set_ntypes(const int ntypes, const int nr, const int nrho, + Memory *memory, const int cop); inline int rhor_jstride() const { return _nr; } inline int rhor_istride() const { return _nr * _ntypes; } inline int frho_stride() const { return _nrho; } diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index c1e3d1b37f..ed7dd424af 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -88,23 +88,27 @@ void PairGayBerneIntel::compute(int eflag, int vflag, const AtomVecEllipsoid::Bonus * const bonus = avec->bonus; const int * const ellipsoid = atom->ellipsoid; QUAT_T * _noalias const quat = buffers->get_quat(); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, - sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads, + sizeof(ATOM_T)); if (ago != 0) buffers->thr_pack(ifrom,ito,ago); for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } } } quat[nall].w = (flt_t)1.0; @@ -114,39 +118,29 @@ void PairGayBerneIntel::compute(int eflag, int vflag, fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, 
host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } -template +template void PairGayBerneIntel::eval(const int offload, const int vflag, IntelBuffers *buffers, const ForceConst &fc, @@ -167,66 +161,65 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, if (fix->separate_buffers()) { fix->start_watch(TIME_PACK); if (offload) { - #pragma omp parallel default(none) \ - shared(buffers,nlocal,nall,bonus,ellipsoid) + #pragma omp parallel { int ifrom, ito, tid; - int nthreads = comm->nthreads; - IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, - nthreads, sizeof(ATOM_T)); - if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); - for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = 
bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } - } - int nghost = nall - nlocal; - if (nghost) { - IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, - nthreads, sizeof(ATOM_T)); - int offset = 0; - ifrom += nlocal; - ito += nlocal; - if (ago != 0) { - offset = fix->offload_min_ghost() - nlocal; - buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); - } - for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i + offset]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } - } - } + int nthreads = comm->nthreads; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, + nthreads, sizeof(ATOM_T)); + if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + int nghost = nall - nlocal; + if (nghost) { + IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, + nthreads, sizeof(ATOM_T)); + int offset = 0; + ifrom += nlocal; + ito += nlocal; + if (ago != 0) { + offset = fix->offload_min_ghost() - nlocal; + buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); + } + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + } } } else { if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); for (int i = fix->host_min_local(); i < nlocal; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + 
quat[i].k = bonus[qi].quat[3]; + } } int offset = fix->host_min_ghost() - nlocal; if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset); for (int i = nlocal; i < nall; i++) { - int qi = ellipsoid[i + offset]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } } } fix->stop_watch(TIME_PACK); @@ -258,9 +251,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -310,30 +303,31 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, #ifdef _LMP_INTEL_OFFLOAD if (separate_flag) { if (separate_flag < 3) { - int all_local = nlocal; - int ghost_min = overflow[LMP_GHOST_MIN]; - nlocal = overflow[LMP_LOCAL_MAX] + 1; - int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; - if (nghost < 0) nghost = 0; - nall = nlocal + nghost; - separate_flag--; - int flength; - if (NEWTON_PAIR) flength = nall; - else flength = nlocal; - IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), - separate_flag); - if (nghost) { - if (nlocal < all_local || ghost_min > all_local) { - memmove(x + nlocal, x + ghost_min, - (nall - nlocal) * sizeof(ATOM_T)); - memmove(quat + nlocal, quat + ghost_min, - (nall - nlocal) * sizeof(QUAT_T)); - } - } + int all_local = nlocal; + int ghost_min = overflow[LMP_GHOST_MIN]; + nlocal = overflow[LMP_LOCAL_MAX] + 1; + int nghost = 
overflow[LMP_GHOST_MAX] + 1 - ghost_min; + if (nghost < 0) nghost = 0; + nall = nlocal + nghost; + separate_flag--; + int flength; + if (NEWTON_PAIR) flength = nall; + else flength = nlocal; + IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), + separate_flag); + if (nghost) { + if (nlocal < all_local || ghost_min > all_local) { + memmove(x + nlocal, x + ghost_min, + (nall - nlocal) * sizeof(ATOM_T)); + memmove(quat + nlocal, quat + ghost_min, + (nall - nlocal) * sizeof(QUAT_T)); + } + } } x[nall].x = (flt_t)INTEL_BIGP; x[nall].y = (flt_t)INTEL_BIGP; x[nall].z = (flt_t)INTEL_BIGP; + x[nall].w = 1; quat[nall].w = (flt_t)1.0; quat[nall].i = (flt_t)0.0; quat[nall].j = (flt_t)0.0; @@ -342,25 +336,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, #endif acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0.0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = (acc_t)0.0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + if (NEWTON_PAIR == 0) f_start[1].w = 0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride); - memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2; + else foff = minlocal*-2; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors; flt_t * _noalias const delx_form = 
delx_formi + tid * max_nbors; @@ -370,7 +364,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, int * _noalias const jlist_form = jlist_formi + tid * max_nbors; int ierror = 0; - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { // const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -401,13 +395,16 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } bool multiple_forms = false; int packed_j = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int jj = 0; jj < jnum; jj++) { int jm = jlist[jj]; int j = jm & NEIGHMASK; @@ -431,27 +428,27 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } else multiple_forms = true; } - const int edge = (packed_j % pad_width); - if (edge) { - const int packed_end = packed_j + (pad_width - edge); + const int edge = (packed_j % pad_width); + if (edge) { + const int packed_end = packed_j + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) #pragma loop_count min=1, max=15, avg=8 #endif - for ( ; packed_j < packed_end; packed_j++) - jlist_form[packed_j] = nall; - } - + for ( ; packed_j < packed_end; packed_j++) + jlist_form[packed_j] = nall; + } + // ------------------------------------------------------------- - #ifdef INTEL_V512 - __assume(packed_j % INTEL_VECTOR_WIDTH == 0); - __assume(packed_j % 8 == 0); - __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); - #endif + #ifdef INTEL_V512 + __assume(packed_j % INTEL_VECTOR_WIDTH == 0); + __assume(packed_j % 8 == 0); + __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); + #endif #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma 
simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ - sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ + sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; @@ -461,15 +458,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2; flt_t rtor_0, rtor_1, rtor_2; - const int sbindex = jlist_form[jj] >> SBBITS & 3; - const int j = jlist_form[jj] & NEIGHMASK; + const int sbindex = jlist_form[jj] >> SBBITS & 3; + const int j = jlist_form[jj] & NEIGHMASK; flt_t factor_lj = special_lj[sbindex]; const int jtype = jtype_form[jj]; - const flt_t sigma = ijci[jtype].sigma; - const flt_t epsilon = ijci[jtype].epsilon; - const flt_t shape2_0 = ic[jtype].shape2[0]; - const flt_t shape2_1 = ic[jtype].shape2[1]; - const flt_t shape2_2 = ic[jtype].shape2[2]; + const flt_t sigma = ijci[jtype].sigma; + const flt_t epsilon = ijci[jtype].epsilon; + const flt_t shape2_0 = ic[jtype].shape2[0]; + const flt_t shape2_1 = ic[jtype].shape2[1]; + const flt_t shape2_2 = ic[jtype].shape2[2]; flt_t one_eng, evdwl; ME_quat_to_mat_trans(quat[j], a2); @@ -491,7 +488,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ME_plus3(g1, g2, g12); flt_t kappa_0, kappa_1, kappa_2; ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj], - kappa, ierror); + kappa, ierror); // tempv = G12^-1*r12hat @@ -523,7 +520,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, flt_t iota_0, iota_1, iota_2; ME_plus3(b1, b2, b12); ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj], - iota, ierror); + iota, ierror); // tempv = G12^-1*r12hat @@ -537,7 +534,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // compute dUr/dr temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / - sigma; + sigma; temp1 = temp1 * (flt_t)24.0 * 
epsilon; flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5; flt_t dUr_0, dUr_1, dUr_2; @@ -551,8 +548,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, flt_t dchi_0, dchi_1, dchi_2; temp1 = ME_dot3(iota, r12hat); - temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * - std::pow(chi, (mu - (flt_t)1.0) / mu); + temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * + std::pow(chi, (mu - (flt_t)1.0) / mu); dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0); dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1); dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2); @@ -573,7 +570,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ME_cross3(tempv, tempv2, dUr); flt_t dUr2_0, dUr2_1, dUr2_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { ME_vecmat(kappa, g2, tempv2); ME_cross3(tempv, tempv2, dUr2); } @@ -588,7 +585,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, dchi_2 *= temp1; flt_t dchi2_0, dchi2_1, dchi2_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { ME_vecmat(iota, b2, tempv); ME_cross3(tempv, iota, dchi2); dchi2_0 *= temp1; @@ -630,7 +627,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // compute d_eta for particle 2 flt_t deta2_0, deta2_1, deta2_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0; ME_compute_eta_torque(g12, a2, shape2, temp); @@ -666,36 +663,36 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, temp3 = chi * eta; ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * - (flt_t)-1.0; + (flt_t)-1.0; ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * - (flt_t)-1.0; + (flt_t)-1.0; ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * - (flt_t)-1.0; + (flt_t)-1.0; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * - (flt_t)-1.0; + (flt_t)-1.0; rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * - (flt_t)-1.0; + 
(flt_t)-1.0; rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * - (flt_t)-1.0; + (flt_t)-1.0; } one_eng = temp1 * chi; - #ifndef INTEL_VMASK - if (jlist_form[jj] == nall) { - one_eng = (flt_t)0.0; - fforce_0 = 0.0; - fforce_1 = 0.0; - fforce_2 = 0.0; - ttor_0 = 0.0; - ttor_1 = 0.0; - ttor_2 = 0.0; - rtor_0 = 0.0; - rtor_1 = 0.0; - rtor_2 = 0.0; - } - #endif + #ifndef INTEL_VMASK + if (jlist_form[jj] == nall) { + one_eng = (flt_t)0.0; + fforce_0 = 0.0; + fforce_1 = 0.0; + fforce_2 = 0.0; + ttor_0 = 0.0; + ttor_1 = 0.0; + ttor_2 = 0.0; + rtor_0 = 0.0; + rtor_1 = 0.0; + rtor_2 = 0.0; + } + #endif fforce_0 *= factor_lj; fforce_1 *= factor_lj; @@ -704,61 +701,53 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ttor_1 *= factor_lj; ttor_2 *= factor_lj; - #ifdef INTEL_VMASK - if (jlist_form[jj] < nall) { - #endif - fxtmp += fforce_0; - fytmp += fforce_1; - fztmp += fforce_2; - t1tmp += ttor_0; - t2tmp += ttor_1; - t3tmp += ttor_2; + #ifdef INTEL_VMASK + if (jlist_form[jj] < nall) { + #endif + fxtmp += fforce_0; + fytmp += fforce_1; + fztmp += fforce_2; + t1tmp += ttor_0; + t2tmp += ttor_1; + t3tmp += ttor_2; - if (NEWTON_PAIR || j < nlocal) { - rtor_0 *= factor_lj; - rtor_1 *= factor_lj; - rtor_2 *= factor_lj; - int jp = j * 2; - f[jp].x -= fforce_0; - f[jp].y -= fforce_1; - f[jp].z -= fforce_2; - jp++; - f[jp].x += rtor_0; - f[jp].y += rtor_1; - f[jp].z += rtor_2; - } + if (NEWTON_PAIR) { + rtor_0 *= factor_lj; + rtor_1 *= factor_lj; + rtor_2 *= factor_lj; + int jp = j * 2; + f[jp].x -= fforce_0; + f[jp].y -= fforce_1; + f[jp].z -= fforce_2; + jp++; + f[jp].x += rtor_0; + f[jp].y += rtor_1; + f[jp].z += rtor_2; + } - if (EVFLAG) { - flt_t ev_pre = (flt_t)0.0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; + if (EFLAG) { + evdwl = factor_lj * one_eng; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j*2].w += (flt_t)0.5 * evdwl; + } + } 
- if (EFLAG) { - evdwl = factor_lj * one_eng; - sevdwl += ev_pre * evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j*2].w += (flt_t)0.5 * evdwl; - } - } - - if (vflag == 1) { - ev_pre *= (flt_t)-1.0; - sv0 += ev_pre * delx_form[jj] * fforce_0; - sv1 += ev_pre * dely_form[jj] * fforce_1; - sv2 += ev_pre * delz_form[jj] * fforce_2; - sv3 += ev_pre * delx_form[jj] * fforce_1; - sv4 += ev_pre * delx_form[jj] * fforce_2; - sv5 += ev_pre * dely_form[jj] * fforce_2; - } - } // EVFLAG - #ifdef INTEL_VMASK - } - #endif + if (NEWTON_PAIR == 0) { + if (vflag == 1) { + sv0 += delx_form[jj] * fforce_0; + sv1 += dely_form[jj] * fforce_1; + sv2 += delz_form[jj] * fforce_2; + sv3 += delx_form[jj] * fforce_1; + sv4 += delx_form[jj] * fforce_2; + sv5 += dely_form[jj] * fforce_2; + } + } // EVFLAG + #ifdef INTEL_VMASK + } + #endif } // for jj // ------------------------------------------------------------- @@ -767,19 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ierror = 2; int ip = i * 2; - f[ip].x += fxtmp; - f[ip].y += fytmp; - f[ip].z += fztmp; - ip++; - f[ip].x += t1tmp; - f[ip].y += t2tmp; - f[ip].z += t3tmp; + if (NEWTON_PAIR) { + f[ip].x += fxtmp; + f[ip].y += fytmp; + f[ip].z += fztmp; + ip++; + f[ip].x += t1tmp; + f[ip].y += t2tmp; + f[ip].z += t3tmp; + } else { + f[ip].x = fxtmp; + f[ip].y = fytmp; + f[ip].z = fztmp; + ip++; + f[ip].x = t1tmp; + f[ip].y = t2tmp; + f[ip].z = t3tmp; + } - if (EVFLAG) { - if (EFLAG) { - if (eatom) f[i * 2].w += fwtmp; - oevdwl += sevdwl; - } + if (EFLAG) { + oevdwl += sevdwl; + if (eatom) f[i * 2].w += fwtmp; + } + if (NEWTON_PAIR == 0) { if (vflag == 1) { ov0 += sv0; ov1 += sv1; @@ -791,57 +790,32 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } } // for i int o_range; - if (NEWTON_PAIR) + if (NEWTON_PAIR) { o_range = nall; - else - o_range = nlocal; - if (offload == 0) o_range -= minlocal; - 
IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, - sizeof(FORCE_T)); - const int two_iito = iito * 2; - - acc_t *facc = &(f_start[0].x); - const int sto = two_iito * 4; - const int fst4 = f_stride * 4; - #if defined(_OPENMP) - #pragma omp barrier - #endif - int t_off = f_stride; - if (EFLAG && eatom) { + if (offload == 0) o_range -= minlocal; + IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, + sizeof(FORCE_T)); + const int sto = iito * 8; + const int fst4 = f_stride * 4; + #if defined(_OPENMP) + #pragma omp barrier + #endif + acc_t *f_scalar = &f_start[0].x; + acc_t *f_scalar2 = f_scalar + fst4; for (int t = 1; t < nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal - #pragma novector + #pragma vector aligned + #pragma simd #endif - for (int n = iifrom * 2; n < two_iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - f_start[n].w += f_start[n + t_off].w; - } - t_off += f_stride; + for (int n = iifrom * 8; n < sto; n++) + f_scalar[n] += f_scalar2[n]; + f_scalar2 += fst4; } - } else { - for (int t = 1; t < nthreads; t++) { - #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal - #pragma novector - #endif - for (int n = iifrom * 2; n < two_iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - } - t_off += f_stride; - } - } - if (EVFLAG) { if (vflag==2) { const ATOM_T * _noalias const xo = x + minlocal; #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal - #pragma novector + #pragma novector #endif for (int n = iifrom; n < iito; n++) { const int nt2 = n * 2; @@ -859,19 +833,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, f_start[1].w = ierror; } // omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - 
ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)-0.5; + ov1 *= (acc_t)-0.5; + ov2 *= (acc_t)-0.5; + ov3 *= (acc_t)-0.5; + ov4 *= (acc_t)-0.5; + ov5 *= (acc_t)-0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) @@ -884,7 +865,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2); else fix->add_result_array(f_start, 0, offload, 0, 0, 2); @@ -895,6 +876,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, void PairGayBerneIntel::init_style() { PairGayBerne::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -997,7 +982,7 @@ void PairGayBerneIntel::ForceConst::set_ntypes(const int ntypes, const int one_length, const int nthreads, Memory *memory, - const int cop) { + const int cop) { if (ntypes != _ntypes) { if (_ntypes > 0) { fc_packed3 *oic = ic; @@ -1014,9 +999,9 @@ void PairGayBerneIntel::ForceConst::set_ntypes(const int ntypes, int * ojlist_form = jlist_form[0]; if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && - orsq_form != NULL && odelx_form != NULL && odely_form != NULL && - odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && - _cop >= 0) { + orsq_form != NULL && odelx_form != NULL && odely_form != NULL && + odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && + _cop >= 0) { #pragma offload_transfer target(mic:_cop) \ 
nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \ nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \ @@ -1048,14 +1033,14 @@ void PairGayBerneIntel::ForceConst::set_ntypes(const int ntypes, memory->create(jlist_form, nthreads, one_length, "jlist_form"); for (int zn = 0; zn < nthreads; zn++) - for (int zo = 0; zo < one_length; zo++) { - rsq_form[zn][zo] = 10.0; - delx_form[zn][zo] = 10.0; - dely_form[zn][zo] = 10.0; - delz_form[zn][zo] = 10.0; - jtype_form[zn][zo] = 1; - jlist_form[zn][zo] = 0; - } + for (int zo = 0; zo < one_length; zo++) { + rsq_form[zn][zo] = 10.0; + delx_form[zn][zo] = 10.0; + dely_form[zn][zo] = 10.0; + delz_form[zn][zo] = 10.0; + jtype_form[zn][zo] = 1; + jlist_form[zn][zo] = 0; + } #ifdef _LMP_INTEL_OFFLOAD flt_t * ospecial_lj = special_lj; @@ -1072,9 +1057,9 @@ void PairGayBerneIntel::ForceConst::set_ntypes(const int ntypes, int tp1sq = ntypes*ntypes; if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && - oic != NULL && orsq_form != NULL && odelx_form != NULL && - odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && - ojlist_form !=NULL && cop >= 0) { + oic != NULL && orsq_form != NULL && odelx_form != NULL && + odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && + ojlist_form !=NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h index aaed31d567..07dfba14d1 100644 --- a/src/USER-INTEL/pair_gayberne_intel.h +++ b/src/USER-INTEL/pair_gayberne_intel.h @@ -43,7 +43,7 @@ class PairGayBerneIntel : public PairGayBerne { template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, IntelBuffers * buffers, const ForceConst &fc, const int astart, const int aend); diff --git 
a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index ce6e40141f..fe99525122 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -67,8 +67,8 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag) template void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -82,58 +82,52 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } // -------------------- Regular version - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } 
else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template +template void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -182,9 +176,9 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -233,28 +227,27 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = 
(acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); flt_t cutboth = cut_coulsq; const int toffs = tid * ccache_stride; @@ -265,8 +258,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; ++i) { - // const int i = ilist[ii]; + for (int i = iifrom; i < iito; i += iip) { + // const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -277,221 +270,219 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 
= sv4 = sv5 = (acc_t)0; - int ej = 0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma ivdep + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; + const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq < cut_coulsq) { - trsq[ej]=rsq; - tdelx[ej]=delx; - tdely[ej]=dely; - tdelz[ej]=delz; - tjtype[ej]=x[j].w; - tj[ej]=jlist[jj]; - ej++; - } - } + if (rsq < cut_coulsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; - const int j = tj[jj] & NEIGHMASK; + const int j = tj[jj] & NEIGHMASK; const int sbindex = tj[jj] >> SBBITS & 3; - const int jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; #ifdef INTEL_ALLOW_TABLE if (!ncoultablebits || rsq <= tabinnersq) { #endif const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; - const flt_t r = 
(flt_t)1.0 / sqrt(r2inv); - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; + const flt_t r = (flt_t)1.0 / sqrt(r2inv); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] 
* tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } } #endif - #ifdef INTEL_VMASK - if (rsq < cut_ljsq) { - #endif + #ifdef INTEL_VMASK + if (rsq < cut_ljsq) { + #endif flt_t r6inv = r2inv * r2inv * r2inv; forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); - #ifdef INTEL_VMASK - if (rsq > cut_lj_innersq) { - #endif + #ifdef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif const flt_t drsq = cut_ljsq - rsq; const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * inv_denom_lj; const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; if (EFLAG) { - #ifndef INTEL_VMASK - if (rsq > cut_lj_innersq) { - #endif + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif forcelj = forcelj * switch1 + evdwl * switch2; evdwl *= switch1; - #ifndef INTEL_VMASK - } - #endif + #ifndef INTEL_VMASK + } + #endif } else { const flt_t philj = r6inv * (lji[jtype].z*r6inv - lji[jtype].w); - #ifndef INTEL_VMASK - if (rsq > cut_lj_innersq) - #endif + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) + #endif forcelj = forcelj * switch1 + philj * switch2; } - #ifdef INTEL_VMASK - } - #endif + #ifdef INTEL_VMASK + } + #endif if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcelj *= factor_lj; if (EFLAG) evdwl *= factor_lj; } - #ifdef INTEL_VMASK - } - #else - if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } - if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } - #endif + #ifdef INTEL_VMASK + } + #else + if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif - const flt_t delx = 
tdelx[jj]; - const flt_t dely = tdely[jj]; - const flt_t delz = tdelz[jj]; - const flt_t fpair = (forcecoul + forcelj) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } + const flt_t fpair = (forcecoul + forcelj) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - delx, dely, delz); - } + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, 
nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -503,7 +494,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -514,6 +505,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, void PairLJCharmmCoulLongIntel::init_style() { PairLJCharmmCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -541,11 +536,6 @@ template void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst &fc, IntelBuffers *buffers) { - int tp1 = atom->ntypes + 1; - int ntable = 1; - if (ncoultablebits) - for 
(int i = 0; i < ncoultablebits; i++) ntable *= 2; - int off_ccache = 0; #ifdef _LMP_INTEL_OFFLOAD if (_cop >= 0) off_ccache = 1; @@ -553,6 +543,11 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst &fc, buffers->grow_ccache(off_ccache, comm->nthreads, 1); _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; + int ntable = 1; + if (ncoultablebits) + for (int i = 0; i < ncoultablebits; i++) ntable *= 2; + fc.set_ntypes(tp1, ntable, memory, _cop); buffers->set_ntypes(tp1); flt_t **cutneighsq = buffers->get_cutneighsq(); @@ -561,7 +556,7 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst &fc, double cut, cutneigh; if (cut_lj > cut_coul) error->all(FLERR, - "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); + "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { @@ -642,7 +637,7 @@ template void PairLJCharmmCoulLongIntel::ForceConst::set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop) { + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -658,12 +653,12 @@ void PairLJCharmmCoulLongIntel::ForceConst::set_ntypes(const int ntypes, if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && otable != NULL && oetable != NULL && odetable != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL && - cop >= 0) { + cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) 
free_if(1)) } #endif @@ -699,7 +694,7 @@ void PairLJCharmmCoulLongIntel::ForceConst::set_ntypes(const int ntypes, if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && otable !=NULL && oetable != NULL && odetable != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL && - cop >= 0) { + cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h index 6a207d8400..1b13d78497 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -48,10 +48,10 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, @@ -75,7 +75,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index f26ff724c8..e9775d6ec5 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -68,8 +68,8 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag) template void PairLJCutCoulLongIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { 
ev_setup(eflag,vflag); @@ -83,57 +83,50 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, 
host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template +template void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -167,11 +160,19 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -204,8 +205,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(f_stride,nlocal,minlocal,separate_flag,offload) \ + 
in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -217,30 +220,37 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * 
ntypes; @@ -251,100 +261,112 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - flt_t forcecoul, forcelj, evdwl, ecoul; - forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; - - const int sbindex = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq < c_forcei[jtype].cutsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcelj, evdwl, ecoul; + forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; - #ifdef INTEL_VMASK - if (rsq < 
c_forcei[jtype].cutsq) { + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { #endif - #ifdef INTEL_ALLOW_TABLE - if (!ncoultablebits || rsq <= tabinnersq) { - #endif - const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; - const flt_t r = (flt_t)1.0 / sqrt(r2inv); - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; + const flt_t r = (flt_t)1.0 / sqrt(r2inv); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; - #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + 
const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; } - #endif - #ifdef INTEL_VMASK - } - #endif + } + #endif - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cut_ljsq) { - #endif + #ifdef INTEL_VMASK + if (rsq < c_forcei[jtype].cut_ljsq) { + #endif flt_t r6inv = r2inv * r2inv * r2inv; forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv - - c_forcei[jtype].lj2); + c_forcei[jtype].lj2); if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv - c_energyi[jtype].lj4) - c_energyi[jtype].offset; @@ -354,83 +376,82 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, forcelj *= factor_lj; if (EFLAG) evdwl *= factor_lj; } - #ifdef INTEL_VMASK - } - #else - if (rsq > c_forcei[jtype].cutsq) - { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } - if (rsq > c_forcei[jtype].cut_ljsq) - { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } - #endif - - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { - #endif - const flt_t fpair = (forcecoul + forcelj) * r2inv; - 
fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); - } #ifdef INTEL_VMASK - } - #endif + } + #else + if (rsq > c_forcei[jtype].cut_ljsq) + { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + + const flt_t fpair = (forcecoul + forcelj) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, 
minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -442,7 +463,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -453,6 +474,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, void PairLJCutCoulLongIntel::init_style() { PairLJCutCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -480,6 +505,13 @@ template void PairLJCutCoulLongIntel::pack_force_const(ForceConst &fc, IntelBuffers *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 
1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; int ntable = 1; if (ncoultablebits) @@ -514,6 +546,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst &fc, for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { + if (cutsq[i][j] < cut_ljsq[i][j]) + error->all(FLERR, + "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].lj1 = lj1[i][j]; @@ -563,9 +598,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst &fc, template void PairLJCutCoulLongIntel::ForceConst::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -584,9 +619,9 @@ void PairLJCutCoulLongIntel::ForceConst::set_ntypes(const int ntypes, ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) } #endif diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h index dad73d18bd..288a6a7bc4 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h @@ -42,16 +42,16 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { private: FixIntel *fix; - int _cop, _lrt; + int _cop, _lrt, _ccache_stride; template class ForceConst; template void compute(int eflag, int vflag, IntelBuffers 
*buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, @@ -76,7 +76,7 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index dd08dc023c..4871821842 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -75,85 +75,64 @@ void PairLJCutIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if (_onetype) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + 
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - 
eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } } -template +template void PairLJCutIntel::eval(const int offload, const int vflag, IntelBuffers *buffers, const ForceConst &fc, @@ -181,9 +160,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -197,48 +176,47 @@ void PairLJCutIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - 
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); flt_t cutsq, lj1, lj2, lj3, lj4, offset; if (ONETYPE) { - cutsq = ljc12o[3].cutsq; - lj1 = ljc12o[3].lj1; - lj2 = ljc12o[3].lj2; - lj3 = lj34[3].lj3; - lj4 = lj34[3].lj4; - offset = ljc12o[3].offset; + cutsq = ljc12o[3].cutsq; + lj1 = ljc12o[3].lj1; + lj2 = ljc12o[3].lj2; + lj3 = lj34[3].lj3; + lj4 = lj34[3].lj4; + offset = ljc12o[3].offset; } - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { int itype, ptr_off; const FC_PACKED1_T * _noalias ljc12oi; const FC_PACKED2_T * _noalias lj34i; - if (!ONETYPE) { - itype = x[i].w; + if (!ONETYPE) { + itype = x[i].w; ptr_off = itype * ntypes; ljc12oi = ljc12o + ptr_off; lj34i = lj34 + ptr_off; - } + } const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; @@ -250,134 +228,134 @@ void PairLJCutIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; forcelj = evdwl = (flt_t)0.0; - int j, jtype, sbindex; - if (!ONETYPE) { - sbindex = jlist[jj] >> SBBITS & 3; - j = jlist[jj] & NEIGHMASK; - } else - j = jlist[jj]; + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = 
jlist[jj]; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; if (!ONETYPE) { - jtype = x[j].w; + jtype = x[j].w; cutsq = ljc12oi[jtype].cutsq; - } + } const flt_t rsq = delx * delx + dely * dely + delz * delz; #ifdef INTEL_VMASK if (rsq < cutsq) { - #endif + #endif flt_t factor_lj; - if (!ONETYPE) factor_lj = special_lj[sbindex]; + if (!ONETYPE) factor_lj = special_lj[sbindex]; flt_t r2inv = 1.0 / rsq; flt_t r6inv = r2inv * r2inv * r2inv; #ifndef INTEL_VMASK - if (rsq > cutsq) r6inv = (flt_t)0.0; - #endif - if (!ONETYPE) { - lj1 = ljc12oi[jtype].lj1; - lj2 = ljc12oi[jtype].lj2; - } + if (rsq > cutsq) r6inv = (flt_t)0.0; + #endif + if (!ONETYPE) { + lj1 = ljc12oi[jtype].lj1; + lj2 = ljc12oi[jtype].lj2; + } forcelj = r6inv * (lj1 * r6inv - lj2); flt_t fpair; - if (!ONETYPE) - fpair = factor_lj * forcelj * r2inv; - else - fpair = forcelj * r2inv; + if (!ONETYPE) + fpair = factor_lj * forcelj * r2inv; + else + fpair = forcelj * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || istop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -400,6 +378,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag, void PairLJCutIntel::init_style() { PairLJCut::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } 
neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -472,7 +454,7 @@ void PairLJCutIntel::pack_force_const(ForceConst &fc, template void PairLJCutIntel::ForceConst::set_ntypes(const int ntypes, Memory *memory, - const int cop) { + const int cop) { if (ntypes != _ntypes) { if (_ntypes > 0) { fc_packed1 *oljc12o = ljc12o[0]; diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h index a9c77324f3..b577a04658 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.h +++ b/src/USER-INTEL/pair_lj_cut_intel.h @@ -45,8 +45,7 @@ class PairLJCutIntel : public PairLJCut { template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, IntelBuffers * buffers, const ForceConst &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp new file mode 100644 index 0000000000..86929d41ea --- /dev/null +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include +#include "pair_lj_long_coul_long_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "group.h" +#include "kspace.h" +#include "memory.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "suffix.h" + + +using namespace LAMMPS_NS; + +#define C_FORCE_T typename ForceConst::c_force_t +#define C_ENERGY_T typename ForceConst::c_energy_t +#define TABLE_T typename ForceConst::table_t + +PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) : + PairLJLongCoulLong(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + + +PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel() +{ +} diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.h b/src/USER-INTEL/pair_lj_long_coul_long_intel.h new file mode 100644 index 0000000000..b7d3504ecd --- /dev/null +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.h @@ -0,0 +1,39 @@ +/* *- c++ -*- ----------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel) + +#else + +#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H +#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H + +#include "pair_lj_long_coul_long.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + class PairLJLongCoulLongIntel : public PairLJLongCoulLong { + public: + PairLJLongCoulLongIntel(class LAMMPS *); + virtual ~PairLJLongCoulLongIntel(); + + }; +} +#endif +#endif diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 09e00fd867..7a6b7afd92 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -77,7 +77,7 @@ void PairSWIntel::compute(int eflag, int vflag) { if (fix->precision() == FixIntel::PREC_MODE_MIXED) compute(eflag, vflag, fix->get_mixed_buffers(), - force_const_single); + force_const_single); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) compute(eflag, vflag, fix->get_double_buffers(), force_const_double); @@ -109,85 +109,59 @@ void PairSWIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom, ito, ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if (_onetype) { if (_spq) { 
- if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } else { if (_spq) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, 
_host_pad); - } else { - eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } @@ -196,11 +170,11 @@ void PairSWIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ #ifndef LMP_USE_AVXCD -template +template void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers *buffers, const ForceConst &fc, const int astart, - const int aend, const int pad_width) + const int aend, const int pad_width) { const int inum = aend - astart; if (inum == 0) return; @@ -235,7 +209,7 @@ void PairSWIntel::eval(const int offload, const int vflag, // 
Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -276,19 +250,15 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; @@ -308,50 +278,49 @@ void PairSWIntel::eval(const int offload, const int vflag, flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { cutsq = p2[3].cutsq; - cut = p2f[3].cut; - sigma = p2f[3].sigma; - c1 = p2f2[3].c1; - c2 = p2f2[3].c2; - c3 = p2f2[3].c3; - c4 = p2f2[3].c4; - sigma_gamma = p2[3].sigma_gamma; - costheta = p3[7].costheta; - lambda_epsilon = p3[7].lambda_epsilon; - lambda_epsilon2 = p3[7].lambda_epsilon2; - if (SPQ == 0) { + cut = p2f[3].cut; + sigma = p2f[3].sigma; + c1 = p2f2[3].c1; + c2 = p2f2[3].c2; + c3 = p2f2[3].c3; + c4 = p2f2[3].c4; + sigma_gamma = p2[3].sigma_gamma; + costheta = p3[7].costheta; + lambda_epsilon = p3[7].lambda_epsilon; + lambda_epsilon2 = p3[7].lambda_epsilon2; + if (SPQ == 0) { powerp = p2f[3].powerp; - powerq = p2f[3].powerq; + powerq = p2f[3].powerq; } - if (EFLAG) { + if (EFLAG) { c5 = p2e[3].c5; - c6 = p2e[3].c6; + c6 = p2e[3].c6; } } - for (int i 
= iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { int itype, itype_offset; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; - if (!ONETYPE) { + if (!ONETYPE) { itype = x[i].w; - itype_offset = itype * ntypes; - } + itype_offset = itype * ntypes; + } const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; - const int jnumhalf = numneighhalf[i]; + const int jnumhalf = numneighhalf[i]; acc_t fxtmp, fytmp, fztmp, fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl; fxtmp = fytmp = fztmp = (acc_t)0.0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = (acc_t)0; - int ejnum = 0, ejnumhalf = 0; + int ejnum = 0, ejnumhalf = 0; + #pragma vector aligned + #pragma ivdep for (int jj = 0; jj < jnum; jj++) { int j = jlist[jj]; j &= NEIGHMASK; @@ -360,117 +329,115 @@ void PairSWIntel::eval(const int offload, const int vflag, const flt_t delz = x[j].z - ztmp; int jtype, ijtype; if (!ONETYPE) { - jtype = x[j].w; - ijtype = itype_offset + jtype; - cutsq = p2[ijtype].cutsq; - } + jtype = x[j].w; + ijtype = itype_offset + jtype; + cutsq = p2[ijtype].cutsq; + } const flt_t rsq1 = delx * delx + dely * dely + delz * delz; if (rsq1 < cutsq) { - tdelx[ejnum] = delx; - tdely[ejnum] = dely; - tdelz[ejnum] = delz; - trsq[ejnum] = rsq1; - tj[ejnum] = j; - if (!ONETYPE) tjtype[ejnum] = jtype; - ejnum++; - if (jj < jnumhalf) ejnumhalf++; - } - } - int ejnum_pad = ejnum; - - while ( (ejnum_pad % pad_width) != 0) { - tdelx[ejnum_pad] = (flt_t)0.0; - tdely[ejnum_pad] = (flt_t)0.0; - tdelz[ejnum_pad] = (flt_t)0.0; - trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; - tj[ejnum_pad] = nall; - if (!ONETYPE) tjtype[ejnum_pad] = 0; - ejnum_pad++; - } - + tdelx[ejnum] = delx; + tdely[ejnum] = dely; + tdelz[ejnum] = delz; + trsq[ejnum] = rsq1; + tj[ejnum] = j; + if (!ONETYPE) tjtype[ejnum] = jtype; + ejnum++; 
+ if (jj < jnumhalf) ejnumhalf++; + } + } + int ejnum_pad = ejnum; + + while ( (ejnum_pad % pad_width) != 0) { + tdelx[ejnum_pad] = (flt_t)0.0; + tdely[ejnum_pad] = (flt_t)0.0; + tdelz[ejnum_pad] = (flt_t)0.0; + trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; + tj[ejnum_pad] = nall; + if (!ONETYPE) tjtype[ejnum_pad] = 0; + ejnum_pad++; + } + #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) - #endif + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) + #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; fjxtmp = fjytmp = fjztmp = (acc_t)0.0; if (EFLAG) fjtmp = (acc_t)0.0; - int ijtype; + int ijtype; - const flt_t delx = tdelx[jj]; - const flt_t dely = tdely[jj]; - const flt_t delz = tdelz[jj]; - if (!ONETYPE) ijtype = tjtype[jj] + itype_offset; + if (!ONETYPE) ijtype = tjtype[jj] + itype_offset; const flt_t rsq1 = trsq[jj]; const flt_t rinvsq1 = (flt_t)1.0 / rsq1; const flt_t r1 = (flt_t)1.0/sqrt(rinvsq1); - if (!ONETYPE) cut = p2f[ijtype].cut; + if (!ONETYPE) cut = p2f[ijtype].cut; const flt_t rainv1 = (flt_t)1.0 / (r1 - cut); - - // two-body interactions, skip half of them - flt_t rp, rq; - if (SPQ == 1) { - rp = r1 * r1; - rp *= rp; - rp = (flt_t)1.0 / rp; - rq = (flt_t)1.0; - } else { + + // two-body interactions, skip half of them + flt_t rp, rq; + if (SPQ == 1) { + rp = r1 * r1; + rp *= rp; + rp = (flt_t)1.0 / rp; + rq = (flt_t)1.0; + } else { if (!ONETYPE) { powerp = p2f[ijtype].powerp; - powerq = p2f[ijtype].powerq; + powerq = p2f[ijtype].powerq; } - rp = std::pow(r1, powerp); - rq = std::pow(r1, powerq); - } - - if (!ONETYPE) { - sigma = p2f[ijtype].sigma; - c1 = p2f2[ijtype].c1; - c2 = p2f2[ijtype].c2; - c3 = p2f2[ijtype].c3; - c4 = p2f2[ijtype].c4; + rp = std::pow(r1, powerp); + rq = std::pow(r1, powerq); } - const flt_t rainvsq = rainv1 * rainv1 * r1; - flt_t expsrainv = exp(sigma * 
rainv1); - if (jj >= ejnumhalf) expsrainv = (flt_t)0.0; - const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * - rainvsq) * expsrainv * rinvsq1; + if (!ONETYPE) { + sigma = p2f[ijtype].sigma; + c1 = p2f2[ijtype].c1; + c2 = p2f2[ijtype].c2; + c3 = p2f2[ijtype].c3; + c4 = p2f2[ijtype].c4; + } - fxtmp -= delx * fpair; - fytmp -= dely * fpair; - fztmp -= delz * fpair; - fjxtmp += delx * fpair; - fjytmp += dely * fpair; - fjztmp += delz * fpair; + const flt_t rainvsq = rainv1 * rainv1 * r1; + flt_t expsrainv = exp(sigma * rainv1); + if (jj >= ejnumhalf) expsrainv = (flt_t)0.0; + const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * + rainvsq) * expsrainv * rinvsq1; - if (EVFLAG) { - if (EFLAG) { - flt_t evdwl; - if (!ONETYPE) { - c5 = p2e[ijtype].c5; - c6 = p2e[ijtype].c6; - } - evdwl = (c5 * rp - c6 * rq) * expsrainv; - sevdwl += evdwl; - if (eatom) { - fwtmp += (acc_t)0.5 * evdwl; - fjtmp += (acc_t)0.5 * evdwl; - } - } - IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair, - -delx, -dely, -delz); - } + const flt_t delx = tdelx[jj]; + const flt_t dely = tdely[jj]; + const flt_t delz = tdelz[jj]; + const flt_t fpx = fpair * delx; + fxtmp -= fpx; + fjxtmp += fpx; + const flt_t fpy = fpair * dely; + fytmp -= fpy; + fjytmp += fpy; + const flt_t fpz = fpair * delz; + fztmp -= fpz; + fjztmp += fpz; - /*---------------------------------------------*/ + if (EFLAG) { + flt_t evdwl; + if (!ONETYPE) { + c5 = p2e[ijtype].c5; + c6 = p2e[ijtype].c6; + } + evdwl = (c5 * rp - c6 * rq) * expsrainv; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + fjtmp += (flt_t)0.5 * evdwl; + } + } - int ijkoff; - if (!ONETYPE) { + /*---------------------------------------------*/ + + int ijkoff; + if (!ONETYPE) { sigma_gamma = p2[ijtype].sigma_gamma; - ijkoff = ijtype * ntypes; + ijkoff = ijtype * ntypes; } flt_t gsrainv1 = sigma_gamma * rainv1; @@ -479,15 +446,15 @@ void PairSWIntel::eval(const int offload, const int vflag, for (int kk = 0; kk < ejnum; kk++) { int 
iktype, ijktype; - if (!ONETYPE) { + if (!ONETYPE) { iktype = tjtype[kk]; - ijktype = ijkoff + iktype; - iktype += itype_offset; - cut = p2[iktype].cut; - sigma_gamma = p2[iktype].sigma_gamma; - costheta = p3[ijktype].costheta; - lambda_epsilon = p3[ijktype].lambda_epsilon; - lambda_epsilon2 = p3[ijktype].lambda_epsilon2; + ijktype = ijkoff + iktype; + iktype += itype_offset; + cut = p2[iktype].cut; + sigma_gamma = p2[iktype].sigma_gamma; + costheta = p3[ijktype].costheta; + lambda_epsilon = p3[ijktype].lambda_epsilon; + lambda_epsilon2 = p3[ijktype].lambda_epsilon2; } flt_t delr2[3]; @@ -496,95 +463,88 @@ void PairSWIntel::eval(const int offload, const int vflag, delr2[2] = tdelz[kk]; const flt_t rsq2 = trsq[kk]; - const flt_t rinvsq2 = (flt_t)1.0 / rsq2; - const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2); - const flt_t rainv2 = (flt_t)1.0 / (r2 - cut); - const flt_t gsrainv2 = sigma_gamma * rainv2; - const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; - const flt_t expgsrainv2 = exp(gsrainv2); + const flt_t rinvsq2 = (flt_t)1.0 / rsq2; + const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2); + const flt_t rainv2 = (flt_t)1.0 / (r2 - cut); + const flt_t gsrainv2 = sigma_gamma * rainv2; + const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; + const flt_t expgsrainv2 = exp(gsrainv2); - const flt_t rinv12 = (flt_t)1.0 / (r1 * r2); - const flt_t cs = (delx * delr2[0] + dely * delr2[1] + + const flt_t rinv12 = (flt_t)1.0 / (r1 * r2); + const flt_t cs = (delx * delr2[0] + dely * delr2[1] + delz * delr2[2]) * rinv12; - const flt_t delcs = cs - costheta; - const flt_t delcssq = delcs*delcs; + const flt_t delcs = cs - costheta; + const flt_t delcssq = delcs*delcs; - flt_t kfactor; - if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0; - else kfactor = (flt_t)1.0; + flt_t kfactor; + if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0; + else kfactor = (flt_t)1.0; - const flt_t facexp = expgsrainv1*expgsrainv2*kfactor; - const flt_t facrad = lambda_epsilon * facexp * delcssq; - const flt_t frad1 = 
facrad*gsrainvsq1; - const flt_t frad2 = facrad*gsrainvsq2; - const flt_t facang = lambda_epsilon2 * facexp * delcs; - const flt_t facang12 = rinv12*facang; - const flt_t csfacang = cs*facang; - const flt_t csfac1 = rinvsq1*csfacang; + const flt_t facexp = expgsrainv1*expgsrainv2*kfactor; + const flt_t facrad = lambda_epsilon * facexp * delcssq; + const flt_t frad1 = facrad*gsrainvsq1; + const flt_t frad2 = facrad*gsrainvsq2; + const flt_t facang = lambda_epsilon2 * facexp * delcs; + const flt_t facang12 = rinv12*facang; + const flt_t csfacang = cs*facang; + const flt_t csfac1 = rinvsq1*csfacang; - const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12; - const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12; - const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12; + const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12; + const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12; + const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12; - fxtmp -= fjx; - fytmp -= fjy; - fztmp -= fjz; - fjxtmp += fjx; - fjytmp += fjy; - fjztmp += fjz; + fxtmp -= fjx; + fytmp -= fjy; + fztmp -= fjz; + fjxtmp += fjx; + fjytmp += fjy; + fjztmp += fjz; - if (EVFLAG) { - if (EFLAG) { - const flt_t evdwl = facrad * (flt_t)0.5; - sevdwl += evdwl; - if (eatom) { - fwtmp += (acc_t)0.33333333 * evdwl; - fjtmp += (acc_t)0.33333333 * facrad; - } - } - IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz, - delx, dely, delz); - } - } // for kk - const int j = tj[jj]; + if (EFLAG) { + const flt_t evdwl = facrad * (flt_t)0.5; + sevdwl += evdwl; + if (eatom) { + fwtmp += (acc_t)0.33333333 * evdwl; + fjtmp += (acc_t)0.33333333 * facrad; + } + } + } // for kk + const int j = tj[jj]; f[j].x += fjxtmp; f[j].y += fjytmp; f[j].z += fjztmp; - if (EFLAG) - if (eatom) f[j].w += fjtmp; + if (EFLAG) + if (eatom) f[j].w += fjtmp; } // for jj f[i].x += fxtmp; f[i].y += fytmp; f[i].z += fztmp; - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + + if (EFLAG) { + f[i].w += fwtmp; + oevdwl += sevdwl; + } } // 
for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -595,13 +555,13 @@ void PairSWIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); } -#else +#else /* ---------------------------------------------------------------------- @@ -614,11 +574,11 @@ authors for more details. 
------------------------------------------------------------------------- */ -template +template void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers *buffers, - const ForceConst &fc, const int astart, - const int aend, const int pad_width) + const ForceConst &fc, const int astart, + const int aend, const int pad_width) { typedef typename SIMD_type::SIMD_vec SIMD_flt_t; typedef typename SIMD_type::SIMD_vec SIMD_acc_t; @@ -659,7 +619,7 @@ void PairSWIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -686,7 +646,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \ in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \ in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \ - in(ccache_stride3) \ + in(ccache_stride3) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -701,19 +661,17 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth); + int iifrom, iip, iito, tid; + 
IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads, + swidth); + iifrom += astart; iito += astart; @@ -734,22 +692,22 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6; SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { - cutsq = SIMD_set(p2[3].cutsq); - cut = SIMD_set(p2f[3].cut); - sigma = SIMD_set(p2f[3].sigma); - c1 = SIMD_set(p2f2[3].c1); - c2 = SIMD_set(p2f2[3].c2); - c3 = SIMD_set(p2f2[3].c3); - c4 = SIMD_set(p2f2[3].c4); - sigma_gamma = SIMD_set(p2[3].sigma_gamma); - costheta = SIMD_set(p3[7].costheta); - lambda_epsilon = SIMD_set(p3[7].lambda_epsilon); - lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2); - if (SPQ == 0) { - powerp = SIMD_set(p2f[3].powerp); - powerq = SIMD_set(p2f[3].powerq); - } - if (EFLAG) { + cutsq = SIMD_set(p2[3].cutsq); + cut = SIMD_set(p2f[3].cut); + sigma = SIMD_set(p2f[3].sigma); + c1 = SIMD_set(p2f2[3].c1); + c2 = SIMD_set(p2f2[3].c2); + c3 = SIMD_set(p2f2[3].c3); + c4 = SIMD_set(p2f2[3].c4); + sigma_gamma = SIMD_set(p2[3].sigma_gamma); + costheta = SIMD_set(p3[7].costheta); + lambda_epsilon = SIMD_set(p3[7].lambda_epsilon); + lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2); + if (SPQ == 0) { + powerp = SIMD_set(p2f[3].powerp); + powerq = SIMD_set(p2f[3].powerq); + } + if (EFLAG) { c5 = SIMD_set(p2e[3].c5); c6 = SIMD_set(p2e[3].c6); } @@ -757,130 +715,120 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128, - 144,160,176,192,208,224,240); + 144,160,176,192,208,224,240); ilist = ilist + iifrom; acc_t * const dforce = &(f[0].x); - for (int i = iifrom; i < iito; i += swidth) { - SIMD_mask imask = ilist < iito; - SIMD_flt_t xtmp, ytmp, ztmp; - SIMD_int itype, itype_offset; + for (int i = iifrom; i < iito; i += iip) { + SIMD_mask imask = ilist < iito; + SIMD_flt_t xtmp, 
ytmp, ztmp; + SIMD_int itype, itype_offset; - if (ONETYPE) - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp); - else { - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype); - itype_offset = itype * ntypes; - } - - #ifdef OUTER_CHUNK - const int* ng = firstneigh + cnumneigh[i] - swidth; - #else - SIMD_int ng = SIMD_load(cnumneigh + i); - ng = ng - 1; - #endif - const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); - const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i); - const int jnum_max = SIMD_max(jnum); - - SIMD_acc_t fxtmp = SIMD_set((acc_t)0); - SIMD_acc_t fytmp = SIMD_set((acc_t)0); - SIMD_acc_t fztmp = SIMD_set((acc_t)0); - SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2; - if (is_same::value == 0) { - fxtmp2 = SIMD_set((acc_t)0); - fytmp2 = SIMD_set((acc_t)0); - fztmp2 = SIMD_set((acc_t)0); - if (EFLAG) fwtmp2 = SIMD_set((acc_t)0); - } - - SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if (EFLAG) { - fwtmp = SIMD_set((acc_t)0); - sevdwl = SIMD_set((acc_t)0); - } - if (vflag==1) { - sv0 = SIMD_set((acc_t)0); - sv1 = SIMD_set((acc_t)0); - sv2 = SIMD_set((acc_t)0); - sv3 = SIMD_set((acc_t)0); - sv4 = SIMD_set((acc_t)0); - sv5 = SIMD_set((acc_t)0); - } + if (ONETYPE) + SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp); + else { + SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype); + itype_offset = itype * ntypes; } - SIMD_int ejnum = SIMD_set(0); - SIMD_int ejnumhalf = SIMD_set(0); - SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15); + #ifdef OUTER_CHUNK + const int* ng = firstneigh + cnumneigh[i] - swidth; + #else + SIMD_int ng = SIMD_load(cnumneigh + i); + ng = ng - 1; + #endif + const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); + const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i); + const int jnum_max = SIMD_max(jnum); + + SIMD_acc_t fxtmp = SIMD_set((acc_t)0); + SIMD_acc_t fytmp = SIMD_set((acc_t)0); + SIMD_acc_t 
fztmp = SIMD_set((acc_t)0); + SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2; + if (is_same::value == 0) { + fxtmp2 = SIMD_set((acc_t)0); + fytmp2 = SIMD_set((acc_t)0); + fztmp2 = SIMD_set((acc_t)0); + if (EFLAG) fwtmp2 = SIMD_set((acc_t)0); + } + + SIMD_acc_t sevdwl; + if (EFLAG) { + fwtmp = SIMD_set((acc_t)0); + sevdwl = SIMD_set((acc_t)0); + } + + SIMD_int ejnum = SIMD_set(0); + SIMD_int ejnumhalf = SIMD_set(0); + SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); for (int jj = 0; jj < jnum_max; jj++) { SIMD_mask jmask = jj < jnum; - #ifdef OUTER_CHUNK - ng += swidth; - SIMD_int j = SIMD_load(ng); - #else - ng = ng + 1; - SIMD_int j = SIMD_gather(jmask, firstneigh, ng); - #endif + #ifdef OUTER_CHUNK + ng += swidth; + SIMD_int j = SIMD_load(ng); + #else + ng = ng + 1; + SIMD_int j = SIMD_gather(jmask, firstneigh, ng); + #endif j = j & SIMD_set(NEIGHMASK); - const SIMD_int joffset = j << 4; + const SIMD_int joffset = j << 4; - SIMD_flt_t delx, dely, delz; - SIMD_int jtype, ijtype; - if (ONETYPE) - SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz); - else { - SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, - jtype); - ijtype = (jtype + itype_offset) << 2; - cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype); - } + SIMD_flt_t delx, dely, delz; + SIMD_int jtype, ijtype; + if (ONETYPE) + SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz); + else { + SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, + jtype); + ijtype = (jtype + itype_offset) << 2; + cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype); + } - delx = delx - xtmp; - dely = dely - ytmp; - delz = delz - ztmp; + delx = delx - xtmp; + dely = dely - ytmp; + delz = delz - ztmp; SIMD_flt_t rsq1 = delx * delx; - rsq1 = SIMD_fma(dely, dely, rsq1); - rsq1 = SIMD_fma(delz, delz, rsq1); + rsq1 = SIMD_fma(dely, dely, rsq1); + rsq1 = SIMD_fma(delz, delz, rsq1); - const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq); - SIMD_scatter(rmask, 
tdelx, coffset, delx); - SIMD_scatter(rmask, tdely, coffset, dely); - SIMD_scatter(rmask, tdelz, coffset, delz); - SIMD_scatter(rmask, trsq, coffset, rsq1); - SIMD_scatter(rmask, tj, coffset, j); - if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype); - ejnum = SIMD_add(rmask, ejnum, 1); - coffset = SIMD_add(rmask, coffset, swidth); - const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf); - ejnumhalf = SIMD_add(hmask, ejnumhalf, 1); - } + const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq); + SIMD_scatter(rmask, tdelx, coffset, delx); + SIMD_scatter(rmask, tdely, coffset, dely); + SIMD_scatter(rmask, tdelz, coffset, delz); + SIMD_scatter(rmask, trsq, coffset, rsq1); + SIMD_scatter(rmask, tj, coffset, j); + if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype); + ejnum = SIMD_add(rmask, ejnum, 1); + coffset = SIMD_add(rmask, coffset, swidth); + const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf); + ejnumhalf = SIMD_add(hmask, ejnumhalf, 1); + } - const int ejnum_max = SIMD_max(ejnum); - const int ejnumhalf_max = SIMD_max(ejnumhalf); - memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3); + const int ejnum_max = SIMD_max(ejnum); + const int ejnumhalf_max = SIMD_max(ejnumhalf); + memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3); for (int jj = 0; jj < ejnum_max; jj++) { SIMD_int ijtype; - const int coffset = jj * swidth; - if (!ONETYPE) { - ijtype = SIMD_load(tjtype + coffset); - ijtype = (ijtype + itype_offset) << 2; - cut = SIMD_gather(&(p2f[0].cut), ijtype); - } + const int coffset = jj * swidth; + if (!ONETYPE) { + ijtype = SIMD_load(tjtype + coffset); + ijtype = (ijtype + itype_offset) << 2; + cut = SIMD_gather(&(p2f[0].cut), ijtype); + } - SIMD_acc_t fjxtmp = SIMD_set((acc_t)0); - SIMD_acc_t fjytmp = SIMD_set((acc_t)0); - SIMD_acc_t fjztmp = SIMD_set((acc_t)0); - SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2; + SIMD_acc_t fjxtmp = SIMD_set((acc_t)0); + SIMD_acc_t fjytmp = SIMD_set((acc_t)0); + SIMD_acc_t fjztmp = 
SIMD_set((acc_t)0); + SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2; if (EFLAG) fjtmp = SIMD_set((acc_t)0.0); - if (is_same::value == 0) { - fjxtmp2 = SIMD_set((acc_t)0); - fjytmp2 = SIMD_set((acc_t)0); - fjztmp2 = SIMD_set((acc_t)0); - if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0); - } + if (is_same::value == 0) { + fjxtmp2 = SIMD_set((acc_t)0); + fjytmp2 = SIMD_set((acc_t)0); + fjztmp2 = SIMD_set((acc_t)0); + if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0); + } const SIMD_flt_t delx = SIMD_load(tdelx + coffset); const SIMD_flt_t dely = SIMD_load(tdely + coffset); @@ -888,251 +836,223 @@ void PairSWIntel::eval(const int offload, const int vflag, const SIMD_flt_t rsq1 = SIMD_load(trsq + coffset); const SIMD_flt_t rinvsq1 = SIMD_rcp(rsq1); - const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1); + const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1); const SIMD_flt_t rainv1 = SIMD_rcp(r1 - cut); - - // two-body interactions, skip half of them - if (jj < ejnumhalf_max) { + + // two-body interactions, skip half of them + if (jj < ejnumhalf_max) { SIMD_flt_t rp, rq; - if (SPQ == 1) { + if (SPQ == 1) { rp = r1 * r1; - rp = rp * rp; - rp = SIMD_rcp(rp); - rq = SIMD_set((flt_t)1.0); + rp = rp * rp; + rp = SIMD_rcp(rp); + rq = SIMD_set((flt_t)1.0); } else { - if (!ONETYPE) { - powerp = SIMD_gather(&(p2f[0].powerp), ijtype); - powerq = SIMD_gather(&(p2f[0].powerq), ijtype); - } - rp = SIMD_pow(r1, powerp); - rq = SIMD_pow(r1, powerq); - } + if (!ONETYPE) { + powerp = SIMD_gather(&(p2f[0].powerp), ijtype); + powerq = SIMD_gather(&(p2f[0].powerq), ijtype); + } + rp = SIMD_pow(r1, powerp); + rq = SIMD_pow(r1, powerq); + } - if (!ONETYPE) { - sigma = SIMD_gather(&(p2f[0].sigma), ijtype); - c1 = SIMD_gather(&(p2f2[0].c1), ijtype); - c2 = SIMD_gather(&(p2f2[0].c2), ijtype); - c3 = SIMD_gather(&(p2f2[0].c3), ijtype); - c4 = SIMD_gather(&(p2f2[0].c4), ijtype); - } + if (!ONETYPE) { + sigma = SIMD_gather(&(p2f[0].sigma), ijtype); + c1 = SIMD_gather(&(p2f2[0].c1), ijtype); + c2 = SIMD_gather(&(p2f2[0].c2), 
ijtype); + c3 = SIMD_gather(&(p2f2[0].c3), ijtype); + c4 = SIMD_gather(&(p2f2[0].c4), ijtype); + } - const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1; - const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1); - const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * - rainvsq) * expsrainv * rinvsq1; + const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1; + const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1); + const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * + rainvsq) * expsrainv * rinvsq1; - const SIMD_flt_t fjx = delx * fpair; - const SIMD_flt_t fjy = dely * fpair; - const SIMD_flt_t fjz = delz * fpair; + const SIMD_flt_t fjx = delx * fpair; + const SIMD_flt_t fjy = dely * fpair; + const SIMD_flt_t fjz = delz * fpair; - const SIMD_mask hmask = jj < ejnumhalf; - SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp, - fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, - fztmp2, fjxtmp2, fjytmp2, fjztmp2); - - if (EVFLAG) { - if (EFLAG) { - if (!ONETYPE) { - c5 = SIMD_gather(&(p2e[0].c5), ijtype); - c6 = SIMD_gather(&(p2e[0].c6), ijtype); - } - SIMD_flt_t evdwl; - evdwl = (c5 * rp - c6 * rq) * expsrainv; - SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2); - } - SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely, - delz, sv0, sv1, sv2, sv3, sv4, sv5); - } + const SIMD_mask hmask = jj < ejnumhalf; + SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp, + fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, + fztmp2, fjxtmp2, fjytmp2, fjztmp2); + + if (EFLAG) { + if (!ONETYPE) { + c5 = SIMD_gather(&(p2e[0].c5), ijtype); + c6 = SIMD_gather(&(p2e[0].c6), ijtype); + } + SIMD_flt_t evdwl; + evdwl = (c5 * rp - c6 * rq) * expsrainv; + SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2); + } } - /*---------------------------------------------*/ - SIMD_int ijkoff; - if (!ONETYPE) { - sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype); - ijkoff = ijtype * ntypes; - } + 
/*---------------------------------------------*/ + SIMD_int ijkoff; + if (!ONETYPE) { + sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype); + ijkoff = ijtype * ntypes; + } const SIMD_flt_t gsrainv1 = sigma_gamma * rainv1; const SIMD_flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1; const SIMD_flt_t expgsrainv1 = SIMD_exp(gsrainv1); - const SIMD_mask jmask = jj < ejnum; + const SIMD_mask jmask = jj < ejnum; for (int kk = jj+1; kk < ejnum_max; kk++) { - SIMD_int iktype, ijktype; - const int kcoffset = kk * swidth; - if (!ONETYPE) { - iktype = SIMD_load(tjtype + kcoffset); - ijktype = ijkoff + (iktype << 2); - iktype = (iktype + itype_offset) << 2; - cut = SIMD_gather(&(p2[0].cut), iktype); - sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype); - costheta = SIMD_gather(&(p3[0].costheta), ijktype); - lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype); - lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype); - } - const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset); - const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset); - const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset); - const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset); + SIMD_int iktype, ijktype; + const int kcoffset = kk * swidth; + if (!ONETYPE) { + iktype = SIMD_load(tjtype + kcoffset); + ijktype = ijkoff + (iktype << 2); + iktype = (iktype + itype_offset) << 2; + cut = SIMD_gather(&(p2[0].cut), iktype); + sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype); + costheta = SIMD_gather(&(p3[0].costheta), ijktype); + lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype); + lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype); + } + const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset); + const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset); + const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset); + const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset); - const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2); - const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2); - const 
SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut); - const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2; - const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; - const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2); - const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2); - const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + + const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2); + const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2); + const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut); + const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2; + const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; + const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2); + const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2); + const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + delz * delr2z) * rinv12; - const SIMD_flt_t delcs = cs - costheta; - const SIMD_flt_t delcssq = delcs*delcs; + const SIMD_flt_t delcs = cs - costheta; + const SIMD_flt_t delcssq = delcs*delcs; - const SIMD_flt_t facexp = expgsrainv1*expgsrainv2; - const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq; - const SIMD_flt_t frad1 = facrad * gsrainvsq1; - const SIMD_flt_t frad2 = facrad * gsrainvsq2; - const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs; - const SIMD_flt_t facang12 = rinv12 * facang; - const SIMD_flt_t csfacang = cs * facang; + const SIMD_flt_t facexp = expgsrainv1*expgsrainv2; + const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq; + const SIMD_flt_t frad1 = facrad * gsrainvsq1; + const SIMD_flt_t frad2 = facrad * gsrainvsq2; + const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs; + const SIMD_flt_t facang12 = rinv12 * facang; + const SIMD_flt_t csfacang = cs * facang; - const SIMD_flt_t csfac1 = rinvsq1 * csfacang; - const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12; - const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12; - const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12; + const SIMD_flt_t csfac1 = rinvsq1 * csfacang; + const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12; + const 
SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12; + const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12; - const SIMD_flt_t csfac2 = rinvsq2 * csfacang; - SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2); - SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2); - SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2); + const SIMD_flt_t csfac2 = rinvsq2 * csfacang; + SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2); + SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2); + SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2); - const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum); + const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum); - SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp, - fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, - fztmp2, fjxtmp2, fjytmp2, fjztmp2, - tf + kcoffset * 3, swidth); + SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp, + fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, + fztmp2, fjxtmp2, fjytmp2, fjztmp2, + tf + kcoffset * 3, swidth); - if (EVFLAG) { - if (EFLAG) { - SIMD_int k; - if (eatom) { - k = SIMD_load(tj + kcoffset); - k = k << 4; - } - SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2, k, dforce); - } - SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz, - delx, dely, delz, delr2x, delr2y, delr2z, - sv0, sv1, sv2, sv3, sv4, sv5); - } - - } // for kk - if (is_same::value == 1) - SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp); - else - SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, - fjxtmp2, fjytmp2, fjztmp2); + if (EFLAG) { + SIMD_int k; + if (eatom) { + k = SIMD_load(tj + kcoffset); + k = k << 4; + } + SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2, k, dforce); + } + } // for kk + if (is_same::value == 1) + SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp); + else + SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, 
fjztmp, + fjxtmp2, fjytmp2, fjztmp2); - if (EFLAG) { - if (eatom) { - SIMD_int j = SIMD_load(tj + coffset); - j = j << 4; - SIMD_jeng_update(jmask, dforce + 3, j, fjtmp); - if (is_same::value == 0) - SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2); - } - } + if (EFLAG) { + if (eatom) { + SIMD_int j = SIMD_load(tj + coffset); + j = j << 4; + SIMD_jeng_update(jmask, dforce + 3, j, fjtmp); + if (is_same::value == 0) + SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2); + } + } } // for jj first loop for (int jj = 0; jj < ejnum_max; jj++) { - const int coffset = jj * swidth; - const SIMD_mask jmask = jj < ejnum; + const int coffset = jj * swidth; + const SIMD_mask jmask = jj < ejnum; const SIMD_int j = SIMD_load(tj + coffset); - const SIMD_int joffset = j << 4; + const SIMD_int joffset = j << 4; - SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2; - int foffset = swidth; - if (is_same::value == 0) foffset = foffset >> 1; - acc_t *p = tf + coffset * 3; - fjxtmp = SIMD_load(p); - if (is_same::value == 0) { - p = p + foffset; - fjxtmp2 = SIMD_load(p); - } - p = p + foffset; - fjytmp = SIMD_load(p); - if (is_same::value == 0) { - p = p + foffset; - fjytmp2 = SIMD_load(p); - } - p = p + foffset; - fjztmp = SIMD_load(p); - if (is_same::value == 0) { - p = p + foffset; - fjztmp2 = SIMD_load(p); - } - - SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp); - SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp, - fjztmp); + SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2; + int foffset = swidth; + if (is_same::value == 0) foffset = foffset >> 1; + acc_t *p = tf + coffset * 3; + fjxtmp = SIMD_load(p); if (is_same::value == 0) { - SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); - SIMD_mask jmask2 = jmask >> 8; - SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, - fjztmp2); - SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2, - fjztmp2); - } - } // for jj second loop + p = p + foffset; + fjxtmp2 = 
SIMD_load(p); + } + p = p + foffset; + fjytmp = SIMD_load(p); + if (is_same::value == 0) { + p = p + foffset; + fjytmp2 = SIMD_load(p); + } + p = p + foffset; + fjztmp = SIMD_load(p); + if (is_same::value == 0) { + p = p + foffset; + fjztmp2 = SIMD_load(p); + } - SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, - EVFLAG, eatom, fwtmp); - if (is_same::value == 0) { - imask = imask >> 8; - SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, - fztmp2, EVFLAG, eatom, fwtmp2); - } - if (EVFLAG) { - if (EFLAG) oevdwl += SIMD_sum(sevdwl); - if (vflag == 1) { - ov0 += SIMD_sum(sv0); - ov1 += SIMD_sum(sv1); - ov2 += SIMD_sum(sv2); - ov3 += SIMD_sum(sv3); - ov4 += SIMD_sum(sv4); - ov5 += SIMD_sum(sv5); - } - } - ilist = ilist + swidth; + SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp); + SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp, + fjztmp); + if (is_same::value == 0) { + SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); + SIMD_mask jmask2 = jmask >> 8; + SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, + fjztmp2); + SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2, + fjztmp2); + } + } // for jj second loop + + SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, + EFLAG, eatom, fwtmp); + if (is_same::value == 0) { + imask = imask >> 8; + SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, + fztmp2, EFLAG, eatom, fwtmp2); + } + if (EFLAG) oevdwl += SIMD_sum(sevdwl); + ilist = ilist + iip; } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal, - minlocal, nthreads, f_start, f_stride, x, - offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end omp - - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] 
= (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -1143,7 +1063,7 @@ void PairSWIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -1199,7 +1119,7 @@ void PairSWIntel::init_style() #if defined(__INTEL_COMPILER) if (__INTEL_COMPILER_BUILD_DATE < 20141023) error->all(FLERR, "Intel compiler versions before " - "15 Update 1 not supported for sw/intel"); + "15 Update 1 not supported for sw/intel"); #endif } @@ -1212,6 +1132,7 @@ void PairSWIntel::pack_force_const(ForceConst &fc, #ifdef LMP_USE_AVXCD fix->nbor_pack_width(SIMD_type::width()); #endif + fix->three_body_neighbor(1); int off_ccache = 0; #ifdef _LMP_INTEL_OFFLOAD @@ -1247,7 +1168,7 @@ void PairSWIntel::pack_force_const(ForceConst &fc, } } } - + _onetype = 0; if (atom->ntypes == 1) _onetype = 1; @@ -1257,55 +1178,55 @@ void PairSWIntel::pack_force_const(ForceConst &fc, for (int jj = 0; jj < tp1; jj++) { int j = map[jj]; if (i < 0 || j < 0 || ii == 0 || jj == 0) { - fc.p2[ii][jj].cutsq = 0; - fc.p2[ii][jj].cut = 0; - fc.p2[ii][jj].sigma_gamma = 0; - fc.p2f[ii][jj].cut = 0; - fc.p2f[ii][jj].powerp = 0; - fc.p2f[ii][jj].powerq = 0; - fc.p2f[ii][jj].sigma = 0; - fc.p2f2[ii][jj].c1 = 0; - fc.p2f2[ii][jj].c2 = 0; - fc.p2f2[ii][jj].c3 = 0; - fc.p2f2[ii][jj].c4 = 0; - fc.p2e[ii][jj].c5 = 0; - 
fc.p2e[ii][jj].c6 = 0; + fc.p2[ii][jj].cutsq = 0; + fc.p2[ii][jj].cut = 0; + fc.p2[ii][jj].sigma_gamma = 0; + fc.p2f[ii][jj].cut = 0; + fc.p2f[ii][jj].powerp = 0; + fc.p2f[ii][jj].powerq = 0; + fc.p2f[ii][jj].sigma = 0; + fc.p2f2[ii][jj].c1 = 0; + fc.p2f2[ii][jj].c2 = 0; + fc.p2f2[ii][jj].c3 = 0; + fc.p2f2[ii][jj].c4 = 0; + fc.p2e[ii][jj].c5 = 0; + fc.p2e[ii][jj].c6 = 0; } else { - int ijparam = elem2param[i][j][j]; - fc.p2[ii][jj].cutsq = params[ijparam].cutsq; - fc.p2[ii][jj].cut = params[ijparam].cut; - fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma; - fc.p2f[ii][jj].cut = params[ijparam].cut; - fc.p2f[ii][jj].powerp = -params[ijparam].powerp; - fc.p2f[ii][jj].powerq = -params[ijparam].powerq; - fc.p2f[ii][jj].sigma = params[ijparam].sigma; - fc.p2f2[ii][jj].c1 = params[ijparam].c1; - fc.p2f2[ii][jj].c2 = params[ijparam].c2; - fc.p2f2[ii][jj].c3 = params[ijparam].c3; - fc.p2f2[ii][jj].c4 = params[ijparam].c4; - fc.p2e[ii][jj].c5 = params[ijparam].c5; - fc.p2e[ii][jj].c6 = params[ijparam].c6; + int ijparam = elem2param[i][j][j]; + fc.p2[ii][jj].cutsq = params[ijparam].cutsq; + fc.p2[ii][jj].cut = params[ijparam].cut; + fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma; + fc.p2f[ii][jj].cut = params[ijparam].cut; + fc.p2f[ii][jj].powerp = -params[ijparam].powerp; + fc.p2f[ii][jj].powerq = -params[ijparam].powerq; + fc.p2f[ii][jj].sigma = params[ijparam].sigma; + fc.p2f2[ii][jj].c1 = params[ijparam].c1; + fc.p2f2[ii][jj].c2 = params[ijparam].c2; + fc.p2f2[ii][jj].c3 = params[ijparam].c3; + fc.p2f2[ii][jj].c4 = params[ijparam].c4; + fc.p2e[ii][jj].c5 = params[ijparam].c5; + fc.p2e[ii][jj].c6 = params[ijparam].c6; - double cutcut = params[ijparam].cut * params[ijparam].cut; - if (params[ijparam].cutsq >= cutcut) - fc.p2[ii][jj].cutsq *= 0.98; + double cutcut = params[ijparam].cut * params[ijparam].cut; + if (params[ijparam].cutsq >= cutcut) + fc.p2[ii][jj].cutsq *= 0.98; - if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0) - _spq = 0; 
+ if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0) + _spq = 0; } for (int kk = 0; kk < tp1; kk++) { int k = map[kk]; - if (i < 0 || j < 0 || k < 0 || ii == 0 || jj == 0 || kk == 0) { - fc.p3[ii][jj][kk].costheta = 0; - fc.p3[ii][jj][kk].lambda_epsilon = 0; - fc.p3[ii][jj][kk].lambda_epsilon2 = 0; - } else { - int ijkparam = elem2param[i][j][k]; - fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta; - fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon; - fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2; - } + if (i < 0 || j < 0 || k < 0 || ii == 0 || jj == 0 || kk == 0) { + fc.p3[ii][jj][kk].costheta = 0; + fc.p3[ii][jj][kk].lambda_epsilon = 0; + fc.p3[ii][jj][kk].lambda_epsilon2 = 0; + } else { + int ijkparam = elem2param[i][j][k]; + fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta; + fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon; + fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2; + } } } } @@ -1326,10 +1247,10 @@ void PairSWIntel::pack_force_const(ForceConst &fc, flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; int tp1cu = tp1sq * tp1; - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && op3 != NULL && ocutneighsq != NULL) { #pragma offload_transfer target(mic:_cop) \ - in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ + in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ in(op3: length(tp1cu) alloc_if(0) free_if(0)) \ in(ocutneighsq: length(tp1sq)) } @@ -1351,8 +1272,8 @@ void PairSWIntel::ForceConst::set_ntypes(const int ntypes, fc_packed3 *op3 = p3[0][0]; #ifdef _LMP_INTEL_OFFLOAD - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && - op3 != NULL && _cop >= 0) { + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + op3 != NULL && _cop >= 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(op2, 
op2f, op2f2, op2e, op3: alloc_if(0) free_if(1)) } @@ -1380,8 +1301,8 @@ void PairSWIntel::ForceConst::set_ntypes(const int ntypes, fc_packed3 *op3 = p3[0][0]; int tp1sq = ntypes * ntypes; int tp1cu = tp1sq * ntypes; - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && - op3 != NULL && cop >= 0) { + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + op3 != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(1) free_if(0)) \ nocopy(op3: length(tp1cu) alloc_if(1) free_if(0)) diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h index 8723803a35..ffcf9a6fb6 100644 --- a/src/USER-INTEL/pair_sw_intel.h +++ b/src/USER-INTEL/pair_sw_intel.h @@ -46,10 +46,10 @@ class PairSWIntel : public PairSW { template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, IntelBuffers * buffers, const ForceConst &fc, - const int astart, const int aend, const int pad_width); + const int astart, const int aend, const int pad_width); template void pack_force_const(ForceConst &fc, diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index 88354ec4d0..9e0a888638 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -47,7 +47,7 @@ void PairTersoffIntel::init_style() { if (comm->me == 0) { error->warning(FLERR, "Tersoff/intel currently requires intel compiler. 
" - "Using MANYBODY version."); + "Using MANYBODY version."); } PairTersoff::init_style(); } @@ -87,7 +87,7 @@ PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp) void PairTersoffIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) { - compute(eflag, vflag, fix->get_mixed_buffers(), + compute(eflag, vflag, fix->get_mixed_buffers(), force_const_single); } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) { compute(eflag, vflag, fix->get_double_buffers(), @@ -104,8 +104,8 @@ void PairTersoffIntel::compute(int eflag, int vflag) // do we need to calculate energy/virial template void PairTersoffIntel::compute(int eflag, int vflag, - IntelBuffers *buffers, - const ForceConst &fc) + IntelBuffers *buffers, + const ForceConst &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -119,32 +119,30 @@ void PairTersoffIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } + + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + eval<1>(1, 
ovflag, buffers, fc, 0, offload_end); + eval<1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0>(0, ovflag, buffers, fc, host_start, inum); } } @@ -172,14 +170,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines // what's done in here is that they are inlined and vectorized // attractive() also provides an option to compute zeta as well static fvec zeta_vector( - const c_inner_t * param, - ivec xjw, bvec mask, - fvec vrij, fvec rsq2, - fvec vdijx, fvec vdijy, fvec vdijz, + const c_inner_t * param, + ivec xjw, bvec mask, + fvec vrij, fvec rsq2, + fvec vdijx, fvec vdijy, fvec vdijz, fvec dikx, fvec diky, fvec dikz ); static void force_zeta_vector( - const c_outer_t * param, + const c_outer_t * param, ivec xjw, bvec mask, fvec vrijsq, fvec vzeta_ij, @@ -202,49 +200,47 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines ); // perform the actual computation - template + template static void kernel( - int iito, int iifrom, int eatom, int vflag, + int iito, int iifrom, int eatom, int vflag, const int * _noalias const numneigh, const int * _noalias const numneighhalf, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5 + acc_t *evdwl ); // perform one step of calculation, pass in i-j pairs of atoms (is, js) - template + template static void kernel_step( - int eatom, int vflag, + int eatom, 
int vflag, const int * _noalias const numneigh, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5, - int compress_idx, iarr is, iarr js, bvec vmask_repulsive + avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive ); // perform one step of calculation, as opposed to the previous method now // with fixed i and a number of js - template + template static void kernel_step_const_i( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5, - int compress_idx, int i, iarr js, bvec vmask_repulsive + avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive ); }; @@ -257,11 +253,11 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines // Dispatch to correct kernel instatiation and perform all the work neccesary // for offloading. In this routine we enter the Phi. 
// This method is nearly identical to what happens in the other /intel styles -template +template void PairTersoffIntel::eval(const int offload, const int vflag, - IntelBuffers *buffers, - const ForceConst &fc, - const int astart, const int aend) + IntelBuffers *buffers, + const ForceConst &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -292,9 +288,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + IP_PRE_get_transfern(ago, 1, EFLAG, vflag, + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -330,20 +326,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag, #endif #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, + f_stride, x, 0); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iito, tid; @@ -355,61 +347,45 @@ void PairTersoffIntel::eval(const int offload, const int vflag, memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); { - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.; + acc_t sevdwl; + sevdwl = 0.; #define ARGS iito, iifrom, 
eatom, vflag, numneigh, numneighhalf, cnumneigh, \ - firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5 + firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl // Pick the variable i algorithm under specific conditions // do use scalar algorithm with very short vectors int VL = lmp_intel::vector_routines::VL; - bool pack_i = VL >= 8 && + bool pack_i = VL >= 8 && lmp_intel::vector_traits::support_integer_and_gather_ops; bool use_scalar = VL < 4; if (use_scalar) { - IntelKernelTersoff::kernel(ARGS); + IntelKernelTersoff::kernel(ARGS); } else if (pack_i) { - IntelKernelTersoff::kernel(ARGS); + IntelKernelTersoff::kernel(ARGS); } else { - IntelKernelTersoff::kernel(ARGS); - } - if (EVFLAG) { - if (EFLAG) oevdwl += sevdwl; - if (vflag == 1) { - ov0 += sv0; - ov1 += sv1; - ov2 += sv2; - ov3 += sv3; - ov4 += sv4; - ov5 += sv5; - } + IntelKernelTersoff::kernel(ARGS); } + if (EFLAG) oevdwl += sevdwl; } - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = 0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = 0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #ifdef _LMP_INTEL_OFFLOAD @@ -424,7 +400,7 @@ void PairTersoffIntel::eval(const int offload, const 
int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -455,8 +431,9 @@ void PairTersoffIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast(modify->fix[ifix]); - + fix->pair_init_check(); + fix->three_body_neighbor(1); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); #endif @@ -504,25 +481,25 @@ void PairTersoffIntel::pack_force_const(ForceConst &fc, for (int k = 1; k < tp1; k++) { Param * param = ¶ms[elem2param[map[i]][map[j]][map[k]]]; fc.c_cutoff_inner[i][k][j].cutsq = static_cast(param->cutsq); - fc.c_inner_loop[i][j][k].lam3 = static_cast(param->lam3); + fc.c_inner_loop[i][j][k].lam3 = static_cast(param->lam3); fc.c_inner_loop[i][j][k].bigr = static_cast(param->bigr); fc.c_inner_loop[i][j][k].bigd = static_cast(param->bigd); fc.c_inner_loop[i][j][k].c2 = static_cast(param->c * param->c); fc.c_inner_loop[i][j][k].d2 = static_cast(param->d * param->d); fc.c_inner_loop[i][j][k].h = static_cast(param->h); fc.c_inner_loop[i][j][k].gamma = static_cast(param->gamma); - fc.c_inner_loop[i][j][k].powermint = static_cast(param->powermint); + fc.c_inner_loop[i][j][k].powermint = static_cast(param->powermint); fc.c_inner[i][j][k].cutsq = static_cast(param->cutsq); - fc.c_inner[i][j][k].lam3 = static_cast(param->lam3); + fc.c_inner[i][j][k].lam3 = static_cast(param->lam3); fc.c_inner[i][j][k].bigr = static_cast(param->bigr); fc.c_inner[i][j][k].bigd = static_cast(param->bigd); fc.c_inner[i][j][k].c2 = static_cast(param->c * param->c); fc.c_inner[i][j][k].d2 = static_cast(param->d * param->d); fc.c_inner[i][j][k].h = static_cast(param->h); fc.c_inner[i][j][k].gamma = static_cast(param->gamma); - fc.c_inner[i][j][k].powermint = static_cast(param->powermint); - + fc.c_inner[i][j][k].powermint = static_cast(param->powermint); + } Param * param = 
¶ms[elem2param[map[i]][map[j]][map[j]]]; fc.c_cutoff_outer[i][j].cutsq = static_cast(param->cutsq); @@ -538,7 +515,7 @@ void PairTersoffIntel::pack_force_const(ForceConst &fc, fc.c_second_loop[i][j].c2 = static_cast(param->c2); fc.c_second_loop[i][j].c3 = static_cast(param->c3); fc.c_second_loop[i][j].c4 = static_cast(param->c4); - + fc.c_outer[i][j].cutsq = static_cast(param->cutsq); fc.c_outer[i][j].bigr = static_cast(param->bigr); fc.c_outer[i][j].bigd = static_cast(param->bigd); @@ -586,8 +563,8 @@ void PairTersoffIntel::pack_force_const(ForceConst &fc, // As in any other /intel pair style template void PairTersoffIntel::ForceConst::set_ntypes(const int ntypes, - Memory *memory, - const int cop) { + Memory *memory, + const int cop) { if ( (ntypes != _ntypes) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -598,12 +575,12 @@ void PairTersoffIntel::ForceConst::set_ntypes(const int ntypes, c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0]; c_inner_t * oc_inner = c_inner[0][0]; c_outer_t * oc_outer = c_outer[0]; - if (c_first_loop != NULL && c_second_loop != NULL && + if (c_first_loop != NULL && c_second_loop != NULL && c_inner_loop != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ - nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \ - nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \ + nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \ + nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \ nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0)) } #endif @@ -637,7 +614,7 @@ void PairTersoffIntel::ForceConst::set_ntypes(const int ntypes, int tp1sq = ntypes * ntypes; int tp1cb = ntypes * ntypes * ntypes; int tp1cb_pad = ntypes * ntypes * ntypes_pad; - if (oc_first_loop != NULL && oc_second_loop != NULL && + if (oc_first_loop != NULL && oc_second_loop != NULL && oc_inner_loop != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(oc_first_loop: 
length(tp1sq) alloc_if(1) free_if(0)) \ @@ -663,23 +640,17 @@ void PairTersoffIntel::ForceConst::set_ntypes(const int ntypes, static const int N_CACHE = 8; template -template +template void IntelKernelTersoff::kernel_step( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const typename PairTersoffIntel::ForceConst::c_inner_t * _noalias const c_inner, - const typename PairTersoffIntel::ForceConst::c_outer_t * _noalias const c_outer, + const typename PairTersoffIntel::ForceConst::c_inner_t * _noalias const c_inner, + const typename PairTersoffIntel::ForceConst::c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - avec *vsevdwl, - avec *vsv0, - avec *vsv1, - avec *vsv2, - avec* vsv3, - avec *vsv4, - avec *vsv5, - int compress_idx, + avec *vsevdwl, + int compress_idx, iarr is, iarr js, bvec vmask_repulsive @@ -691,7 +662,7 @@ void IntelKernelTersoff::kernel_step( ivec v_i0(0); ivec v_i_ntypes(ntypes); ivec v_i_NEIGHMASK(NEIGHMASK); - + farr fx, fy, fz, fw; int cache_idx = 0; fvec vfkx_cache[N_CACHE]; @@ -701,7 +672,7 @@ void IntelKernelTersoff::kernel_step( bvec vmask_cache[N_CACHE]; ivec vkks_final_cache; bvec vmask_final_cache; - iarr ts; + iarr ts; // compute all the stuff we know from i and j // TDO: We could extract this from the driver routine ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is)); @@ -767,7 +738,7 @@ void IntelKernelTersoff::kernel_step( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - &vzeta_contrib); + &vzeta_contrib); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ 
-778,9 +749,9 @@ void IntelKernelTersoff::kernel_step( vfkx_cache[cache_idx] = vfkx; vfky_cache[cache_idx] = vfky; vfkz_cache[cache_idx] = vfkz; - vks_cache[cache_idx] = vks; - vmask_cache[cache_idx] = veff_mask; - cache_idx += 1; + vks_cache[cache_idx] = vks; + vmask_cache[cache_idx] = veff_mask; + cache_idx += 1; vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); vkks = vkks + v_i1; @@ -828,22 +799,12 @@ void IntelKernelTersoff::kernel_step( vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair; vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair; vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair; - - if (EVFLAG) { - if (EFLAG) { - *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); - if (eatom) { - v::store(fw, (v_0_5 * vevdwl)); - } + + if (EFLAG) { + *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); + if (eatom) { + v::store(fw, (v_0_5 * vevdwl)); } - if (vflag == 1) { - *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair); - *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair); - *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair); - *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair); - *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair); - *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair); - } } { while (cache_idx-- > 0) { @@ -872,7 +833,7 @@ void IntelKernelTersoff::kernel_step( fvec vx_k, vy_k, vz_k, vcutsq; while (! 
v::mask_testz(vactive_mask)) { bvec vnew_mask = vactive_mask & ~ veff_old_mask; - vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & + vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); fvec vdx_ik = vx_k - vx_i; @@ -894,7 +855,7 @@ void IntelKernelTersoff::kernel_step( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - 0); + 0); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ -933,7 +894,7 @@ void IntelKernelTersoff::kernel_step( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } @@ -945,7 +906,7 @@ void IntelKernelTersoff::kernel_step( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } @@ -954,23 +915,17 @@ void IntelKernelTersoff::kernel_step( // Specialized kernel step for fixed i, means that we don't have to use the // convoluted iteration scheme above, as the loop variables are uniform. 
template -template +template void IntelKernelTersoff::kernel_step_const_i( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const typename PairTersoffIntel::ForceConst::c_inner_t * _noalias const c_inner, - const typename PairTersoffIntel::ForceConst::c_outer_t * _noalias const c_outer, + const typename PairTersoffIntel::ForceConst::c_inner_t * _noalias const c_inner, + const typename PairTersoffIntel::ForceConst::c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - avec *vsevdwl, - avec *vsv0, - avec *vsv1, - avec *vsv2, - avec* vsv3, - avec *vsv4, - avec *vsv5, - int compress_idx, + avec *vsevdwl, + int compress_idx, int i, iarr js, bvec vmask_repulsive @@ -996,7 +951,7 @@ void IntelKernelTersoff::kernel_step_const_i( int kk_final_cache; aarr fx, fy, fz, fw; - iarr ts; + iarr ts; bvec vmask = v::mask_enable_lower(compress_idx); fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z); @@ -1042,7 +997,7 @@ void IntelKernelTersoff::kernel_step_const_i( fvec vfix, vfiy, vfiz; fvec vfjx, vfjy, vfjz; fvec vfkx, vfky, vfkz; - + attractive_vector(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.), vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik, &vfix,&vfiy,&vfiz, @@ -1055,7 +1010,7 @@ void IntelKernelTersoff::kernel_step_const_i( vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx); vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy); vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz); - + vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero()); vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero()); vfkz_cache[cache_idx] = v::mask_add(v::zero(), 
veff_mask, vfkz, v::zero()); @@ -1082,7 +1037,7 @@ void IntelKernelTersoff::kernel_step_const_i( bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k))); bvec veff_mask = vcutoff_mask & vsame_mask & vmask; if (! v::mask_testz(veff_mask)) { - fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, + fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik); vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); } @@ -1096,23 +1051,13 @@ void IntelKernelTersoff::kernel_step_const_i( vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair); vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair); vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair); - - if (EVFLAG) { - if (EFLAG) { - *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); - if (eatom) { - vfwtmp = v_0_5 * vevdwl; - v::store(fw, vfwtmp); - } + + if (EFLAG) { + *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); + if (eatom) { + vfwtmp = v_0_5 * vevdwl; + v::store(fw, vfwtmp); } - if (vflag == 1) { - *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair); - *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair); - *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair); - *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair); - *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair); - *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair); - } } while (cache_idx-- > 0) { fvec vfkx = vprefactor * vfkx_cache[cache_idx]; @@ -1148,7 +1093,7 @@ void IntelKernelTersoff::kernel_step_const_i( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - 0); + 0); vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = 
v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ -1169,38 +1114,36 @@ void IntelKernelTersoff::kernel_step_const_i( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero())); f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero())); f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero())); - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero())); } } template -template +template void IntelKernelTersoff::kernel( - int iito, int iifrom, int eatom, int vflag, - const int * _noalias const numneigh, - const int * _noalias const numneighhalf, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int iito, int iifrom, int eatom, int vflag, + const int * _noalias const numneigh, + const int * _noalias const numneighhalf, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers::vec3_acc_t * _noalias const f, - acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5 + acc_t *evdwl ) { int compress_idx = 0; int ii, jj; iarr is, js; avec vsevdwl = v::acc_zero(); - avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero(); - avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero(); ivec v_i4floats(static_cast(sizeof(typename v::fscal) * 4)); ivec vj, v_NEIGHMASK(NEIGHMASK); bvec vmask_repulsive(0); @@ -1237,11 +1180,11 @@ void IntelKernelTersoff::kernel( if (pack_i) { if (compress_idx == v::VL) { 
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - kernel_step( - eatom, vflag, + kernel_step( + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); compress_idx = 0; @@ -1250,11 +1193,11 @@ void IntelKernelTersoff::kernel( } else { if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - kernel_step_const_i( - eatom, vflag, + kernel_step_const_i( + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, i, js, vmask_repulsive ); compress_idx = 0; @@ -1265,36 +1208,26 @@ void IntelKernelTersoff::kernel( } if (compress_idx > 0) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - IntelKernelTersoff::kernel_step( - eatom, vflag, + IntelKernelTersoff::kernel_step( + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); } - if (EVFLAG) { - if (EFLAG) { - *evdwl += v::acc_reduce_add(vsevdwl); - } - if (vflag == 1) { - *ov0 += v::acc_reduce_add(vsv0); - *ov1 += v::acc_reduce_add(vsv1); - *ov2 += v::acc_reduce_add(vsv2); - *ov3 += v::acc_reduce_add(vsv3); - *ov4 += v::acc_reduce_add(vsv4); - *ov5 += v::acc_reduce_add(vsv5); - } + if (EFLAG) { + *evdwl += v::acc_reduce_add(vsevdwl); } } template IntelKernelTersoff::fvec IntelKernelTersoff::zeta_vector( - const c_inner_t * param, - ivec xjw, bvec mask, - fvec vrij, fvec rsq2, - fvec vdijx, fvec vdijy, fvec vdijz, + const c_inner_t * param, + ivec xjw, bvec mask, + fvec vrij, fvec rsq2, + fvec vdijx, fvec vdijy, fvec vdijz, fvec dikx, fvec diky, fvec dikz ) { fvec v_1_0(1.0); @@ -1317,7 
+1250,7 @@ IntelKernelTersoff::fvec IntelKernelTersoff::fvec IntelKernelTersoff void IntelKernelTersoff::force_zeta_vector( - const c_outer_t * param, + const c_outer_t * param, ivec xjw, bvec mask, fvec vrij, fvec vzeta_ij, @@ -1469,9 +1402,9 @@ void IntelKernelTersoff::attractive_vector( vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos); } - fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; - fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; - fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; + fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; + fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; + fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; if (ZETA) *zeta = vfc * vgijk * vex_delr; fvec vminus_costheta = - vcostheta; @@ -1484,7 +1417,7 @@ void IntelKernelTersoff::attractive_vector( fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx); fvec vdcosdriy = -(vdcosdrjy + vdcosdrky); fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz); - + *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx); *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty); *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz); diff --git a/src/USER-INTEL/pair_tersoff_intel.h b/src/USER-INTEL/pair_tersoff_intel.h index c9604f2797..6da478c10f 100644 --- a/src/USER-INTEL/pair_tersoff_intel.h +++ b/src/USER-INTEL/pair_tersoff_intel.h @@ -75,14 +75,14 @@ class PairTersoffIntel : public PairTersoff { }; ForceConst force_const_single; ForceConst force_const_double; - + template void compute(int eflag, int vflag, IntelBuffers *buffers, const ForceConst &fc); - template + template void eval(const int offload, const int vflag, - IntelBuffers * buffers, - const ForceConst &fc, const int astart, const int aend); + IntelBuffers * buffers, + const ForceConst &fc, const int astart, const int aend); template void pack_force_const(ForceConst &fc, diff 
--git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp new file mode 100644 index 0000000000..ec5f5150c2 --- /dev/null +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -0,0 +1,3034 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "pppm_disp_intel.h" +#include "atom.h" +#include "error.h" +#include "fft3d_wrap.h" +#include "gridcomm.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" +#include "suffix.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; +enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE}; +enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM, + FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G, + FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A, + FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, + FORWARD_AD_PERATOM_NONE}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 
+#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : + PPPMDisp(lmp, narg, arg) +{ + suffix_flag |= Suffix::INTEL; + + order = 7; + order_6 = 7; //sets default stencil sizes to 7 + + perthread_density = NULL; + particle_ekx = particle_eky = particle_ekz = NULL; + particle_ekx0 = particle_eky0 = particle_ekz0 = NULL; + particle_ekx1 = particle_eky1 = particle_ekz1 = NULL; + particle_ekx2 = particle_eky2 = particle_ekz2 = NULL; + particle_ekx3 = particle_eky3 = particle_ekz3 = NULL; + particle_ekx4 = particle_eky4 = particle_ekz4 = NULL; + particle_ekx5 = particle_eky5 = particle_ekz5 = NULL; + particle_ekx6 = particle_eky6 = particle_ekz6 = NULL; + + rho_lookup = drho_lookup = NULL; + rho6_lookup = drho6_lookup = NULL; + rho_points = 0; + + _use_table = _use_packing = _use_lrt = 0; +} + +PPPMDispIntel::~PPPMDispIntel() +{ + memory->destroy(perthread_density); + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + + memory->destroy(rho_lookup); + memory->destroy(drho_lookup); + memory->destroy(rho6_lookup); + memory->destroy(drho6_lookup); +} + + + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + + +void PPPMDispIntel::init() +{ + + PPPMDisp::init(); + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->kspace_init_check(); + + _use_lrt = fix->lrt(); + if (_use_lrt) + error->all(FLERR, + "LRT mode is currently not supported for pppm/disp/intel"); + + + // For vectorization, we need some padding in the end + // 
The first thread computes on the global density + if ((comm->nthreads > 1) && !_use_lrt) { + memory->destroy(perthread_density); + memory->create(perthread_density, comm->nthreads-1, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:perthread_density"); + } + + _use_table = fix->pppm_table(); + if (_use_table) { + rho_points = 5000; + memory->destroy(rho_lookup); + memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho_lookup"); + memory->destroy(rho6_lookup); + memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho6_lookup"); + + if(differentiation_flag == 1) { + memory->destroy(drho_lookup); + memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho_lookup"); + memory->destroy(drho6_lookup); + memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho6_lookup"); + } + precompute_rho(); + } + if (order > INTEL_P3M_MAXORDER) + error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); +} + +/* ---------------------------------------------------------------------- + compute the PPPMDispIntel long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMDispIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + PPPMDisp::compute(eflag, vflag); + return; + } + #endif + int i; + // convert atoms from box to lamda coords + + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = evflag_atom = eflag_global = vflag_global = + eflag_atom = vflag_atom = 0; + + if (evflag_atom && !peratom_allocate_flag) { + allocate_peratom(); + if (function[0]) { + cg_peratom->ghost_notify(); + cg_peratom->setup(); + } + if (function[1] + function[2] + function[3]) { + cg_peratom_6->ghost_notify(); + cg_peratom_6->setup(); + } + peratom_allocate_flag = 1; + } + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + 
domain->x2lamda(atom->nlocal); + } + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + + if (function[0]) memory->destroy(part2grid); + if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6); + if (differentiation_flag == 1) { + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + if (function[2] == 1){ + memory->destroy(particle_ekx0); + memory->destroy(particle_eky0); + memory->destroy(particle_ekz0); + memory->destroy(particle_ekx1); + memory->destroy(particle_eky1); + memory->destroy(particle_ekz1); + memory->destroy(particle_ekx2); + memory->destroy(particle_eky2); + memory->destroy(particle_ekz2); + memory->destroy(particle_ekx3); + memory->destroy(particle_eky3); + memory->destroy(particle_ekz3); + memory->destroy(particle_ekx4); + memory->destroy(particle_eky4); + memory->destroy(particle_ekz4); + memory->destroy(particle_ekx5); + memory->destroy(particle_eky5); + memory->destroy(particle_ekz5); + memory->destroy(particle_ekx6); + memory->destroy(particle_eky6); + memory->destroy(particle_ekz6); + } + + } + nmax = atom->nmax; + if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid"); + if (function[1] + function[2] + function[3]) + memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6"); + if (differentiation_flag == 1) { + memory->create(particle_ekx, nmax, "pppmdispintel:pekx"); + memory->create(particle_eky, nmax, "pppmdispintel:peky"); + memory->create(particle_ekz, nmax, "pppmdispintel:pekz"); + if (function[2] == 1){ + memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0"); + memory->create(particle_eky0, nmax, "pppmdispintel:peky0"); + memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0"); + memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1"); + memory->create(particle_eky1, nmax, "pppmdispintel:peky1"); + memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1"); + memory->create(particle_ekx2, nmax, 
"pppmdispintel:pekx2"); + memory->create(particle_eky2, nmax, "pppmdispintel:peky2"); + memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2"); + memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3"); + memory->create(particle_eky3, nmax, "pppmdispintel:peky3"); + memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3"); + memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4"); + memory->create(particle_eky4, nmax, "pppmdispintel:peky4"); + memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4"); + memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5"); + memory->create(particle_eky5, nmax, "pppmdispintel:peky5"); + memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5"); + memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6"); + memory->create(particle_eky6, nmax, "pppmdispintel:peky6"); + memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6"); + } + } + } + energy = 0.0; + energy_1 = 0.0; + energy_6 = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0; + + // find grid points for all my particles + // distribute partcles' charges/dispersion coefficients on the grid + // communication between processors and remapping two fft + // Solution of poissons equation in k-space and backtransformation + // communication between processors + // calculation of forces + + if (function[0]) { + + //perform calculations for coulomb interactions only + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_mixed_buffers()); + make_rho_c(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, + nzlo_out, nxhi_out, nyhi_out, nzhi_out, + fix->get_double_buffers()); + make_rho_c(fix->get_double_buffers()); + } else { + particle_map(delxinv, delyinv, 
delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_single_buffers()); + make_rho_c(fix->get_single_buffers()); + } + + cg->reverse_comm(this,REVERSE_RHO); + + brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + density_brick, density_fft, work1,remap); + + if (differentiation_flag == 1) { + poisson_ad(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, + v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); + + cg->forward_comm(this,FORWARD_AD); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ad(fix->get_double_buffers()); + } else { + fieldforce_c_ad(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); + + } else { + poisson_ik(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2, + vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2, + u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, + v5_brick); + + cg->forward_comm(this, FORWARD_IK); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ik(fix->get_double_buffers()); + } else { + fieldforce_c_ik(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); + } + if (evflag_atom) fieldforce_c_peratom(); + } + + if (function[1]) { + //perfrom calculations for geometric mixing + + if (fix->precision() 
== FixIntel::PREC_MODE_MIXED) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_g(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_g(fix->get_double_buffers()); + } else { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_g(fix->get_single_buffers()); + } + + + cg_6->reverse_comm(this, REVERSE_RHO_G); + + brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, + density_brick_g, density_fft_g, work1_6,remap_6); + + if (differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, + nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, + nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, + nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, + virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, + v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_AD_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ad(fix->get_double_buffers()); + } else { + fieldforce_g_ad(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); + + } else { + poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, 
nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6, + fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, + vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, + v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_IK_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ik(fix->get_double_buffers()); + } else { + fieldforce_g_ik(fix->get_single_buffers()); + } + + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); + } + if (evflag_atom) fieldforce_g_peratom(); + } + + if (function[2]) { + //perform calculations for arithmetic mixing + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_a(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_a(fix->get_double_buffers()); + } else { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_a(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_A); + + brick2fft_a(); + + if ( differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, 
nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, + u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, + v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, + v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, + v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, + v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, + v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, + v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, + v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, + v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, + v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, + v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_AD_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ad(fix->get_double_buffers()); + } else { + fieldforce_a_ad(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); + + } else { + + poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, + fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, + virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, + v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, + vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, + vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, + v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, + u_brick_a6, 
v0_brick_a6, v1_brick_a6, v2_brick_a6, + v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, + vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, + vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, + v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, + u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, + v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, + vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, + vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, + v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, + u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, + v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_IK_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ik(fix->get_double_buffers()); + } else { + fieldforce_a_ik(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); + } + if (evflag_atom) fieldforce_a_peratom(); + } + + if (function[3]) { + //perform calculations if no mixing rule applies + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_none(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_none(fix->get_double_buffers()); + } else { + particle_map(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, 
+ nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_none(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_NONE); + + brick2fft_none(); + + if (differentiation_flag == 1) { + + int n = 0; + for (int k = 0; kforward_comm(this,FORWARD_AD_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ad(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ad(fix->get_double_buffers()); + } else { + fieldforce_none_ad(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); + + } else { + int n = 0; + for (int k = 0; kforward_comm(this,FORWARD_IK_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ik(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ik(fix->get_double_buffers()); + } else { + fieldforce_none_ik(fix->get_single_buffers()); + } + + if (evflag_atom) + cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); + } + if (evflag_atom) fieldforce_none_peratom(); + } + + // update qsum and qsqsum, if atom count has changed and energy needed + + if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // sum energy across procs and add in volume-dependent term + + const double qscale = force->qqrd2e * scale; + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_1 = energy_all; + MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_6 = energy_all; + + energy_1 *= 0.5*volume; + energy_6 *= 0.5*volume; + + energy_1 -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij + + 1.0/12.0*pow(g_ewald_6,6)*csum; + energy_1 *= qscale; + } + + // sum virial across procs + + if (vflag_global) 
{ + double virial_all[6]; + MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i]; + if (function[1]+function[2]+function[3]){ + double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij; + virial[0] -= a; + virial[1] -= a; + virial[2] -= a; + } + } + + if (eflag_atom) { + if (function[0]) { + double *q = atom->q; + for (i = 0; i < atom->nlocal; i++) { + eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]* + qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction + } + } + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] + + 1.0/12.0*pow(g_ewald_6,6)*cii[tmp]; + } + } + } + + if (vflag_atom) { + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + //dispersion self virial correction + for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)* + pow(g_ewald_6,3)*csumi[tmp]; + } + } + } + + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + if (function[0]) energy += energy_1; + if (function[1] + function[2] + function[3]) energy += energy_6; + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + + +/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::particle_map(double delx, double dely, double 
delz, + double sft, int** p2g, int nup, int nlow, + int nxlo, int nylo, int nzlo, + int nxhi, int nyhi, int nzhi, + IntelBuffers *buffers) +{ + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + int flag = 0; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\ + nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt) + #endif + { + double **x = atom->x; + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delx; + const flt_t yi = dely; + const flt_t zi = delz; + const flt_t fshift = sft; + + + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:flag) + #endif + for (int i = iifrom; i < iito; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + int nx = static_cast ((x[i][0]-lo0)*xi+fshift) - OFFSET; + int ny = static_cast ((x[i][1]-lo1)*yi+fshift) - OFFSET; + int nz = static_cast ((x[i][2]-lo2)*zi+fshift) - OFFSET; + + p2g[i][0] = nx; + p2g[i][1] = ny; + p2g[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlow < nxlo || nx+nup > nxhi || + ny+nlow < nylo || ny+nup > nyhi || + nz+nlow < nzlo || nz+nup > nzhi) + flag = 1; + } + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + 
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::make_rho_c(IntelBuffers *buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick[nzlo_out][nylo_out][nxlo_out]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + //double *q = atom->q; + //double **x = atom->x; + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + double *q = atom->q; + double **x = atom->x; + + const int nix = nxhi_out - nxlo_out + 1; + const int niy = nyhi_out - nylo_out + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + const flt_t fshiftone = shiftone; + const flt_t fdelvolinv = delvolinv; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? 
global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nysum = nlower + ny - nylo_out; + int nxsum = nlower + nx - nxlo_out; + int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + } + } + + FFT_SCALAR z0 = fdelvolinv * q[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // 
reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- geometric mixing +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::make_rho_g(IntelBuffers *buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, 
nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv * B[type]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 
0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- arithmetic mixing +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::make_rho_a(IntelBuffers *buffers) +{ + // clear 3d density array + + memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = 
global coords of moving stencil pt + + int nlocal = atom->nlocal; + + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + for (int i = 0; i < nlocal; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + const int type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if 
defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR w = x0*rho[0][l]; + density_brick_a0[mz][my][mx] += w*B[7*type]; + density_brick_a1[mz][my][mx] += w*B[7*type+1]; + density_brick_a2[mz][my][mx] += w*B[7*type+2]; + density_brick_a3[mz][my][mx] += w*B[7*type+3]; + density_brick_a4[mz][my][mx] += w*B[7*type+4]; + density_brick_a5[mz][my][mx] += w*B[7*type+5]; + density_brick_a6[mz][my][mx] += w*B[7*type+6]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- case when mixing rules don't apply +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::make_rho_none(IntelBuffers *buffers) +{ + + FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = 
delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if 
defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + FFT_SCALAR w0 = x0*rho[0][l]; + for(int k = 0; k < nsplit; k++) + my_density[mzyx + k*ngrid_6] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ik scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_c_ik(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = 
delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower;; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho_lookup[idx][k]; + rho1[k] = rho_lookup[idy][k]; + rho2[k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1 = rho_coeff[order-1][k]; + FFT_SCALAR r2 = rho_coeff[order-1][k]; + FFT_SCALAR r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + + rho0[k-nlower] = r1; + rho1[k-nlower] = r2; + rho2[k-nlower] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + 
#pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ad scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_c_ad(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double *q = atom->q; + double **x = atom->x; + 
double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm/xprd; + const flt_t hy_inv = ny_pppm/yprd; + const flt_t hz_inv = nz_pppm/zprd; + + const flt_t fsf_coeff0 = sf_coeff[0]; + const flt_t fsf_coeff1 = sf_coeff[1]; + const flt_t fsf_coeff2 = sf_coeff[2]; + const flt_t fsf_coeff3 = sf_coeff[3]; + const flt_t fsf_coeff4 = sf_coeff[4]; + const flt_t fsf_coeff5 = sf_coeff[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + drho[0][k] = drho_lookup[idx][k]; + drho[1][k] = drho_lookup[idy][k]; + drho[2][k] = drho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for 
(int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff[order-1][k]; + r2 = rho_coeff[order-1][k]; + r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1 * dx; + r2 = rho_coeff[l][k] + r2 * dy; + r3 = rho_coeff[l][k] + r3 * dz; + dr1 = drho_coeff[l][k] + dr1 * dx; + dr2 = drho_coeff[l][k] + dr2 * dy; + dr3 = drho_coeff[l][k] + dr3 * dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + drho[0][k-nlower] = dr1; + drho[1][k-nlower] = dr2; + drho[2][k-nlower] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const 
flt_t qfactor = fqqrd2es * q[i]; + const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoqsq; + f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoqsq; + f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoqsq; + + if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for geometric mixing rule +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_g_ik(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 
64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int 
mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick_g[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + type = atom->type[i]; + lj = B[type]; + f[i][0] += lj*ekx; + f[i][1] += lj*eky; + if (slabflag != 2) f[i][2] += lj*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for geometric mixing rule for ad scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_g_ad(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; 
+ + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 
* dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const int type = atom->type[i]; + const flt_t lj = B[type]; + const flt_t twoljsq = 2.*lj*lj; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += 
fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoljsq; + f[i][0] += lj * particle_ekx[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoljsq; + f[i][1] += lj * particle_eky[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoljsq; + + if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_a_ik(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + 
FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = 
{0}; + _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx]; + eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx]; + ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx]; + ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx]; + eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx]; + ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx]; + ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx]; + eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx]; + ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx]; + ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx]; + eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx]; + ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx]; + ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx]; + eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx]; + ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx]; + ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx]; + eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx]; + ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx]; + ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx]; + eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx]; + ekz6_arr[l] -= 
x0*vdz_brick_a6[mz][my][mx]; + } + } + } + + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx0 += ekx0_arr[l]; + eky0 += eky0_arr[l]; + ekz0 += ekz0_arr[l]; + ekx1 += ekx1_arr[l]; + eky1 += eky1_arr[l]; + ekz1 += ekz1_arr[l]; + ekx2 += ekx2_arr[l]; + eky2 += eky2_arr[l]; + ekz2 += ekz2_arr[l]; + ekx3 += ekx3_arr[l]; + eky3 += eky3_arr[l]; + ekz3 += ekz3_arr[l]; + ekx4 += ekx4_arr[l]; + eky4 += eky4_arr[l]; + ekz4 += ekz4_arr[l]; + ekx5 += ekx5_arr[l]; + eky5 += eky5_arr[l]; + ekz5 += ekz5_arr[l]; + ekx6 += ekx6_arr[l]; + eky6 += eky6_arr[l]; + ekz6 += ekz6_arr[l]; + } + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + + lj4*ekx4 + lj5*ekx5 + lj6*ekx6; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + + lj4*eky4 + lj5*eky5 + lj6*eky6; + if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_a_ad(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric 
field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0; + FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0; + FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0; + FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1; + FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1; + FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1; + FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2; + FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2; + FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2; + FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3; + FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3; + FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3; + FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4; + FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4; + FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4; + FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5; + FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5; + FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5; + FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6; + FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6; + FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t 
ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= 
nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + 
_alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF; + particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; + particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF; + particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF; + particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF; + particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF; + particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR x0 = drho[0][l] * ekx_p; + FFT_SCALAR y0 = rho[0][l] * eky_p; + FFT_SCALAR z0 = rho[0][l] * ekz_p; + + ekx0[l] += x0 * u_brick_a0[mz][my][mx]; + eky0[l] += y0 * u_brick_a0[mz][my][mx]; + ekz0[l] += z0 * u_brick_a0[mz][my][mx]; + ekx1[l] += x0 * u_brick_a1[mz][my][mx]; + eky1[l] += y0 * u_brick_a1[mz][my][mx]; + ekz1[l] += z0 * u_brick_a1[mz][my][mx]; + ekx2[l] += x0 * u_brick_a2[mz][my][mx]; + eky2[l] += y0 * u_brick_a2[mz][my][mx]; + ekz2[l] += z0 * u_brick_a2[mz][my][mx]; + ekx3[l] += x0 * u_brick_a3[mz][my][mx]; + eky3[l] += y0 * u_brick_a3[mz][my][mx]; + ekz3[l] += z0 * u_brick_a3[mz][my][mx]; + ekx4[l] += x0 * u_brick_a4[mz][my][mx]; + eky4[l] += y0 * u_brick_a4[mz][my][mx]; + ekz4[l] += z0 * u_brick_a4[mz][my][mx]; + ekx5[l] += x0 * u_brick_a5[mz][my][mx]; + eky5[l] += y0 * u_brick_a5[mz][my][mx]; + ekz5[l] += z0 * u_brick_a5[mz][my][mx]; + ekx6[l] += x0 * u_brick_a6[mz][my][mx]; + eky6[l] += y0 * 
u_brick_a6[mz][my][mx]; + ekz6[l] += z0 * u_brick_a6[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx0[i] += ekx0[l]; + particle_eky0[i] += eky0[l]; + particle_ekz0[i] += ekz0[l]; + particle_ekx1[i] += ekx1[l]; + particle_eky1[i] += eky1[l]; + particle_ekz1[i] += ekz1[l]; + particle_ekx2[i] += ekx2[l]; + particle_eky2[i] += eky2[l]; + particle_ekz2[i] += ekz2[l]; + particle_ekx3[i] += ekx3[l]; + particle_eky3[i] += eky3[l]; + particle_ekz3[i] += ekz3[l]; + particle_ekx4[i] += ekx4[l]; + particle_eky4[i] += eky4[l]; + particle_ekz4[i] += ekz4[l]; + particle_ekx5[i] += ekx5[l]; + particle_eky5[i] += eky5[l]; + particle_ekz5[i] += ekz5[l]; + particle_ekx6[i] += ekx6[l]; + particle_eky6[i] += eky6[l]; + particle_ekz6[i] += ekz6[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx0[i] *= hx_inv; + particle_eky0[i] *= hy_inv; + particle_ekz0[i] *= hz_inv; + particle_ekx1[i] *= hx_inv; + particle_eky1[i] *= hy_inv; + particle_ekz1[i] *= hz_inv; + particle_ekx2[i] *= hx_inv; + particle_eky2[i] *= hy_inv; + particle_ekz2[i] *= hz_inv; + particle_ekx3[i] *= hx_inv; + particle_eky3[i] *= hy_inv; + particle_ekz3[i] *= hz_inv; + particle_ekx4[i] *= hx_inv; + particle_eky4[i] *= hy_inv; + particle_ekz4[i] *= hz_inv; + particle_ekx5[i] *= hx_inv; + particle_eky5[i] *= hy_inv; + particle_ekz5[i] *= hz_inv; + particle_ekx6[i] *= hx_inv; + particle_eky6[i] *= hy_inv; + particle_ekz6[i] *= hz_inv; + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t 
s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] + + lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] + + lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] + + lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] + + lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + if (slabflag != 2) + f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] + + lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] + + lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_none_ik(IntelBuffers *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = 
boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + + _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx_arr[k] = 
eky_arr[k] = ekz_arr[k] = ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdx_brick_none[k][mz][my][mx]; + eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdy_brick_none[k][mz][my][mx]; + ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdz_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx[nsplit], 64); + _alignvar(FFT_SCALAR eky[nsplit], 64); + _alignvar(FFT_SCALAR ekz[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx[k] = eky[k] = ekz[k] = ZEROF; + } + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int k = 0; k < nsplit; k++) { + ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + } + } + + // convert E-field to force + + type = atom->type[i]; + for (int k = 0; k < nsplit; k++) { + lj = B[nsplit*type + k]; + f[i][0] += lj*ekx[k]; + f[i][1] += lj*eky[k]; + if (slabflag != 2) f[i][2] += lj*ekz[k]; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template +void PPPMDispIntel::fieldforce_none_ad(IntelBuffers *buffers) +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + 
// (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = 
dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx[k]=eky[k]=ekz[k]=ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < 
INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * + u_brick_none[k][mz][my][mx]; + eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * eky_p * + u_brick_none[k][mz][my][mx]; + ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * ekz_p * + u_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx_tot[nsplit], 64); + _alignvar(FFT_SCALAR eky_tot[nsplit], 64); + _alignvar(FFT_SCALAR ekz_tot[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + } + } + + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] *= hx_inv; + eky_tot[k] *= hy_inv; + ekz_tot[k] *= hz_inv; + } + // convert D-field to force + + const int type = atom->type[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1); + sf1 += fsf_coeff1 * sin(ffour_pi * s1); + + flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2); + sf2 += fsf_coeff3 * sin(ffour_pi * s2); + + flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3); + sf3 += fsf_coeff5 * sin(ffour_pi * s3); + for (int k = 0; k < nsplit; k++) { + const flt_t lj = B[nsplit*type + k]; + const flt_t twoljsq = lj*lj * B[k] * 2; + flt_t sf = sf1*twoljsq; + f[i][0] += lj * ekx_tot[k] - sf; + sf = sf2*twoljsq; + f[i][1] += lj * eky_tot[k] - sf; + sf = sf3*twoljsq; + if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + precompute rho coefficients as a lookup table to save time in make_rho + and fieldforce. 
Instead of doing this polynomial for every atom 6 times + per time step, precompute it for some number of points. +------------------------------------------------------------------------- */ + +void PPPMDispIntel::precompute_rho() +{ + + half_rho_scale = (rho_points - 1.)/2.; + half_rho_scale_plus = half_rho_scale + 0.5; + + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; + } + rho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; + } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; + } + } + } + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-1; l>=0; l--){ + r1 = rho_coeff_6[l][k] + r1*dx; + } + rho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho6_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-2; l>=0; l--){ + r1 = drho_coeff_6[l][k] + r1*dx; + } + drho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho6_lookup[i][k] = 0; + } + } + } +} + +/* ---------------------------------------------------------------------- + Returns 0 if Intel optimizations for PPPM ignored due to offload +------------------------------------------------------------------------- */ + +#ifdef _LMP_INTEL_OFFLOAD +int PPPMDispIntel::use_base() { + return _use_base; +} +#endif diff --git a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h new file mode 100644 index 0000000000..65c43dd486 --- /dev/null +++ b/src/USER-INTEL/pppm_disp_intel.h @@ -0,0 +1,238 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/disp/intel,PPPMDispIntel) + +#else + +#ifndef LMP_PPPMINTEL_DISP_H +#define LMP_PPPMINTEL_DISP_H + +#include "pppm_disp.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + + class PPPMDispIntel : public PPPMDisp { + public: + PPPMDispIntel(class LAMMPS *, int, char **); + virtual ~PPPMDispIntel(); + virtual void init(); + virtual void compute(int, int); + + #ifdef _LMP_INTEL_OFFLOAD + int use_base(); + #endif + + protected: + FixIntel *fix; + + int _use_lrt; + FFT_SCALAR **perthread_density; + FFT_SCALAR *particle_ekx; + FFT_SCALAR *particle_eky; + FFT_SCALAR *particle_ekz; + FFT_SCALAR *particle_ekx0; + FFT_SCALAR *particle_eky0; + FFT_SCALAR *particle_ekz0; + FFT_SCALAR *particle_ekx1; + FFT_SCALAR *particle_eky1; + FFT_SCALAR *particle_ekz1; + FFT_SCALAR *particle_ekx2; + FFT_SCALAR *particle_eky2; + FFT_SCALAR *particle_ekz2; + FFT_SCALAR *particle_ekx3; + FFT_SCALAR *particle_eky3; + FFT_SCALAR *particle_ekz3; + FFT_SCALAR *particle_ekx4; + FFT_SCALAR *particle_eky4; + FFT_SCALAR *particle_ekz4; + FFT_SCALAR *particle_ekx5; + FFT_SCALAR *particle_eky5; + FFT_SCALAR *particle_ekz5; + FFT_SCALAR *particle_ekx6; + FFT_SCALAR *particle_eky6; + FFT_SCALAR *particle_ekz6; + + + + int _use_table; + int rho_points; + FFT_SCALAR **rho_lookup; + FFT_SCALAR **rho6_lookup; + FFT_SCALAR **drho_lookup; + FFT_SCALAR **drho6_lookup; + FFT_SCALAR half_rho_scale, half_rho_scale_plus; + + int _use_packing; + + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template + void particle_map(double, double, double, + double, int **, int, int, + int, int, int, + int, int, int, + IntelBuffers *buffers); + + template + void 
make_rho_c(IntelBuffers *buffers); + template + void make_rho_c(IntelBuffers *buffers) { + if (_use_table == 1) { + make_rho_c(buffers); + } else { + make_rho_c(buffers); + } + } + + template + void make_rho_g(IntelBuffers *buffers); + template + void make_rho_g(IntelBuffers *buffers) { + if (_use_table == 1) { + make_rho_g(buffers); + } else { + make_rho_g(buffers); + } + } + + template + void make_rho_a(IntelBuffers *buffers); + template + void make_rho_a(IntelBuffers *buffers) { + if (_use_table == 1) { + make_rho_a(buffers); + } else { + make_rho_a(buffers); + } + } + + + template + void make_rho_none(IntelBuffers *buffers); + template + void make_rho_none(IntelBuffers *buffers) { + if (_use_table == 1) { + make_rho_none(buffers); + } else { + make_rho_none(buffers); + } + } + + + template + void fieldforce_c_ik(IntelBuffers *buffers); + template + void fieldforce_c_ik(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_c_ik(buffers); + } else { + fieldforce_c_ik(buffers); + } + } + + template + void fieldforce_c_ad(IntelBuffers *buffers); + template + void fieldforce_c_ad(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_c_ad(buffers); + } else { + fieldforce_c_ad(buffers); + } + } + + template + void fieldforce_g_ik(IntelBuffers *buffers); + template + void fieldforce_g_ik(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_g_ik(buffers); + } else { + fieldforce_g_ik(buffers); + } + } + + template + void fieldforce_g_ad(IntelBuffers *buffers); + template + void fieldforce_g_ad(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_g_ad(buffers); + } else { + fieldforce_g_ad(buffers); + } + } + + template + void fieldforce_a_ik(IntelBuffers *buffers); + template + void fieldforce_a_ik(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_a_ik(buffers); + } else { + fieldforce_a_ik(buffers); + } + } + + template + void fieldforce_a_ad(IntelBuffers *buffers); + template + void fieldforce_a_ad(IntelBuffers 
*buffers) { + if (_use_table == 1) { + fieldforce_a_ad(buffers); + } else { + fieldforce_a_ad(buffers); + } + } + template + void fieldforce_none_ik(IntelBuffers *buffers); + template + void fieldforce_none_ik(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_none_ik(buffers); + } else { + fieldforce_none_ik(buffers); + } + } + + template + void fieldforce_none_ad(IntelBuffers *buffers); + template + void fieldforce_none_ad(IntelBuffers *buffers) { + if (_use_table == 1) { + fieldforce_none_ad(buffers); + } else { + fieldforce_none_ad(buffers); + } + } + + void precompute_rho(); + + }; + +} +#endif +#endif + + diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index c420a23bf4..8416b6f3a3 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -12,7 +12,9 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Rodrigo Canales (RWTH Aachen University) + Contributing authors: William McDoniel (RWTH Aachen University) + Rodrigo Canales (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -22,6 +24,7 @@ #include "pppm_intel.h" #include "atom.h" #include "error.h" +#include "fft3d_wrap.h" #include "gridcomm.h" #include "math_const.h" #include "math_special.h" @@ -54,10 +57,37 @@ enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) { suffix_flag |= Suffix::INTEL; + + order = 7; //sets default stencil size to 7 + + perthread_density = NULL; + particle_ekx = particle_eky = particle_ekz = NULL; + + rho_lookup = drho_lookup = NULL; + rho_points = 0; + + vdxy_brick = vdz0_brick = NULL; + work3 = NULL; + cg_pack = NULL; + + _use_table = _use_packing = _use_lrt = 0; } PPPMIntel::~PPPMIntel() { + memory->destroy(perthread_density); + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + + memory->destroy(rho_lookup); + memory->destroy(drho_lookup); + + memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->destroy(work3); + + delete cg_pack; } /* ---------------------------------------------------------------------- @@ -83,17 +113,64 @@ void PPPMIntel::init() fix->kspace_init_check(); + _use_lrt = fix->lrt(); + + // For vectorization, we need some padding in the end + // The first thread computes on the global density + if ((comm->nthreads > 1) && !_use_lrt) { + memory->destroy(perthread_density); + memory->create(perthread_density, comm->nthreads-1, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:perthread_density"); + } + + _use_table = fix->pppm_table(); + if (_use_table) { + rho_points = 5000; + memory->destroy(rho_lookup); + memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:rho_lookup"); + if(differentiation_flag == 1) { + memory->destroy(drho_lookup); + memory->create(drho_lookup, 
rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:drho_lookup"); + } + precompute_rho(); + } + if (order > INTEL_P3M_MAXORDER) error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); - /* - if (fix->precision() == FixIntel::PREC_MODE_MIXED) - pack_force_const(force_const_single, fix->get_mixed_buffers()); - else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) - pack_force_const(force_const_double, fix->get_double_buffers()); - else - pack_force_const(force_const_single, fix->get_single_buffers()); - */ + _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) + && (sizeof(FFT_SCALAR) == sizeof(float)) + && (differentiation_flag == 0); + if (_use_packing) { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdxy_brick"); + memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdz0_brick"); + memory->destroy(work3); + memory->create(work3, 2*nfft_both, "pppmintel:work3"); + + // new communicator for the double-size bricks + delete cg_pack; + int (*procneigh)[2] = comm->procneigh; + cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in, + nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1, + nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); + + cg_pack->ghost_notify(); + cg_pack->setup(); + } } /* ---------------------------------------------------------------------- @@ -154,8 +231,18 @@ void PPPMIntel::compute_first(int eflag, int vflag) if (atom->nmax > nmax) { 
memory->destroy(part2grid); + if (differentiation_flag == 1) { + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + } nmax = atom->nmax; memory->create(part2grid,nmax,3,"pppm:part2grid"); + if (differentiation_flag == 1) { + memory->create(particle_ekx, nmax, "pppmintel:pekx"); + memory->create(particle_eky, nmax, "pppmintel:peky"); + memory->create(particle_ekz, nmax, "pppmintel:pekz"); + } } // find grid points for all my particles @@ -184,13 +271,19 @@ void PPPMIntel::compute_first(int eflag, int vflag) // return gradients (electric fields) in 3d brick decomposition // also performs per-atom calculations via poisson_peratom() - poisson(); + if (differentiation_flag == 1) poisson_ad(); + else poisson_ik_intel(); // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + else { + if (_use_packing) + cg_pack->forward_comm(this,FORWARD_IK); + else + cg->forward_comm(this,FORWARD_IK); + } // extra per-atom energy/virial communication @@ -297,48 +390,60 @@ void PPPMIntel::compute_second(int eflag, int vflag) template void PPPMIntel::particle_map(IntelBuffers *buffers) { - int nx,ny,nz; - ATOM_T * _noalias const x = buffers->get_x(0); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; int flag = 0; if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshift = shift; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) reduction(+:flag) if(!_use_lrt) #endif - 
for (int i = 0; i < nlocal; i++) { + { + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // current particle coord can be outside global and local box - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); - nx = static_cast ((x[i].x-lo0)*xi+fshift) - OFFSET; - ny = static_cast ((x[i].y-lo1)*yi+fshift) - OFFSET; - nz = static_cast ((x[i].z-lo2)*zi+fshift) - OFFSET; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:flag) + #endif + for (int i = iifrom; i < iito; i++) { - part2grid[i][0] = nx; - part2grid[i][1] = ny; - part2grid[i][2] = nz; + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - // check that entire stencil around nx,ny,nz will fit in my 3d brick + int nx = static_cast ((x[i].x-lo0)*xi+fshift) - OFFSET; + int ny = static_cast ((x[i].y-lo1)*yi+fshift) - OFFSET; + int nz = static_cast ((x[i].z-lo2)*zi+fshift) - OFFSET; - if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || - ny+nlower < nylo_out || ny+nupper > nyhi_out || - nz+nlower < nzlo_out || nz+nupper > nzhi_out) - flag = 1; + part2grid[i][0] = nx; + part2grid[i][1] = ny; + part2grid[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; + } } if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); @@ -352,13 +457,11 @@ void PPPMIntel::particle_map(IntelBuffers *buffers) 
in global grid ------------------------------------------------------------------------- */ -template +template void PPPMIntel::make_rho(IntelBuffers *buffers) { - // clear 3d density array - - memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, - ngrid*sizeof(FFT_SCALAR)); + FFT_SCALAR * _noalias global_density = + &(density_brick[nzlo_out][nylo_out][nxlo_out]); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -368,52 +471,129 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) ATOM_T * _noalias const x = buffers->get_x(0); flt_t * _noalias const q = buffers->get_q(0); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshift = shift; - const flt_t fshiftone = shiftone; - const flt_t fdelvolinv = delvolinv; + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + const int nix = nxhi_out - nxlo_out + 1; + const int niy = nyhi_out - nylo_out + 1; - for (int i = 0; i < nlocal; i++) { + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + const flt_t fshiftone = shiftone; + const flt_t fdelvolinv = delvolinv; - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? 
+ global_density : perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); - flt_t rho[3][INTEL_P3M_MAXORDER]; + for (int i = ifrom; i < ito; i++) { - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; - for (int l = order-1; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; + int nysum = nlower + ny - nylo_out; + int nxsum = nlower + nx - nxlo_out; + int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + } } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; - } - FFT_SCALAR z0 = fdelvolinv * q[i]; - for (int n = nlower; n <= nupper; n++) { - int mz = n+nz; - FFT_SCALAR y0 = z0*rho[2][n-nlower]; - for (int m = nlower; m <= nupper; m++) { - int my = m+ny; - FFT_SCALAR x0 = y0*rho[1][m-nlower]; 
- for (int l = nlower; l <= nupper; l++) { - int mx = l+nx; - density_brick[mz][my][mx] += x0*rho[0][l-nlower]; + FFT_SCALAR z0 = fdelvolinv * q[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + if (nthr > 1) { + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; } } } @@ -424,7 +604,7 @@ void PPPMIntel::make_rho(IntelBuffers *buffers) interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -template +template void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) { // loop over my charges, interpolate electric field from nearby grid points @@ -437,68 +617,151 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = 
qqrd2e * scale; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nontemporal - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) #endif - for (int i = 0; i < nlocal; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + { + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; - flt_t rho[3][INTEL_P3M_MAXORDER]; + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1 = rho_coeff[order-1][k]; - FFT_SCALAR r2 = rho_coeff[order-1][k]; - FFT_SCALAR r3 = rho_coeff[order-1][k]; - for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; - } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; - } + _alignvar(flt_t rho0[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - FFT_SCALAR ekx, eky, ekz; - ekx = eky = ekz = ZEROF; - for (int n = nlower; n <= nupper; n++) { - int mz = n+nz; - FFT_SCALAR z0 = rho[2][n-nlower]; - for (int m = nlower; m <= nupper; m++) { - int my = m+ny; - FFT_SCALAR y0 = z0*rho[1][m-nlower]; - for (int l = nlower; l <= nupper; l++) { - int mx = l+nx; - FFT_SCALAR x0 = y0*rho[0][l-nlower]; - ekx -= x0*vdx_brick[mz][my][mx]; - eky -= x0*vdy_brick[mz][my][mx]; - ekz -= x0*vdz_brick[mz][my][mx]; + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = 
part2grid[i][2]; + + int nxsum = (use_packing ? 2 : 1) * (nx + nlower); + int nysum = ny + nlower; + int nzsum = nz + nlower;; + + FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + if (use_packing) { + rho0[2 * k] = rho_lookup[idx][k]; + rho0[2 * k + 1] = rho_lookup[idx][k]; + } else { + rho0[k] = rho_lookup[idx][k]; + } + rho1[k] = rho_lookup[idy][k]; + rho2[k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1 = rho_coeff[order-1][k]; + FFT_SCALAR r2 = rho_coeff[order-1][k]; + FFT_SCALAR r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + if (use_packing) { + rho0[2 * (k-nlower)] = r1; + rho0[2 * (k-nlower) + 1] = r1; + } else { + rho0[k-nlower] = r1; + } + rho1[k-nlower] = r2; + rho2[k-nlower] = r3; } } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekxy_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m+nysum; 
+ FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < (use_packing ? 2 : 1) * + INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + if (use_packing) { + ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx]; + ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx]; + } else { + ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; + } + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + if (use_packing) { + for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) { + ekx += ekxy_arr[l]; + eky += ekxy_arr[l+1]; + ekz += ekz0_arr[l]; + } + } else { + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + } + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + f[i].x += qfactor*ekx; + f[i].y += qfactor*eky; + if (slabflag != 2) f[i].z += qfactor*ekz; } - - // convert E-field to force - - const flt_t qfactor = fqqrd2es * q[i]; - f[i].x += qfactor*ekx; - f[i].y += qfactor*eky; - if (slabflag != 2) f[i].z += qfactor*ekz; } } @@ -506,7 +769,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) interpolate from grid to get electric field & force on my particles for ad ------------------------------------------------------------------------- */ -template +template void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) { // loop over my charges, interpolate electric field from nearby grid points @@ -519,120 +782,436 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) const flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const 
particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = qqrd2e * scale; - - const double *prd = domain->prd; - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]; - - const flt_t hx_inv = nx_pppm/xprd; - const flt_t hy_inv = ny_pppm/yprd; - const flt_t hz_inv = nz_pppm/zprd; - - const flt_t fsf_coeff0 = sf_coeff[0]; - const flt_t fsf_coeff1 = sf_coeff[1]; - const flt_t fsf_coeff2 = sf_coeff[2]; - const flt_t fsf_coeff3 = sf_coeff[3]; - const flt_t fsf_coeff4 = sf_coeff[4]; - const flt_t fsf_coeff5 = sf_coeff[5]; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nontemporal - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) #endif - for (int i = 0; i < nlocal; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + { + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; - flt_t rho[3][INTEL_P3M_MAXORDER]; - flt_t drho[3][INTEL_P3M_MAXORDER]; + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; + const double *prd = domain->prd; + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; - r1 = rho_coeff[order-1][k]; - r2 = rho_coeff[order-1][k]; - r3 = 
rho_coeff[order-1][k]; - for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1 * dx; - r2 = rho_coeff[l][k] + r2 * dy; - r3 = rho_coeff[l][k] + r3 * dz; - dr1 = drho_coeff[l][k] + dr1 * dx; - dr2 = drho_coeff[l][k] + dr2 * dy; - dr3 = drho_coeff[l][k] + dr3 * dz; - } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; - drho[0][k-nlower] = dr1; - drho[1][k-nlower] = dr2; - drho[2][k-nlower] = dr3; - } + const flt_t hx_inv = nx_pppm/xprd; + const flt_t hy_inv = ny_pppm/yprd; + const flt_t hz_inv = nz_pppm/zprd; - FFT_SCALAR ekx, eky, ekz; - ekx = eky = ekz = ZEROF; - for (int n = nlower; n <= nupper; n++) { - int mz = n+nz; - for (int m = nlower; m <= nupper; m++) { - int my = m+ny; - FFT_SCALAR ekx_p = rho[1][m-nlower] * rho[2][n-nlower]; - FFT_SCALAR eky_p = drho[1][m-nlower] * rho[2][n-nlower]; - FFT_SCALAR ekz_p = rho[1][m-nlower] * drho[2][n-nlower]; - for (int l = nlower; l <= nupper; l++) { - int mx = l+nx; - ekx += drho[0][l-nlower] * ekx_p * u_brick[mz][my][mx]; - eky += rho[0][l-nlower] * eky_p * u_brick[mz][my][mx]; - ekz += rho[0][l-nlower] * ekz_p * u_brick[mz][my][mx]; + const flt_t fsf_coeff0 = sf_coeff[0]; + const flt_t fsf_coeff1 = sf_coeff[1]; + const flt_t fsf_coeff2 = sf_coeff[2]; + const flt_t fsf_coeff3 = sf_coeff[3]; + const flt_t fsf_coeff4 = sf_coeff[4]; + const flt_t fsf_coeff5 = sf_coeff[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + if (use_table) { + dx = dx*half_rho_scale + 
half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + drho[0][k] = drho_lookup[idx][k]; + drho[1][k] = drho_lookup[idy][k]; + drho[2][k] = drho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff[order-1][k]; + r2 = rho_coeff[order-1][k]; + r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1 * dx; + r2 = rho_coeff[l][k] + r2 * dy; + r3 = rho_coeff[l][k] + r3 * dz; + dr1 = drho_coeff[l][k] + dr1 * dx; + dr2 = drho_coeff[l][k] + dr2 * dy; + dr3 = drho_coeff[l][k] + dr3 * dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + drho[0][k-nlower] = dr1; + drho[1][k-nlower] = dr2; + drho[2][k-nlower] = dr3; } } + + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * 
u_brick[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } } - ekx *= hx_inv; - eky *= hy_inv; - ekz *= hz_inv; - // convert E-field to force + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; - const flt_t qfactor = fqqrd2es * q[i]; - const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; + // convert E-field to force - const flt_t s1 = x[i].x * hx_inv; - const flt_t s2 = x[i].y * hy_inv; - const flt_t s3 = x[i].z * hz_inv; - flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); - sf += fsf_coeff1 * sin(ffour_pi * s1); - sf *= twoqsq; - f[i].x += qfactor * ekx - fqqrd2es * sf; + const flt_t qfactor = fqqrd2es * q[i]; + const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; - sf = fsf_coeff2 * sin(ftwo_pi * s2); - sf += fsf_coeff3 * sin(ffour_pi * s2); - sf *= twoqsq; - f[i].y += qfactor * eky - fqqrd2es * sf; + const flt_t s1 = x[i].x * hx_inv; + const flt_t s2 = x[i].y * hy_inv; + const flt_t s3 = x[i].z * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoqsq; + f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf; - sf = fsf_coeff4 * sin(ftwo_pi * s3); - sf += fsf_coeff5 * sin(ffour_pi * s3); - sf *= twoqsq; + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoqsq; + f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf; - if (slabflag != 2) f[i].z += qfactor * ekz - fqqrd2es * sf; + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoqsq; + + if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf; + } } } +/* 
---------------------------------------------------------------------- + FFT-based Poisson solver for ik + Does special things for packing mode to avoid repeated copies +------------------------------------------------------------------------- */ + +void PPPMIntel::poisson_ik_intel() +{ + if (_use_packing == 0) { + poisson_ik(); + return; + } + + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (evflag_atom) poisson_peratom(); + + // triclinic system + + if (triclinic) { + poisson_ik_triclinic(); + return; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + // y direction gradient + + n = 0; + for (k 
= nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work3[n] = fky[j]*work1[n+1]; + work3[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work3,work3,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdxy_brick[k][j][2*i] = work2[n]; + vdxy_brick[k][j][2*i+1] = work3[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz0_brick[k][j][2*i] = work2[n]; + vdz0_brick[k][j][2*i+1] = 0.; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + precompute rho coefficients as a lookup table to save time in make_rho + and fieldforce. Instead of doing this polynomial for every atom 6 times + per time step, precompute it for some number of points. +------------------------------------------------------------------------- */ + +void PPPMIntel::precompute_rho() +{ + + half_rho_scale = (rho_points - 1.)/2.; + half_rho_scale_plus = half_rho_scale + 0.5; + + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; + } + rho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; + } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; + } + } + } +} + +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPMIntel::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +{ + int n = 0; + + if ((flag == FORWARD_IK) && _use_packing) { + FFT_SCALAR *xsrc = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out]; + FFT_SCALAR *zsrc = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = xsrc[list[i]]; + buf[n++] = zsrc[list[i]]; + } + } else { + PPPM::pack_forward(flag, buf, nlist, list); + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ + +void PPPMIntel::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +{ + int n = 0; + + if ((flag == FORWARD_IK) && _use_packing) { + FFT_SCALAR *xdest = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out]; + FFT_SCALAR *zdest = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out]; + for (int i = 0; i < nlist; i++) { + xdest[list[i]] = buf[n++]; + zdest[list[i]] = buf[n++]; + } + } else { + 
PPPM::unpack_forward(flag, buf, nlist, list); + } +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPMIntel::memory_usage() +{ + double bytes = PPPM::memory_usage(); + if ((comm->nthreads > 1) && !_use_lrt) { + bytes += (comm->nthreads - 1) * (ngrid + INTEL_P3M_ALIGNED_MAXORDER) * + sizeof(FFT_SCALAR); + } + if (differentiation_flag == 1) { + bytes += 3 * nmax * sizeof(FFT_SCALAR); + } + if (_use_table) { + bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR); + if (differentiation_flag == 1) { + bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR); + } + } + if (_use_packing) { + bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR); + bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1) + * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR); + bytes += 2 * nfft_both * sizeof(FFT_SCALAR); + bytes += cg_pack->memory_usage(); + } + return bytes; +} + /* ---------------------------------------------------------------------- Pack data into intel package buffers if using LRT mode ------------------------------------------------------------------------- */ @@ -640,13 +1219,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) void PPPMIntel::pack_buffers() { fix->start_watch(TIME_PACK); + int packthreads; + if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, - comm->nthreads, + packthreads, sizeof(IntelBuffers::atom_t)); if (fix->precision() == FixIntel::PREC_MODE_MIXED) fix->get_mixed_buffers()->thr_pack(ifrom,ito,1); diff --git a/src/USER-INTEL/pppm_intel.h 
b/src/USER-INTEL/pppm_intel.h index 40669a5561..e152486b29 100644 --- a/src/USER-INTEL/pppm_intel.h +++ b/src/USER-INTEL/pppm_intel.h @@ -1,4 +1,4 @@ -/* -*- c++ -*- ---------------------------------------------------------- +/* *- c++ -*- ----------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov @@ -12,7 +12,9 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Rodrigo Canales (RWTH Aachen University) + Contributing authors: William McDoniel (RWTH Aachen University) + Rodrigo Canales (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -36,6 +38,9 @@ class PPPMIntel : public PPPM { virtual ~PPPMIntel(); virtual void init(); virtual void compute(int, int); + virtual void pack_forward(int, FFT_SCALAR *, int, int *); + virtual void unpack_forward(int, FFT_SCALAR *, int, int *); + virtual double memory_usage(); void compute_first(int, int); void compute_second(int, int); void pack_buffers(); @@ -47,18 +52,74 @@ class PPPMIntel : public PPPM { protected: FixIntel *fix; + int _use_lrt; + FFT_SCALAR **perthread_density; + FFT_SCALAR *particle_ekx; + FFT_SCALAR *particle_eky; + FFT_SCALAR *particle_ekz; + + int _use_table; + int rho_points; + FFT_SCALAR **rho_lookup; + FFT_SCALAR **drho_lookup; + FFT_SCALAR half_rho_scale, half_rho_scale_plus; + + int _use_packing; + FFT_SCALAR ***vdxy_brick; + FFT_SCALAR ***vdz0_brick; + FFT_SCALAR *work3; + class GridComm *cg_pack; + #ifdef _LMP_INTEL_OFFLOAD int _use_base; #endif + template + void test_function(IntelBuffers *buffers); + + + void precompute_rho(); template void particle_map(IntelBuffers *buffers); - template + template 
void make_rho(IntelBuffers<flt_t,acc_t> *buffers); template<class flt_t, class acc_t> + void make_rho(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho<flt_t,acc_t,1>(buffers); + } else { + make_rho<flt_t,acc_t,0>(buffers); + } + } + void poisson_ik_intel(); + template<class flt_t, class acc_t, int use_table, int use_packing> void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers); template<class flt_t, class acc_t> + void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + if (_use_packing == 1) { + fieldforce_ik<flt_t,acc_t,1,1>(buffers); + } else { + fieldforce_ik<flt_t,acc_t,1,0>(buffers); + } + } else { + if (_use_packing == 1) { + fieldforce_ik<flt_t,acc_t,0,1>(buffers); + } else { + fieldforce_ik<flt_t,acc_t,0,0>(buffers); + } + } + } + template<class flt_t, class acc_t, int use_table> void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers); template<class flt_t, class acc_t> + void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_ad<flt_t,acc_t,0>(buffers); + } + } }; } diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index afb7852f98..81f4586143 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -51,7 +51,7 @@ VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) : /* ---------------------------------------------------------------------- */ -VerletLRTIntel::~VerletLRTIntel() +VerletLRTIntel::~VerletLRTIntel() { #if defined(_LMP_INTEL_LRT_PTHREAD) pthread_mutex_destroy(&_kmutex); @@ -67,10 +67,10 @@ void VerletLRTIntel::init() Verlet::init(); _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - + #ifdef LMP_INTEL_NOLRT - error->all(FLERR, - "LRT otion for Intel package disabled at compile time"); + error->all(FLERR, + "LRT option for Intel package disabled at compile time"); #endif } @@ -78,17 +78,17 @@ setup before run ------------------------------------------------------------------------- */ -void VerletLRTIntel::setup() +void VerletLRTIntel::setup(int flag) { if (_intel_kspace == 0) { - Verlet::setup(); + Verlet::setup(flag); return; - } + } #ifdef _LMP_INTEL_OFFLOAD if (_intel_kspace->use_base()) { _intel_kspace = 0; - Verlet::setup(); + 
Verlet::setup(flag); return; } #endif @@ -154,15 +154,15 @@ void VerletLRTIntel::setup() _intel_kspace->setup(); #if defined(_LMP_INTEL_LRT_PTHREAD) - pthread_create(&_kspace_thread, &_kspace_attr, - &VerletLRTIntel::k_launch_loop, this); + pthread_create(&_kspace_thread, &_kspace_attr, + &VerletLRTIntel::k_launch_loop, this); #elif defined(_LMP_INTEL_LRT_11) std::thread kspace_thread; - if (kspace_compute_flag) - _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, + if (kspace_compute_flag) + _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, vflag); }); - else - _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, + else + _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, vflag); }); #endif @@ -297,8 +297,8 @@ void VerletLRTIntel::run(int n) pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) std::thread kspace_thread; - if (kspace_compute_flag) - kspace_thread=std::thread([=] { + if (kspace_compute_flag) + kspace_thread=std::thread([=] { _intel_kspace->compute_first(eflag, vflag); timer->stamp(Timer::KSPACE); } ); @@ -329,7 +329,7 @@ void VerletLRTIntel::run(int n) _kspace_done = 0; pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - if (kspace_compute_flag) + if (kspace_compute_flag) kspace_thread.join(); #endif @@ -367,7 +367,7 @@ void VerletLRTIntel::run(int n) } #if defined(_LMP_INTEL_LRT_PTHREAD) - if (run_cancelled) + if (run_cancelled) pthread_cancel(_kspace_thread); else { pthread_mutex_lock(&_kmutex); @@ -390,9 +390,9 @@ void * VerletLRTIntel::k_launch_loop(void *context) { VerletLRTIntel * const c = (VerletLRTIntel *)context; - if (c->kspace_compute_flag) + if (c->kspace_compute_flag) c->_intel_kspace->compute_first(c->eflag, c->vflag); - else + else c->_intel_kspace->compute_dummy(c->eflag, c->vflag); pthread_mutex_lock(&(c->_kmutex)); @@ -408,7 +408,7 @@ void * VerletLRTIntel::k_launch_loop(void *context) pthread_mutex_unlock(&(c->_kmutex)); for (int i = 0; 
i < n; i++) { - + if (c->kspace_compute_flag) { c->_intel_kspace->compute_first(c->eflag, c->vflag); c->timer->stamp(Timer::KSPACE); diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index a699c20796..0521b161c7 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -42,7 +42,7 @@ class VerletLRTIntel : public Verlet { VerletLRTIntel(class LAMMPS *, int, char **); virtual ~VerletLRTIntel(); virtual void init(); - virtual void setup(); + virtual void setup(int flag = 1); virtual void run(int); protected: diff --git a/src/USER-MANIFOLD/fix_nve_manifold_rattle.h b/src/USER-MANIFOLD/fix_nve_manifold_rattle.h index 71aa1aed9a..2bc821ab04 100644 --- a/src/USER-MANIFOLD/fix_nve_manifold_rattle.h +++ b/src/USER-MANIFOLD/fix_nve_manifold_rattle.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- Lammps - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MANIFOLD/fix_nvt_manifold_rattle.h b/src/USER-MANIFOLD/fix_nvt_manifold_rattle.h index a9d3e3122f..144cda3799 100644 --- a/src/USER-MANIFOLD/fix_nvt_manifold_rattle.h +++ b/src/USER-MANIFOLD/fix_nvt_manifold_rattle.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- Lammps - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MANIFOLD/manifold.h b/src/USER-MANIFOLD/manifold.h index b89e765a6e..f0d56ffd82 100644 --- a/src/USER-MANIFOLD/manifold.h +++ b/src/USER-MANIFOLD/manifold.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- 
---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MANIFOLD/manifold_factory.h b/src/USER-MANIFOLD/manifold_factory.h index a2f21861eb..7fdd0a6de5 100644 --- a/src/USER-MANIFOLD/manifold_factory.h +++ b/src/USER-MANIFOLD/manifold_factory.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- Lammps - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.h b/src/USER-MANIFOLD/manifold_gaussian_bump.h index f3401a4a33..f31e2a4bf4 100644 --- a/src/USER-MANIFOLD/manifold_gaussian_bump.h +++ b/src/USER-MANIFOLD/manifold_gaussian_bump.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/compute_cnp_atom.cpp b/src/USER-MISC/compute_cnp_atom.cpp index 89568c6731..f479486b79 100644 --- a/src/USER-MISC/compute_cnp_atom.cpp +++ b/src/USER-MISC/compute_cnp_atom.cpp @@ -50,7 +50,7 @@ enum{NCOMMON}; ComputeCNPAtom::ComputeCNPAtom(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg), - nearest(NULL), nnearest(NULL), cnpv(NULL) + list(NULL), nearest(NULL), nnearest(NULL), cnpv(NULL) { if (narg != 4) error->all(FLERR,"Illegal compute cnp/atom command"); diff --git a/src/USER-MISC/fix_filter_corotate.h b/src/USER-MISC/fix_filter_corotate.h index 3f8e8bba43..67e3fb4f01 100644 --- a/src/USER-MISC/fix_filter_corotate.h +++ 
b/src/USER-MISC/fix_filter_corotate.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/fix_ti_spring.h b/src/USER-MISC/fix_ti_spring.h index 57c0d3299a..13f1236a8a 100644 --- a/src/USER-MISC/fix_ti_spring.h +++ b/src/USER-MISC/fix_ti_spring.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/fix_ttm_mod.h b/src/USER-MISC/fix_ttm_mod.h index 3415baf5b1..21f6e57e04 100644 --- a/src/USER-MISC/fix_ttm_mod.h +++ b/src/USER-MISC/fix_ttm_mod.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/improper_distance.h b/src/USER-MISC/improper_distance.h index 6301e6897a..cff96c6d9d 100644 --- a/src/USER-MISC/improper_distance.h +++ b/src/USER-MISC/improper_distance.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_buck_mdf.h b/src/USER-MISC/pair_buck_mdf.h index c0ecceeea8..4f9bac6341 100644 
--- a/src/USER-MISC/pair_buck_mdf.h +++ b/src/USER-MISC/pair_buck_mdf.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_edip_multi.h b/src/USER-MISC/pair_edip_multi.h index e55916f79b..fd94594d93 100644 --- a/src/USER-MISC/pair_edip_multi.h +++ b/src/USER-MISC/pair_edip_multi.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_kolmogorov_crespi_z.h b/src/USER-MISC/pair_kolmogorov_crespi_z.h index 3a81e571ef..caa5dac868 100644 --- a/src/USER-MISC/pair_kolmogorov_crespi_z.h +++ b/src/USER-MISC/pair_kolmogorov_crespi_z.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_lennard_mdf.h b/src/USER-MISC/pair_lennard_mdf.h index 6fefe6fc3f..c4ffc80cd6 100644 --- a/src/USER-MISC/pair_lennard_mdf.h +++ b/src/USER-MISC/pair_lennard_mdf.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git 
a/src/USER-MISC/pair_lj_mdf.h b/src/USER-MISC/pair_lj_mdf.h index f49020bf2d..c6236a923c 100644 --- a/src/USER-MISC/pair_lj_mdf.h +++ b/src/USER-MISC/pair_lj_mdf.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_meam_spline.h b/src/USER-MISC/pair_meam_spline.h index 6200254674..d3554f056e 100644 --- a/src/USER-MISC/pair_meam_spline.h +++ b/src/USER-MISC/pair_meam_spline.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-MISC/pair_momb.h b/src/USER-MISC/pair_momb.h index 95b750cb2c..e08b81aa3e 100644 --- a/src/USER-MISC/pair_momb.h +++ b/src/USER-MISC/pair_momb.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-NETCDF/dump_netcdf.h b/src/USER-NETCDF/dump_netcdf.h index 036df3f058..b86f294d30 100644 --- a/src/USER-NETCDF/dump_netcdf.h +++ b/src/USER-NETCDF/dump_netcdf.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, 
sjplimp@sandia.gov diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.h b/src/USER-NETCDF/dump_netcdf_mpiio.h index 10b0e800d2..3ca52449a5 100644 --- a/src/USER-NETCDF/dump_netcdf_mpiio.h +++ b/src/USER-NETCDF/dump_netcdf_mpiio.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-OMP/fix_qeq_reax_omp.h b/src/USER-OMP/fix_qeq_reax_omp.h index 078ba3b9af..7565f0aff0 100644 --- a/src/USER-OMP/fix_qeq_reax_omp.h +++ b/src/USER-OMP/fix_qeq_reax_omp.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-OMP/pair_airebo_omp.cpp b/src/USER-OMP/pair_airebo_omp.cpp index 206e8e86e6..13df133585 100644 --- a/src/USER-OMP/pair_airebo_omp.cpp +++ b/src/USER-OMP/pair_airebo_omp.cpp @@ -1038,7 +1038,7 @@ double PairAIREBOOMP::bondorder_thr(int i, int j, double rij[3], double rijmag, double w21,dw21,r34[3],r34mag,cos234,w34,dw34; double cross321[3],cross234[3],prefactor,SpN; double fcijpc,fcikpc,fcjlpc,fcjkpc,fcilpc; - double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom; + double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa2,at2,cw,cwnum,cwnom; double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2; double dctik,dctjk,dctjl,dctij,dctji,dctil,rik2i,rjl2i,sink2i,sinl2i; double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil,dt1dij; @@ -1628,8 +1628,6 @@ double PairAIREBOOMP::bondorder_thr(int i, int j, double rij[3], double rijmag, aa = (prefactor*2.0*cw/cwnom)*w21*w34 * (1.0-tspjik)*(1.0-tspijl); - aaa1 = 
-prefactor*(1.0-square(om1234)) * - (1.0-tspjik)*(1.0-tspijl); aaa2 = -prefactor*(1.0-square(om1234)) * w21*w34; at2 = aa*cwnum; @@ -1879,7 +1877,7 @@ double PairAIREBOOMP::bondorderLJ_thr(int i, int j, double rij[3], double rijmag double w21,dw21,r34[3],r34mag,cos234,w34,dw34; double cross321[3],cross234[3],prefactor,SpN; double fcikpc,fcjlpc,fcjkpc,fcilpc; - double dt2dik[3],dt2djl[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom; + double dt2dik[3],dt2djl[3],aa,aaa2,at2,cw,cwnum,cwnom; double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2; double dctik,dctjk,dctjl,dctil,rik2i,rjl2i,sink2i,sinl2i; double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil; @@ -2572,8 +2570,6 @@ double PairAIREBOOMP::bondorderLJ_thr(int i, int j, double rij[3], double rijmag aa = (prefactor*2.0*cw/cwnom)*w21*w34 * (1.0-tspjik)*(1.0-tspijl); - aaa1 = -prefactor*(1.0-square(om1234)) * - (1.0-tspjik)*(1.0-tspijl); aaa2 = -prefactor*(1.0-square(om1234)) * w21*w34; at2 = aa*cwnum; diff --git a/src/USER-OMP/pair_comb_omp.cpp b/src/USER-OMP/pair_comb_omp.cpp index c776ff3026..2a0e6ceb23 100644 --- a/src/USER-OMP/pair_comb_omp.cpp +++ b/src/USER-OMP/pair_comb_omp.cpp @@ -484,7 +484,7 @@ double PairCombOMP::yasu_char(double *qf_fix, int &igroup) qfo_field(&params[iparam_ij],rsq1,iq,jq,fqji,fqjj); fqi += jq * fqij + fqji; -#if defined(_OPENMP) +#if defined(_OPENMP) && !defined(__NVCC__) #pragma omp atomic #endif qf[j] += (iq * fqij + fqjj); @@ -511,13 +511,13 @@ double PairCombOMP::yasu_char(double *qf_fix, int &igroup) qfo_short(&params[iparam_ij],i,nj,rsq1,iq,jq,fqij,fqjj); fqi += fqij; -#if defined(_OPENMP) +#if defined(_OPENMP) && !defined(__NVCC__) #pragma omp atomic #endif qf[j] += fqjj; } -#if defined(_OPENMP) +#if defined(_OPENMP) && !defined(__NVCC__) #pragma omp atomic #endif qf[i] += fqi; diff --git a/src/USER-OMP/pair_reaxc_omp.cpp b/src/USER-OMP/pair_reaxc_omp.cpp index 0fb24ed5f2..91fa3d38c7 100644 --- a/src/USER-OMP/pair_reaxc_omp.cpp +++ b/src/USER-OMP/pair_reaxc_omp.cpp @@ -572,17 +572,18 @@ void 
PairReaxCOMP::read_reax_forces(int vflag) void PairReaxCOMP::FindBond() { - int i, ii, j, pj, jtag, nj, jtmp, jj; - double bo_tmp, bo_cut, rij, rsq; - - bond_data *bo_ij; - bo_cut = 0.10; + const double bo_cut = 0.10; + int i; #if defined(_OPENMP) #pragma omp parallel for schedule(static) default(shared) \ - private(i, nj, pj, bo_ij, j, bo_tmp) + private(i) #endif for (i = 0; i < system->n; i++) { + int j, pj, nj; + double bo_tmp; + bond_data *bo_ij; + nj = 0; for( pj = Start_Index(i, lists); pj < End_Index(i, lists); ++pj ) { bo_ij = &( lists->select.bond_list[pj] ); diff --git a/src/USER-OMP/pair_reaxc_omp.h b/src/USER-OMP/pair_reaxc_omp.h index a5e077c309..156627ece0 100644 --- a/src/USER-OMP/pair_reaxc_omp.h +++ b/src/USER-OMP/pair_reaxc_omp.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-OMP/reaxc_bond_orders_omp.cpp b/src/USER-OMP/reaxc_bond_orders_omp.cpp index 222c00980e..85755a39a3 100644 --- a/src/USER-OMP/reaxc_bond_orders_omp.cpp +++ b/src/USER-OMP/reaxc_bond_orders_omp.cpp @@ -55,7 +55,6 @@ void Add_dBond_to_ForcesOMP( reax_system *system, int i, int pj, long reductionOffset = (system->N * tid); /* Virial Tallying variables */ - double f_scaler; rvec fi_tmp, fj_tmp, fk_tmp, delij, delji, delki, delkj, temp; /* Initializations */ @@ -229,14 +228,11 @@ void Add_dBond_to_Forces_NPTOMP( reax_system *system, int i, int pj, simulation_ ivec rel_box; int pk, k, j; - PairReaxCOMP *pair_reax_ptr = static_cast<PairReaxCOMP*>(system->pair_ptr); - #if defined(_OPENMP) int tid = omp_get_thread_num(); #else int tid = 0; #endif - ThrData *thr = pair_reax_ptr->getFixOMP()->get_thr(tid); long reductionOffset = (system->N * tid); /* Initializations */ @@ -430,12 +426,9 @@ void BOOMP( 
reax_system *system, control_params *control, simulation_data *data, #endif double p_lp1 = system->reax_param.gp.l[15]; - int num_bonds = 0; double p_boc1 = system->reax_param.gp.l[0]; double p_boc2 = system->reax_param.gp.l[1]; reax_list *bonds = (*lists) + BONDS; - int natoms = system->N; - int nthreads = control->nthreads; #if defined(_OPENMP) #pragma omp parallel default(shared) @@ -454,11 +447,6 @@ void BOOMP( reax_system *system, control_params *control, simulation_data *data, two_body_parameters *twbp; bond_order_data *bo_ij, *bo_ji; -#if defined(_OPENMP) - int tid = omp_get_thread_num(); -#else - int tid = 0; -#endif /* Calculate Deltaprime, Deltaprime_boc values */ #if defined(_OPENMP) #pragma omp for schedule(static) diff --git a/src/USER-OMP/reaxc_bonds_omp.cpp b/src/USER-OMP/reaxc_bonds_omp.cpp index dcf788a79c..9e16919007 100644 --- a/src/USER-OMP/reaxc_bonds_omp.cpp +++ b/src/USER-OMP/reaxc_bonds_omp.cpp @@ -49,14 +49,13 @@ void BondsOMP( reax_system *system, control_params *control, startTimeBase = MPI_Wtime(); #endif - int natoms = system->n; - int nthreads = control->nthreads; + const int natoms = system->n; reax_list *bonds = (*lists) + BONDS; - double gp3 = system->reax_param.gp.l[3]; - double gp4 = system->reax_param.gp.l[4]; - double gp7 = system->reax_param.gp.l[7]; - double gp10 = system->reax_param.gp.l[10]; - double gp37 = (int) system->reax_param.gp.l[37]; + const double gp3 = system->reax_param.gp.l[3]; + const double gp4 = system->reax_param.gp.l[4]; + const double gp7 = system->reax_param.gp.l[7]; + const double gp10 = system->reax_param.gp.l[10]; + const int gp37 = (int) system->reax_param.gp.l[37]; double total_Ebond = 0.0; #if defined(_OPENMP) @@ -66,9 +65,8 @@ void BondsOMP( reax_system *system, control_params *control, int i, j, pj; int start_i, end_i; int type_i, type_j; - double ebond, ebond_thr=0.0, pow_BOs_be2, exp_be12, CEbo; - double gp3, gp4, gp7, gp10, gp37; - double exphu, exphua1, exphub1, exphuov, hulpov, estriph, 
estriph_thr=0.0; + double ebond, pow_BOs_be2, exp_be12, CEbo; + double exphu, exphua1, exphub1, exphuov, hulpov, estriph; double decobdbo, decobdboua, decobdboub; single_body_parameters *sbp_i, *sbp_j; two_body_parameters *twbp; diff --git a/src/USER-OMP/reaxc_forces_omp.cpp b/src/USER-OMP/reaxc_forces_omp.cpp index 4e37dac38d..5e93f31125 100644 --- a/src/USER-OMP/reaxc_forces_omp.cpp +++ b/src/USER-OMP/reaxc_forces_omp.cpp @@ -265,7 +265,6 @@ void Validate_ListsOMP( reax_system *system, storage *workspace, reax_list **lis { int i, comp, Hindex; reax_list *bonds, *hbonds; - reallocate_data *realloc = &(workspace->realloc); double saferzone = system->saferzone; #if defined(_OPENMP) @@ -335,25 +334,21 @@ void Init_Forces_noQEq_OMP( reax_system *system, control_params *control, startTimeBase = MPI_Wtime(); #endif - int i, j, pi, pj; - int start_i, end_i, start_j, end_j; + int i, j, pj; + int start_i, end_i; int type_i, type_j; int ihb, jhb, ihb_top, jhb_top; - int local, flag; - double r_ij, cutoff; + double cutoff; single_body_parameters *sbp_i, *sbp_j; two_body_parameters *twbp; far_neighbor_data *nbr_pj; reax_atom *atom_i, *atom_j; - bond_data *ibond, *jbond; reax_list *far_nbrs = *lists + FAR_NBRS; reax_list *bonds = *lists + BONDS; reax_list *hbonds = *lists + HBONDS; int num_bonds = 0; int num_hbonds = 0; int btop_i = 0; - int btop_j = 0; - int renbr = (data->step-data->prev_steps) % control->reneighbor == 0; // We will use CdDeltaReduction as a temporary (double) buffer to accumulate total_bond_order // This is safe because CdDeltaReduction is currently zeroed and its accumulation doesn't start until BondsOMP() @@ -368,8 +363,8 @@ void Init_Forces_noQEq_OMP( reax_system *system, control_params *control, #if defined(_OPENMP) #pragma omp parallel default(shared) \ - private(i, atom_i, type_i, pi, start_i, end_i, sbp_i, btop_i, ibond, ihb, ihb_top, \ - j, atom_j, type_j, pj, start_j, end_j, sbp_j, nbr_pj, jbond, jhb, twbp) + private(i, atom_i, type_i, start_i, 
end_i, sbp_i, btop_i, ihb, ihb_top, \ + j, atom_j, type_j, pj, sbp_j, nbr_pj, jhb, twbp) #endif { @@ -417,7 +412,6 @@ void Init_Forces_noQEq_OMP( reax_system *system, control_params *control, // outside of critical section. // Start top portion of BOp() - int jj = nbr_pj->nbr; double C12, C34, C56; double BO, BO_s, BO_pi, BO_pi2; double bo_cut = control->bo_cut; @@ -602,7 +596,6 @@ void Compute_ForcesOMP( reax_system *system, control_params *control, reax_list **lists, output_controls *out_control, mpi_datatypes *mpi_data ) { - int qeq_flag; MPI_Comm comm = mpi_data->world; // Init Forces diff --git a/src/USER-OMP/reaxc_multi_body_omp.cpp b/src/USER-OMP/reaxc_multi_body_omp.cpp index acbe4ec268..ff8baa3c1a 100644 --- a/src/USER-OMP/reaxc_multi_body_omp.cpp +++ b/src/USER-OMP/reaxc_multi_body_omp.cpp @@ -50,16 +50,14 @@ void Atom_EnergyOMP( reax_system *system, control_params *control, #endif /* Initialize parameters */ - double p_lp1 = system->reax_param.gp.l[15]; - double p_lp3 = system->reax_param.gp.l[5]; - double p_ovun3 = system->reax_param.gp.l[32]; - double p_ovun4 = system->reax_param.gp.l[31]; - double p_ovun6 = system->reax_param.gp.l[6]; - double p_ovun7 = system->reax_param.gp.l[8]; - double p_ovun8 = system->reax_param.gp.l[9]; + const double p_lp3 = system->reax_param.gp.l[5]; + const double p_ovun3 = system->reax_param.gp.l[32]; + const double p_ovun4 = system->reax_param.gp.l[31]; + const double p_ovun6 = system->reax_param.gp.l[6]; + const double p_ovun7 = system->reax_param.gp.l[8]; + const double p_ovun8 = system->reax_param.gp.l[9]; - int natoms = system->n; - int nthreads = control->nthreads; + const int natoms = system->n; reax_list *bonds = (*lists) + BONDS; double total_Elp = 0.0; @@ -79,11 +77,11 @@ void Atom_EnergyOMP( reax_system *system, control_params *control, double exp_ovun2n, exp_ovun6, exp_ovun8; double inv_exp_ovun1, inv_exp_ovun2, inv_exp_ovun2n, inv_exp_ovun8; double e_un, CEunder1, CEunder2, CEunder3, CEunder4; - double 
eng_tmp, f_tmp; + double eng_tmp; double p_lp2, p_ovun2, p_ovun5; int numbonds; - single_body_parameters *sbp_i, *sbp_j; + single_body_parameters *sbp_i; two_body_parameters *twbp; bond_data *pbond; bond_order_data *bo_ij; diff --git a/src/USER-OMP/reaxc_nonbonded_omp.cpp b/src/USER-OMP/reaxc_nonbonded_omp.cpp index 38a6d9e860..c509be8a26 100644 --- a/src/USER-OMP/reaxc_nonbonded_omp.cpp +++ b/src/USER-OMP/reaxc_nonbonded_omp.cpp @@ -48,8 +48,6 @@ void vdW_Coulomb_Energy_OMP( reax_system *system, control_params *control, reax_list **lists, output_controls *out_control ) { int natoms = system->n; - int nthreads = control->nthreads; - long totalReductionSize = system->N * nthreads; reax_list *far_nbrs = (*lists) + FAR_NBRS; double p_vdW1 = system->reax_param.gp.l[28]; double p_vdW1i = 1.0 / p_vdW1; @@ -71,7 +69,8 @@ void vdW_Coulomb_Energy_OMP( reax_system *system, control_params *control, double tmp, r_ij, fn13, exp1, exp2; double Tap, dTap, dfn13, CEvd, CEclmb, de_core; double dr3gamij_1, dr3gamij_3; - double e_ele, e_ele_thr, e_vdW, e_vdW_thr, e_core, SMALL = 0.0001; + double e_ele, e_vdW, e_core; + const double SMALL = 0.0001; double e_lg, de_lg, r_ij5, r_ij6, re6; rvec temp, ext_press; two_body_parameters *twbp; @@ -92,7 +91,6 @@ void vdW_Coulomb_Energy_OMP( reax_system *system, control_params *control, system->pair_ptr->vatom, thr); e_core = 0; e_vdW = 0; - e_vdW_thr = 0; e_lg = 0; de_lg = 0.0; @@ -263,8 +261,6 @@ void Tabulated_vdW_Coulomb_Energy_OMP(reax_system *system,control_params *contro double SMALL = 0.0001; int natoms = system->n; reax_list *far_nbrs = (*lists) + FAR_NBRS; - int nthreads = control->nthreads; - long totalReductionSize = system->N * nthreads; double total_EvdW = 0.; double total_Eele = 0.; diff --git a/src/USER-OMP/reaxc_torsion_angles_omp.cpp b/src/USER-OMP/reaxc_torsion_angles_omp.cpp index 7a7e42ea30..4ede439ed4 100644 --- a/src/USER-OMP/reaxc_torsion_angles_omp.cpp +++ b/src/USER-OMP/reaxc_torsion_angles_omp.cpp @@ -69,8 +69,10 @@ 
void Torsion_AnglesOMP( reax_system *system, control_params *control, double total_Econ = 0; int nthreads = control->nthreads; +#if defined(_OPENMP) #pragma omp parallel default(shared) reduction(+: total_Etor, total_Econ) - { +#endif + { int i, j, k, l, pi, pj, pk, pl, pij, plk; int type_i, type_j, type_k, type_l; int start_j, end_j; @@ -125,7 +127,9 @@ void Torsion_AnglesOMP( reax_system *system, control_params *control, system->N, system->pair_ptr->eatom, system->pair_ptr->vatom, thr); +#if defined(_OPENMP) #pragma omp for schedule(static) +#endif for (j = 0; j < system->N; ++j) { start_j = Start_Index(j, bonds); end_j = End_Index(j, bonds); @@ -137,7 +141,9 @@ void Torsion_AnglesOMP( reax_system *system, control_params *control, } } +#if defined(_OPENMP) #pragma omp for schedule(dynamic,50) +#endif for (j = 0; j < natoms; ++j) { type_j = system->my_atoms[j].type; Delta_j = workspace->Delta_boc[j]; diff --git a/src/USER-OMP/reaxc_valence_angles_omp.cpp b/src/USER-OMP/reaxc_valence_angles_omp.cpp index 869a325d6e..d6f0962020 100644 --- a/src/USER-OMP/reaxc_valence_angles_omp.cpp +++ b/src/USER-OMP/reaxc_valence_angles_omp.cpp @@ -124,8 +124,9 @@ void Valence_AnglesOMP( reax_system *system, control_params *control, int nthreads = control->nthreads; int num_thb_intrs = 0; int TWICE = 2; - +#if defined(_OPENMP) #pragma omp parallel default(shared) reduction(+:total_Eang, total_Epen, total_Ecoa, num_thb_intrs) +#endif { int i, j, pi, k, pk, t; int type_i, type_j, type_k; @@ -180,7 +181,9 @@ void Valence_AnglesOMP( reax_system *system, control_params *control, const int per_thread = thb_intrs->num_intrs / nthreads; +#if defined(_OPENMP) #pragma omp for schedule(dynamic,50) +#endif for (j = 0; j < system->N; ++j) { type_j = system->my_atoms[j].type; _my_offset[j] = 0; @@ -251,11 +254,14 @@ void Valence_AnglesOMP( reax_system *system, control_params *control, } // for(j) // Wait for all threads to finish counting angles +#if defined(_OPENMP) && !defined(__NVCC__) 
#pragma omp barrier - +#endif // Master thread uses angle counts to compute offsets // This can be threaded +#if defined(_OPENMP) && !defined(__NVCC__) #pragma omp master +#endif { int current_count = 0; int m = _my_offset[0]; @@ -269,12 +275,15 @@ void Valence_AnglesOMP( reax_system *system, control_params *control, } // All threads wait till master thread finished computing offsets +#if defined(_OPENMP) && !defined(__NVCC__) #pragma omp barrier - +#endif // Original loop, but now using precomputed offsets // Safe to use all threads available, regardless of threads tasked above // We also now skip over atoms that have no angles assigned +#if defined(_OPENMP) #pragma omp for schedule(dynamic,50)//(dynamic,chunksize)//(guided) +#endif for (j = 0; j < system->N; ++j) { // Ray: the first one with system->N type_j = system->my_atoms[j].type; if(type_j < 0) continue; diff --git a/src/USER-QTB/fix_qbmsst.h b/src/USER-QTB/fix_qbmsst.h index 3484076abf..cec54e40d8 100644 --- a/src/USER-QTB/fix_qbmsst.h +++ b/src/USER-QTB/fix_qbmsst.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-QTB/fix_qtb.h b/src/USER-QTB/fix_qtb.h index e2b7634537..5537ecb56e 100644 --- a/src/USER-QTB/fix_qtb.h +++ b/src/USER-QTB/fix_qtb.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-QUIP/pair_quip.h b/src/USER-QUIP/pair_quip.h index 15e143aee7..f86df015ea 100644 --- a/src/USER-QUIP/pair_quip.h +++ 
b/src/USER-QUIP/pair_quip.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-REAXC/fix_reaxc_species.cpp b/src/USER-REAXC/fix_reaxc_species.cpp index 23a37455e8..62b3bff93e 100644 --- a/src/USER-REAXC/fix_reaxc_species.cpp +++ b/src/USER-REAXC/fix_reaxc_species.cpp @@ -68,7 +68,7 @@ FixReaxCSpecies::FixReaxCSpecies(LAMMPS *lmp, int narg, char **arg) : nrepeat = atoi(arg[4]); global_freq = nfreq = atoi(arg[5]); - comm_forward = 5; + comm_forward = 4; if (nevery <= 0 || nrepeat <= 0 || nfreq <= 0) error->all(FLERR,"Illegal fix reax/c/species command"); @@ -133,12 +133,10 @@ FixReaxCSpecies::FixReaxCSpecies(LAMMPS *lmp, int narg, char **arg) : } x0 = NULL; - PBCconnected = NULL; clusterID = NULL; int ntmp = 1; memory->create(x0,ntmp,"reax/c/species:x0"); - memory->create(PBCconnected,ntmp,"reax/c/species:PBCconnected"); memory->create(clusterID,ntmp,"reax/c/species:clusterID"); vector_atom = clusterID; @@ -251,7 +249,6 @@ FixReaxCSpecies::~FixReaxCSpecies() memory->destroy(ele); memory->destroy(BOCut); memory->destroy(clusterID); - memory->destroy(PBCconnected); memory->destroy(x0); memory->destroy(nd); @@ -458,16 +455,13 @@ void FixReaxCSpecies::Output_ReaxC_Bonds(bigint ntimestep, FILE *fp) if (atom->nmax > nmax) { nmax = atom->nmax; memory->destroy(x0); - memory->destroy(PBCconnected); memory->destroy(clusterID); memory->create(x0,nmax,"reax/c/species:x0"); - memory->create(PBCconnected,nmax,"reax/c/species:PBCconnected"); memory->create(clusterID,nmax,"reax/c/species:clusterID"); vector_atom = clusterID; } for (int i = 0; i < nmax; i++) { - PBCconnected[i] = 0; x0[i].x = x0[i].y = x0[i].z = 0.0; } @@ -520,8 +514,6 @@ void FixReaxCSpecies::FindMolecule () int *ilist; double 
bo_tmp,bo_cut; double **spec_atom = f_SPECBOND->array_atom; - const double * const * const x = atom->x; - const int nlocal = atom->nlocal; inum = reaxc->list->inum; ilist = reaxc->list->ilist; @@ -559,7 +551,6 @@ void FixReaxCSpecies::FindMolecule () if (!(mask[j] & groupbit)) continue; if (clusterID[i] == clusterID[j] - && PBCconnected[i] == PBCconnected[j] && x0[i].x == x0[j].x && x0[i].y == x0[j].y && x0[i].z == x0[j].z) continue; @@ -570,21 +561,7 @@ void FixReaxCSpecies::FindMolecule () if (bo_tmp > bo_cut) { clusterID[i] = clusterID[j] = MIN(clusterID[i], clusterID[j]); - PBCconnected[i] = PBCconnected[j] = MAX(PBCconnected[i], PBCconnected[j]); x0[i] = x0[j] = chAnchor(x0[i], x0[j]); - // spec_atom[][] contains filtered coordinates only for local atoms, - // so we have to use unfiltered ones for ghost atoms. - if (j < nlocal) { - if ((fabs(spec_atom[i][1] - spec_atom[j][1]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][2] - spec_atom[j][2]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][3] - spec_atom[j][3]) > reaxc->control->bond_cut)) - PBCconnected[i] = PBCconnected[j] = 1; - } else { - if ((fabs(spec_atom[i][1] - x[j][1]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][2] - x[j][2]) > reaxc->control->bond_cut) - || (fabs(spec_atom[i][3] - x[j][3]) > reaxc->control->bond_cut)) - PBCconnected[i] = PBCconnected[j] = 1; - } done = 0; } } @@ -878,20 +855,18 @@ void FixReaxCSpecies::WritePos(int Nmole, int Nspec) Name[itype] ++; count ++; avq += spec_atom[i][0]; - if (PBCconnected[i]) { - if ((x0[i].x - spec_atom[i][1]) > halfbox[0]) - spec_atom[i][1] += box[0]; - if ((spec_atom[i][1] - x0[i].x) > halfbox[0]) - spec_atom[i][1] -= box[0]; - if ((x0[i].y - spec_atom[i][2]) > halfbox[1]) - spec_atom[i][2] += box[1]; - if ((spec_atom[i][2] - x0[i].y) > halfbox[1]) - spec_atom[i][2] -= box[1]; - if ((x0[i].z - spec_atom[i][3]) > halfbox[2]) - spec_atom[i][3] += box[2]; - if ((spec_atom[i][3] - x0[i].z) > halfbox[2]) - spec_atom[i][3] -= box[2]; - } + 
if ((x0[i].x - spec_atom[i][1]) > halfbox[0]) + spec_atom[i][1] += box[0]; + if ((spec_atom[i][1] - x0[i].x) > halfbox[0]) + spec_atom[i][1] -= box[0]; + if ((x0[i].y - spec_atom[i][2]) > halfbox[1]) + spec_atom[i][2] += box[1]; + if ((spec_atom[i][2] - x0[i].y) > halfbox[1]) + spec_atom[i][2] -= box[1]; + if ((x0[i].z - spec_atom[i][3]) > halfbox[2]) + spec_atom[i][3] += box[2]; + if ((spec_atom[i][3] - x0[i].z) > halfbox[2]) + spec_atom[i][3] -= box[2]; for (n = 0; n < 3; n++) avx[n] += spec_atom[i][n+1]; } @@ -977,11 +952,10 @@ int FixReaxCSpecies::pack_forward_comm(int n, int *list, double *buf, for (i = 0; i < n; i++) { j = list[i]; buf[m] = clusterID[j]; - buf[m+1] = (double)PBCconnected[j]; - buf[m+2] = x0[j].x; - buf[m+3] = x0[j].y; - buf[m+4] = x0[j].z; - m += 5; + buf[m+1] = x0[j].x; + buf[m+2] = x0[j].y; + buf[m+3] = x0[j].z; + m += 4; } return m; } @@ -996,11 +970,10 @@ void FixReaxCSpecies::unpack_forward_comm(int n, int first, double *buf) last = first + n; for (i = first; i < last; i++) { clusterID[i] = buf[m]; - PBCconnected[i] = (int)buf[m+1]; - x0[i].x = buf[m+2]; - x0[i].y = buf[m+3]; - x0[i].z = buf[m+4]; - m += 5; + x0[i].x = buf[m+1]; + x0[i].y = buf[m+2]; + x0[i].z = buf[m+3]; + m += 4; } } @@ -1010,7 +983,7 @@ double FixReaxCSpecies::memory_usage() { double bytes; - bytes = 5*nmax*sizeof(double); // clusterID + PBCconnected + x0 + bytes = 4*nmax*sizeof(double); // clusterID + x0 return bytes; } diff --git a/src/USER-REAXC/fix_reaxc_species.h b/src/USER-REAXC/fix_reaxc_species.h index 563a10f39d..23a470fd0a 100644 --- a/src/USER-REAXC/fix_reaxc_species.h +++ b/src/USER-REAXC/fix_reaxc_species.h @@ -52,7 +52,6 @@ class FixReaxCSpecies : public Fix { int Nmoltype, vector_nmole, vector_nspec; int *Name, *MolName, *NMol, *nd, *MolType, *molmap; double *clusterID; - int *PBCconnected; AtomCoord *x0; double bg_cut; diff --git a/src/USER-SMD/atom_vec_smd.h b/src/USER-SMD/atom_vec_smd.h index cea9a31f86..34fdfc1f76 100644 --- 
a/src/USER-SMD/atom_vec_smd.h +++ b/src/USER-SMD/atom_vec_smd.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_contact_radius.h b/src/USER-SMD/compute_smd_contact_radius.h index 46034f2f11..f22dce1bab 100644 --- a/src/USER-SMD/compute_smd_contact_radius.h +++ b/src/USER-SMD/compute_smd_contact_radius.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_damage.h b/src/USER-SMD/compute_smd_damage.h index 1259788ec4..c8447872c7 100644 --- a/src/USER-SMD/compute_smd_damage.h +++ b/src/USER-SMD/compute_smd_damage.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_hourglass_error.h b/src/USER-SMD/compute_smd_hourglass_error.h index d4e39ce25f..a6d1d1a1e2 100644 --- a/src/USER-SMD/compute_smd_hourglass_error.h +++ b/src/USER-SMD/compute_smd_hourglass_error.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_internal_energy.h b/src/USER-SMD/compute_smd_internal_energy.h index 0754f6fe6c..fbccfbfb7e 100644 --- a/src/USER-SMD/compute_smd_internal_energy.h +++ b/src/USER-SMD/compute_smd_internal_energy.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git 
a/src/USER-SMD/compute_smd_plastic_strain.h b/src/USER-SMD/compute_smd_plastic_strain.h index cdc322e4cf..d2e64e31a1 100644 --- a/src/USER-SMD/compute_smd_plastic_strain.h +++ b/src/USER-SMD/compute_smd_plastic_strain.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_plastic_strain_rate.h b/src/USER-SMD/compute_smd_plastic_strain_rate.h index efc4f0c67c..03445e92f8 100644 --- a/src/USER-SMD/compute_smd_plastic_strain_rate.h +++ b/src/USER-SMD/compute_smd_plastic_strain_rate.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_rho.h b/src/USER-SMD/compute_smd_rho.h index ce749c6466..35dfdf8e91 100644 --- a/src/USER-SMD/compute_smd_rho.h +++ b/src/USER-SMD/compute_smd_rho.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_defgrad.h b/src/USER-SMD/compute_smd_tlsph_defgrad.h index 654403e7a3..5dfa502991 100644 --- a/src/USER-SMD/compute_smd_tlsph_defgrad.h +++ b/src/USER-SMD/compute_smd_tlsph_defgrad.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_dt.h b/src/USER-SMD/compute_smd_tlsph_dt.h index 16969a05be..09bf6c9727 100644 --- a/src/USER-SMD/compute_smd_tlsph_dt.h +++ b/src/USER-SMD/compute_smd_tlsph_dt.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ 
-*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_num_neighs.h b/src/USER-SMD/compute_smd_tlsph_num_neighs.h index 77e4c5838c..da649fbce2 100644 --- a/src/USER-SMD/compute_smd_tlsph_num_neighs.h +++ b/src/USER-SMD/compute_smd_tlsph_num_neighs.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_shape.h b/src/USER-SMD/compute_smd_tlsph_shape.h index 167e2b67e4..193657870c 100644 --- a/src/USER-SMD/compute_smd_tlsph_shape.h +++ b/src/USER-SMD/compute_smd_tlsph_shape.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_strain.h b/src/USER-SMD/compute_smd_tlsph_strain.h index 199190c8b9..1294af2f45 100644 --- a/src/USER-SMD/compute_smd_tlsph_strain.h +++ b/src/USER-SMD/compute_smd_tlsph_strain.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_strain_rate.h b/src/USER-SMD/compute_smd_tlsph_strain_rate.h index 9924f5e1cc..cc4ed9f5ee 100644 --- a/src/USER-SMD/compute_smd_tlsph_strain_rate.h +++ b/src/USER-SMD/compute_smd_tlsph_strain_rate.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_tlsph_stress.h b/src/USER-SMD/compute_smd_tlsph_stress.h index 3a8f6610b1..bf9079bb4f 100644 --- 
a/src/USER-SMD/compute_smd_tlsph_stress.h +++ b/src/USER-SMD/compute_smd_tlsph_stress.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_triangle_mesh_vertices.h b/src/USER-SMD/compute_smd_triangle_mesh_vertices.h index 1a4ff295bb..54c6055b98 100644 --- a/src/USER-SMD/compute_smd_triangle_mesh_vertices.h +++ b/src/USER-SMD/compute_smd_triangle_mesh_vertices.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_ulsph_effm.h b/src/USER-SMD/compute_smd_ulsph_effm.h index 3c2abd851d..68981fe76d 100644 --- a/src/USER-SMD/compute_smd_ulsph_effm.h +++ b/src/USER-SMD/compute_smd_ulsph_effm.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_ulsph_num_neighs.h b/src/USER-SMD/compute_smd_ulsph_num_neighs.h index 5af3bd9302..57340f01b6 100644 --- a/src/USER-SMD/compute_smd_ulsph_num_neighs.h +++ b/src/USER-SMD/compute_smd_ulsph_num_neighs.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_ulsph_strain.h b/src/USER-SMD/compute_smd_ulsph_strain.h index 2e266f4e25..a5796f34e2 100644 --- a/src/USER-SMD/compute_smd_ulsph_strain.h +++ b/src/USER-SMD/compute_smd_ulsph_strain.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * 
* *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_ulsph_strain_rate.h b/src/USER-SMD/compute_smd_ulsph_strain_rate.h index d2d43ef972..fc6df758e6 100644 --- a/src/USER-SMD/compute_smd_ulsph_strain_rate.h +++ b/src/USER-SMD/compute_smd_ulsph_strain_rate.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_ulsph_stress.h b/src/USER-SMD/compute_smd_ulsph_stress.h index c962449703..4e27a51723 100644 --- a/src/USER-SMD/compute_smd_ulsph_stress.h +++ b/src/USER-SMD/compute_smd_ulsph_stress.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/compute_smd_vol.h b/src/USER-SMD/compute_smd_vol.h index e946ed85ca..5525ce57cb 100644 --- a/src/USER-SMD/compute_smd_vol.h +++ b/src/USER-SMD/compute_smd_vol.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_adjust_dt.h b/src/USER-SMD/fix_smd_adjust_dt.h index 3b96d76d76..d7d8c922f2 100644 --- a/src/USER-SMD/fix_smd_adjust_dt.h +++ b/src/USER-SMD/fix_smd_adjust_dt.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_integrate_tlsph.h b/src/USER-SMD/fix_smd_integrate_tlsph.h index 7119f8d919..ca047f2dfd 100644 --- a/src/USER-SMD/fix_smd_integrate_tlsph.h +++ b/src/USER-SMD/fix_smd_integrate_tlsph.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* 
-*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_integrate_ulsph.h b/src/USER-SMD/fix_smd_integrate_ulsph.h index 19ae31a59e..ea4f46ce53 100644 --- a/src/USER-SMD/fix_smd_integrate_ulsph.h +++ b/src/USER-SMD/fix_smd_integrate_ulsph.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_move_triangulated_surface.h b/src/USER-SMD/fix_smd_move_triangulated_surface.h index ce4eaed88e..c851d490c5 100644 --- a/src/USER-SMD/fix_smd_move_triangulated_surface.h +++ b/src/USER-SMD/fix_smd_move_triangulated_surface.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_setvel.h b/src/USER-SMD/fix_smd_setvel.h index 9e5fe642eb..b987a56f6c 100644 --- a/src/USER-SMD/fix_smd_setvel.h +++ b/src/USER-SMD/fix_smd_setvel.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_tlsph_reference_configuration.h b/src/USER-SMD/fix_smd_tlsph_reference_configuration.h index ede06151ee..3ff693e9bb 100644 --- a/src/USER-SMD/fix_smd_tlsph_reference_configuration.h +++ b/src/USER-SMD/fix_smd_tlsph_reference_configuration.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/fix_smd_wall_surface.h b/src/USER-SMD/fix_smd_wall_surface.h index 8bd7002a9e..a32319f48f 100644 --- 
a/src/USER-SMD/fix_smd_wall_surface.h +++ b/src/USER-SMD/fix_smd_wall_surface.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-SMD/pair_smd_hertz.h b/src/USER-SMD/pair_smd_hertz.h index 6b40b6bb5c..0866ef7486 100644 --- a/src/USER-SMD/pair_smd_hertz.h +++ b/src/USER-SMD/pair_smd_hertz.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/pair_smd_tlsph.h b/src/USER-SMD/pair_smd_tlsph.h index 17db11b816..4c9db9209b 100644 --- a/src/USER-SMD/pair_smd_tlsph.h +++ b/src/USER-SMD/pair_smd_tlsph.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/pair_smd_triangulated_surface.h b/src/USER-SMD/pair_smd_triangulated_surface.h index 6332313646..c1eba7804e 100644 --- a/src/USER-SMD/pair_smd_triangulated_surface.h +++ b/src/USER-SMD/pair_smd_triangulated_surface.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/pair_smd_ulsph.h b/src/USER-SMD/pair_smd_ulsph.h index 40ccc37e93..032079072e 100644 --- a/src/USER-SMD/pair_smd_ulsph.h +++ b/src/USER-SMD/pair_smd_ulsph.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * 
diff --git a/src/USER-SMD/smd_kernels.h b/src/USER-SMD/smd_kernels.h index ba40699fc7..6621881f4b 100644 --- a/src/USER-SMD/smd_kernels.h +++ b/src/USER-SMD/smd_kernels.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/smd_material_models.h b/src/USER-SMD/smd_material_models.h index 858c5bbbdf..c6b6f8d94c 100644 --- a/src/USER-SMD/smd_material_models.h +++ b/src/USER-SMD/smd_material_models.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMD/smd_math.h b/src/USER-SMD/smd_math.h index 5352cfccf9..b7bf9c112d 100644 --- a/src/USER-SMD/smd_math.h +++ b/src/USER-SMD/smd_math.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- * * *** Smooth Mach Dynamics *** * diff --git a/src/USER-SMTBQ/pair_smtbq.h b/src/USER-SMTBQ/pair_smtbq.h index 49fbe8768b..25dfe98888 100644 --- a/src/USER-SMTBQ/pair_smtbq.h +++ b/src/USER-SMTBQ/pair_smtbq.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/USER-TALLY/compute_heat_flux_tally.cpp b/src/USER-TALLY/compute_heat_flux_tally.cpp index 65f57b7678..b366b92be3 100644 --- a/src/USER-TALLY/compute_heat_flux_tally.cpp +++ b/src/USER-TALLY/compute_heat_flux_tally.cpp @@ -246,7 +246,7 @@ void ComputeHeatFluxTally::compute_vector() double ke_i; if (rmass) ke_i = pfactor * rmass[i]; - else ke_i *= 
pfactor * mass[type[i]]; + else ke_i = pfactor * mass[type[i]]; ke_i *= (vi[0]*vi[0] + vi[1]*vi[1] + vi[2]*vi[2]); ke_i += eatom[i]; diff --git a/src/USER-VTK/dump_vtk.h b/src/USER-VTK/dump_vtk.h index 603ca114ba..8df14c7f34 100644 --- a/src/USER-VTK/dump_vtk.h +++ b/src/USER-VTK/dump_vtk.h @@ -1,4 +1,4 @@ -/* ---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/atom.cpp b/src/atom.cpp index 545efbd392..173a77e586 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -40,6 +40,10 @@ #include "memory.h" #include "error.h" +#ifdef LMP_USER_INTEL +#include "neigh_request.h" +#endif + using namespace LAMMPS_NS; using namespace MathConst; @@ -1882,6 +1886,53 @@ void Atom::setup_sort_bins() bininvy = nbiny / (bboxhi[1]-bboxlo[1]); bininvz = nbinz / (bboxhi[2]-bboxlo[2]); + #ifdef LMP_USER_INTEL + int intel_neigh = 0; + if (neighbor->nrequest) { + if (neighbor->requests[0]->intel) intel_neigh = 1; + } else if (neighbor->old_nrequest) + if (neighbor->old_requests[0]->intel) intel_neigh = 1; + if (intel_neigh && userbinsize == 0.0) { + if (neighbor->binsizeflag) bininv = 1.0/neighbor->binsize_user; + + double nx_low = neighbor->bboxlo[0]; + double ny_low = neighbor->bboxlo[1]; + double nz_low = neighbor->bboxlo[2]; + double nxbbox = neighbor->bboxhi[0] - nx_low; + double nybbox = neighbor->bboxhi[1] - ny_low; + double nzbbox = neighbor->bboxhi[2] - nz_low; + int nnbinx = static_cast (nxbbox * bininv); + int nnbiny = static_cast (nybbox * bininv); + int nnbinz = static_cast (nzbbox * bininv); + if (domain->dimension == 2) nnbinz = 1; + + if (nnbinx == 0) nnbinx = 1; + if (nnbiny == 0) nnbiny = 1; + if (nnbinz == 0) nnbinz = 1; + + double binsizex = nxbbox/nnbinx; + double binsizey = nybbox/nnbiny; + double binsizez = 
nzbbox/nnbinz; + + bininvx = 1.0 / binsizex; + bininvy = 1.0 / binsizey; + bininvz = 1.0 / binsizez; + + int lxo = (bboxlo[0] - nx_low) * bininvx; + int lyo = (bboxlo[1] - ny_low) * bininvy; + int lzo = (bboxlo[2] - nz_low) * bininvz; + bboxlo[0] = nx_low + static_cast(lxo) / bininvx; + bboxlo[1] = ny_low + static_cast(lyo) / bininvy; + bboxlo[2] = nz_low + static_cast(lzo) / bininvz; + nbinx = static_cast((bboxhi[0] - bboxlo[0]) * bininvx) + 1; + nbiny = static_cast((bboxhi[1] - bboxlo[1]) * bininvy) + 1; + nbinz = static_cast((bboxhi[2] - bboxlo[2]) * bininvz) + 1; + bboxhi[0] = bboxlo[0] + static_cast(nbinx) / bininvx; + bboxhi[1] = bboxlo[1] + static_cast(nbiny) / bininvy; + bboxhi[2] = bboxlo[2] + static_cast(nbinz) / bininvz; + } + #endif + if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); diff --git a/src/compute_cna_atom.cpp b/src/compute_cna_atom.cpp index 9680921e5f..bd24e06cae 100644 --- a/src/compute_cna_atom.cpp +++ b/src/compute_cna_atom.cpp @@ -43,7 +43,7 @@ enum{NCOMMON,NBOND,MAXBOND,MINBOND}; ComputeCNAAtom::ComputeCNAAtom(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg), - nearest(NULL), nnearest(NULL), pattern(NULL) + list(NULL), nearest(NULL), nnearest(NULL), pattern(NULL) { if (narg != 4) error->all(FLERR,"Illegal compute cna/atom command"); diff --git a/src/compute_pair_local.cpp b/src/compute_pair_local.cpp index 4595175503..adac486bef 100644 --- a/src/compute_pair_local.cpp +++ b/src/compute_pair_local.cpp @@ -126,11 +126,15 @@ void ComputePairLocal::init() " requested by compute pair/local"); // need an occasional half neighbor list + // set size to same value as request made by force->pair + // this should enable it to always be a copy list (e.g. 
for granular pstyle) int irequest = neighbor->request(this,instance_me); neighbor->requests[irequest]->pair = 0; neighbor->requests[irequest]->compute = 1; neighbor->requests[irequest]->occasional = 1; + NeighRequest *pairrequest = neighbor->find_request((void *) force->pair); + if (pairrequest) neighbor->requests[irequest]->size = pairrequest->size; } /* ---------------------------------------------------------------------- */ diff --git a/src/compute_property_local.cpp b/src/compute_property_local.cpp index 90faa88921..27b31979c9 100644 --- a/src/compute_property_local.cpp +++ b/src/compute_property_local.cpp @@ -280,12 +280,16 @@ void ComputePropertyLocal::init() } // for NEIGH/PAIR need an occasional half neighbor list + // set size to same value as request made by force->pair + // this should enable it to always be a copy list (e.g. for granular pstyle) if (kindflag == NEIGH || kindflag == PAIR) { int irequest = neighbor->request(this,instance_me); neighbor->requests[irequest]->pair = 0; neighbor->requests[irequest]->compute = 1; neighbor->requests[irequest]->occasional = 1; + NeighRequest *pairrequest = neighbor->find_request((void *) force->pair); + if (pairrequest) neighbor->requests[irequest]->size = pairrequest->size; } // do initial memory allocation so that memory_usage() is correct diff --git a/src/dump.cpp b/src/dump.cpp index f8896c8fee..44098298ba 100644 --- a/src/dump.cpp +++ b/src/dump.cpp @@ -30,9 +30,12 @@ using namespace LAMMPS_NS; +#if defined(LMP_QSORT) // allocate space for static class variable - Dump *Dump::dumpptr; +#else +#include "mergesort.h" +#endif #define BIG 1.0e20 #define EPSILON 1.0e-6 @@ -82,7 +85,7 @@ Dump::Dump(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) buffer_flag = 0; padflag = 0; pbcflag = 0; - + maxbuf = maxids = maxsort = maxproc = 0; buf = bufsort = NULL; ids = idsort = NULL; @@ -168,13 +171,13 @@ Dump::~Dump() delete irregular; memory->destroy(sbuf); - + if (pbcflag) { memory->destroy(xpbc); 
memory->destroy(vpbc); memory->destroy(imagepbc); } - + if (multiproc) MPI_Comm_free(&clustercomm); // XTC style sets fp to NULL since it closes file in its destructor @@ -275,7 +278,7 @@ void Dump::init() } // preallocation for PBC copies if requested - + if (pbcflag && atom->nlocal > maxpbc) pbc_allocate(); } @@ -388,7 +391,7 @@ void Dump::write() atom->image = imagepbc; domain->pbc(); } - + // pack my data into buf // if sorting on IDs also request ID list from pack() // sort buf as needed @@ -689,6 +692,7 @@ void Dump::sort() index[idsort[i]-idlo] = i; } +#if defined(LMP_QSORT) if (!reorderflag) { dumpptr = this; for (i = 0; i < nme; i++) index[i] = i; @@ -696,6 +700,14 @@ void Dump::sort() else if (sortorder == ASCEND) qsort(index,nme,sizeof(int),bufcompare); else qsort(index,nme,sizeof(int),bufcompare_reverse); } +#else + if (!reorderflag) { + for (i = 0; i < nme; i++) index[i] = i; + if (sortcol == 0) merge_sort(index,nme,(void *)this,idcompare); + else if (sortorder == ASCEND) merge_sort(index,nme,(void *)this,bufcompare); + else merge_sort(index,nme,(void *)this,bufcompare_reverse); + } +#endif // reset buf size and maxbuf to largest of any post-sort nme values // this insures proc 0 can receive everyone's info @@ -716,6 +728,8 @@ void Dump::sort() memcpy(&buf[i*size_one],&bufsort[index[i]*size_one],nbytes); } +#if defined(LMP_QSORT) + /* ---------------------------------------------------------------------- compare two atom IDs called via qsort() in sort() method @@ -776,6 +790,65 @@ int Dump::bufcompare_reverse(const void *pi, const void *pj) return 0; } +#else + +/* ---------------------------------------------------------------------- + compare two atom IDs + called via merge_sort() in sort() method +------------------------------------------------------------------------- */ + +int Dump::idcompare(const int i, const int j, void *ptr) +{ + tagint *idsort = ((Dump *)ptr)->idsort; + if (idsort[i] < idsort[j]) return -1; + else if (idsort[i] > idsort[j]) 
return 1; + else return 0; +} + +/* ---------------------------------------------------------------------- + compare two buffer values with size_one stride + called via merge_sort() in sort() method + sort in ASCENDing order +------------------------------------------------------------------------- */ + +int Dump::bufcompare(const int i, const int j, void *ptr) +{ + Dump *dptr = (Dump *) ptr; + double *bufsort = dptr->bufsort; + const int size_one = dptr->size_one; + const int sortcolm1 = dptr->sortcolm1; + + const int ii=i*size_one + sortcolm1; + const int jj=j*size_one + sortcolm1; + + if (bufsort[ii] < bufsort[jj]) return -1; + else if (bufsort[ii] > bufsort[jj]) return 1; + else return 0; +} + +/* ---------------------------------------------------------------------- + compare two buffer values with size_one stride + called via merge_sort() in sort() method + sort in DESCENDing order +------------------------------------------------------------------------- */ + +int Dump::bufcompare_reverse(const int i, const int j, void *ptr) +{ + Dump *dptr = (Dump *) ptr; + double *bufsort = dptr->bufsort; + const int size_one = dptr->size_one; + const int sortcolm1 = dptr->sortcolm1; + + const int ii=i*size_one + sortcolm1; + const int jj=j*size_one + sortcolm1; + + if (bufsort[ii] < bufsort[jj]) return 1; + else if (bufsort[ii] > bufsort[jj]) return -1; + else return 0; +} + +#endif + /* ---------------------------------------------------------------------- process params common to all dumps here if unknown param, call modify_param specific to the dump diff --git a/src/dump.h b/src/dump.h index c4d9335201..a5582ce5a5 100644 --- a/src/dump.h +++ b/src/dump.h @@ -33,9 +33,10 @@ class Dump : protected Pointers { int comm_forward; // size of forward communication (0 if none) int comm_reverse; // size of reverse communication (0 if none) +#if defined(LMP_QSORT) // static variable across all Dump objects - static Dump *dumpptr; // holds a ptr to Dump currently being used 
+#endif Dump(class LAMMPS *, int, char **); virtual ~Dump(); @@ -132,11 +133,17 @@ class Dump : protected Pointers { virtual int convert_string(int, double *) {return 0;} virtual void write_data(int, double *) = 0; void pbc_allocate(); - + void sort(); +#if defined(LMP_QSORT) static int idcompare(const void *, const void *); static int bufcompare(const void *, const void *); static int bufcompare_reverse(const void *, const void *); +#else + static int idcompare(const int, const int, void *); + static int bufcompare(const int, const int, void *); + static int bufcompare_reverse(const int, const int, void *); +#endif }; } diff --git a/src/force.cpp b/src/force.cpp index 3dd28cc710..33e6630406 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -53,6 +53,8 @@ Force::Force(LAMMPS *lmp) : Pointers(lmp) special_extra = 0; dielectric = 1.0; + qqr2e_lammps_real = 332.06371; // these constants are toggled + qqr2e_charmm_real = 332.0716; // by new CHARMM pair styles pair = NULL; bond = NULL; diff --git a/src/force.h b/src/force.h index f2d9abc7dd..edaac1b527 100644 --- a/src/force.h +++ b/src/force.h @@ -43,6 +43,9 @@ class Force : protected Pointers { double femtosecond; // 1 femtosecond in native units double qelectron; // 1 electron charge abs() in native units + double qqr2e_lammps_real; // different versions of this constant + double qqr2e_charmm_real; // used by new CHARMM pair styles + int newton,newton_pair,newton_bond; // Newton's 3rd law settings class Pair *pair; diff --git a/src/irregular.cpp b/src/irregular.cpp index d0210244fb..6cd1b22c2f 100644 --- a/src/irregular.cpp +++ b/src/irregular.cpp @@ -23,11 +23,16 @@ using namespace LAMMPS_NS; +#if defined(LMP_QSORT) // allocate space for static class variable // prototype for non-class function - int *Irregular::proc_recv_copy; -int compare_standalone(const void *, const void *); +static int compare_standalone(const void *, const void *); +#else +#include "mergesort.h" +// prototype for non-class function +static 
int compare_standalone(const int, const int, void *); +#endif enum{LAYOUT_UNIFORM,LAYOUT_NONUNIFORM,LAYOUT_TILED}; // several files @@ -423,8 +428,13 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag) int *length_recv_ordered = new int[nrecv_proc]; for (i = 0; i < nrecv_proc; i++) order[i] = i; + +#if defined(LMP_QSORT) proc_recv_copy = proc_recv; qsort(order,nrecv_proc,sizeof(int),compare_standalone); +#else + merge_sort(order,nrecv_proc,(void *)proc_recv,compare_standalone); +#endif int j; for (i = 0; i < nrecv_proc; i++) { @@ -450,6 +460,8 @@ int Irregular::create_atom(int n, int *sizes, int *proclist, int sortflag) return nrecvsize; } +#if defined(LMP_QSORT) + /* ---------------------------------------------------------------------- comparison function invoked by qsort() accesses static class member proc_recv_copy, set before call to qsort() @@ -465,6 +477,23 @@ int compare_standalone(const void *iptr, const void *jptr) return 0; } +#else + +/* ---------------------------------------------------------------------- + comparison function invoked by merge_sort() + void pointer contains proc_recv list; +------------------------------------------------------------------------- */ + +int compare_standalone(const int i, const int j, void *ptr) +{ + int *proc_recv = (int *) ptr; + if (proc_recv[i] < proc_recv[j]) return -1; + if (proc_recv[i] > proc_recv[j]) return 1; + return 0; +} + +#endif + /* ---------------------------------------------------------------------- communicate atoms via PlanAtom sendbuf = list of atoms to send @@ -671,8 +700,13 @@ int Irregular::create_data(int n, int *proclist, int sortflag) int *num_recv_ordered = new int[nrecv_proc]; for (i = 0; i < nrecv_proc; i++) order[i] = i; + +#if defined(LMP_QSORT) proc_recv_copy = proc_recv; qsort(order,nrecv_proc,sizeof(int),compare_standalone); +#else + merge_sort(order,nrecv_proc,(void *)proc_recv,compare_standalone); +#endif int j; for (i = 0; i < nrecv_proc; i++) { diff 
--git a/src/irregular.h b/src/irregular.h index ea0fee2eb8..1f74fe801b 100644 --- a/src/irregular.h +++ b/src/irregular.h @@ -21,9 +21,11 @@ namespace LAMMPS_NS { class Irregular : protected Pointers { public: +#if defined(LMP_QSORT) // static variable across all Irregular objects, for qsort callback static int *proc_recv_copy; +#endif Irregular(class LAMMPS *); ~Irregular(); diff --git a/src/mergesort.h b/src/mergesort.h new file mode 100644 index 0000000000..1df6cb4b81 --- /dev/null +++ b/src/mergesort.h @@ -0,0 +1,120 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_MERGESORT +#define LMP_MERGESORT + +#include <string.h> + +// custom hybrid upward merge sort implementation with support to pass +// an opaque pointer to the comparison function, e.g. for access to +// class members. this avoids having to use global variables. +// for improved performance, we employ an in-place insertion sort on +// chunks of up to 64 elements and switch to merge sort from then on. + +// part 1. insertion sort for pre-sorting of small chunks + +static void insertion_sort(int *index, int num, void *ptr, + int (*comp)(int, int, void*)) +{ + if (num < 2) return; + for (int i=1; i < num; ++i) { + int tmp = index[i]; + for (int j=i-1; j >= 0; --j) { + if ((*comp)(index[j],tmp,ptr) > 0) { + index[j+1] = index[j]; + } else { + index[j+1] = tmp; + break; + } + if (j == 0) index[0] = tmp; + } + } +} + +// part 2. 
merge two sublists + +static void do_merge(int *idx, int *buf, int llo, int lhi, int rlo, int rhi, + void *ptr, int (*comp)(int, int, void *)) +{ + int i = llo; + int l = llo; + int r = rlo; + while ((l < lhi) && (r < rhi)) { + if ((*comp)(buf[l],buf[r],ptr) < 0) + idx[i++] = buf[l++]; + else idx[i++] = buf[r++]; + } + + while (l < lhi) idx[i++] = buf[l++]; + while (r < rhi) idx[i++] = buf[r++]; +} + +// part 3: loop over sublists doubling in size with each iteration. +// pre-sort sublists with insertion sort for better performance. + +static void merge_sort(int *index, int num, void *ptr, + int (*comp)(int, int, void *)) +{ + if (num < 2) return; + + int chunk,i,j; + + // do insertion sort on chunks of up to 64 elements + + chunk = 64; + for (i=0; i < num; i += chunk) { + j = (i+chunk > num) ? num-i : chunk; + insertion_sort(index+i,j,ptr,comp); + } + + // already done? + + if (chunk >= num) return; + + // continue with merge sort on the pre-sorted chunks. + // we need an extra buffer for temporary storage and two + // pointers to operate on, so we can swap the pointers + // rather than copying to the hold buffer in each pass + + int *buf = new int[num]; + int *dest = index; + int *hold = buf; + + while (chunk < num) { + int m; + + // swap hold and destination buffer + + int *tmp = dest; dest = hold; hold = tmp; + + // merge from hold array to destination array + + for (i=0; i < num-1; i += 2*chunk) { + j = i + 2*chunk; + if (j > num) j=num; + m = i+chunk; + if (m > num) m=num; + do_merge(dest,hold,i,m,m,j,ptr,comp); + } + chunk *= 2; + } + + // if the final sorted data is in buf, copy back to index + + if (dest == buf) memcpy(index,buf,sizeof(int)*num); + + delete[] buf; +} + +#endif diff --git a/src/min.cpp b/src/min.cpp index d308efb848..af23629cad 100644 --- a/src/min.cpp +++ b/src/min.cpp @@ -165,8 +165,8 @@ void Min::init() if (neigh_every != 1 || neigh_delay != 0 || neigh_dist_check != 1) { if (comm->me == 0) - error->warning(FLERR, - "Resetting 
reneighboring criteria during minimization"); + error->warning(FLERR, "Using 'neigh_modify every 1 delay 0 check" + " yes' setting during minimization"); } neighbor->every = 1; diff --git a/src/neighbor.cpp b/src/neighbor.cpp index a5ff157a1f..e311020a60 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -1268,6 +1268,12 @@ void Neighbor::morph_copy() if (irq->ghost && !jrq->ghost) continue; + // do not copy from a history list or a respa middle/inner list + + if (jrq->history) continue; + if (jrq->respamiddle) continue; + if (jrq->respainner) continue; + // these flags must be same, // else 2 lists do not store same pairs // or their data structures are different @@ -1619,6 +1625,21 @@ void Neighbor::requests_new2old() old_oneatom = oneatom; } +/* ---------------------------------------------------------------------- + find and return request made by classptr + if not found or classptr = NULL, return NULL +------------------------------------------------------------------------- */ + +NeighRequest *Neighbor::find_request(void *classptr) +{ + if (classptr == NULL) return NULL; + + for (int i = 0; i < nrequest; i++) + if (requests[i]->requestor == classptr) return requests[i]; + + return NULL; +} + /* ---------------------------------------------------------------------- assign NBin class to a NeighList use neigh request settings to build mask diff --git a/src/neighbor.h b/src/neighbor.h index 16a80b5991..64bced2293 100644 --- a/src/neighbor.h +++ b/src/neighbor.h @@ -122,6 +122,7 @@ class Neighbor : protected Pointers { void exclusion_group_group_delete(int, int); // rm a group-group exclusion int exclude_setting(); // return exclude value to accelerator pkg + class NeighRequest *find_request(void *); // find a neighbor request bigint memory_usage(); diff --git a/src/pair_coul_streitz.h b/src/pair_coul_streitz.h index 5f02d29c11..8e713997f5 100644 --- a/src/pair_coul_streitz.h +++ b/src/pair_coul_streitz.h @@ -1,4 +1,4 @@ -/* 
---------------------------------------------------------------------- +/* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov diff --git a/src/update.cpp b/src/update.cpp index 5599dc6c88..e4c85dde73 100644 --- a/src/update.cpp +++ b/src/update.cpp @@ -154,7 +154,7 @@ void Update::set_units(const char *style) force->ftm2v = 1.0 / 48.88821291 / 48.88821291; force->mv2d = 1.0 / 0.602214129; force->nktv2p = 68568.415; - force->qqr2e = 332.06371; + force->qqr2e = 332.06371; // see also force->qqr2e_lammps_real force->qe2f = 23.060549; force->vxmu2f = 1.4393264316e4; force->xxt2kmu = 0.1; diff --git a/src/version.h b/src/version.h index dc0ebe76b8..07bfcc3885 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "19 May 2017" +#define LAMMPS_VERSION "23 Jun 2017"