diff --git a/doc/Eqs/fix_rattle_constraints.jpg b/doc/Eqs/fix_rattle_constraints.jpg
new file mode 100644
index 0000000000..2ba86095cd
Binary files /dev/null and b/doc/Eqs/fix_rattle_constraints.jpg differ
diff --git a/doc/Eqs/fix_rattle_constraints.tex b/doc/Eqs/fix_rattle_constraints.tex
new file mode 100644
index 0000000000..daff0ce775
--- /dev/null
+++ b/doc/Eqs/fix_rattle_constraints.tex
@@ -0,0 +1,9 @@
+\documentclass[12pt]{article}
+\usepackage{amsmath}
+
+\begin{document}
+\begin{align*}
+\mathbf r^{n+1}_{ij} \cdot \mathbf r^{n+1}_{ij} &= d^2_{ij} \quad \text{and} \\
+\mathbf v^{n+1}_{ij} \cdot \mathbf r^{n+1}_{ij} &= 0, \label{eq:velcon}
+\end{align*}
+\end{document}
diff --git a/doc/Eqs/fix_rattle_rij.jpg b/doc/Eqs/fix_rattle_rij.jpg
new file mode 100644
index 0000000000..b08e2fe02e
Binary files /dev/null and b/doc/Eqs/fix_rattle_rij.jpg differ
diff --git a/doc/Eqs/fix_rattle_rij.tex b/doc/Eqs/fix_rattle_rij.tex
new file mode 100644
index 0000000000..fae0457c60
--- /dev/null
+++ b/doc/Eqs/fix_rattle_rij.tex
@@ -0,0 +1,8 @@
+\documentclass[12pt]{article}
+\usepackage{amsmath}
+\begin{document}
+$$
+ \mathbf r^{n+1}_{ij} = \mathbf r^n_j - \mathbf r^n_i
+$$
+\end{document}
+
diff --git a/doc/Manual.html b/doc/Manual.html
index e4aaa9664f..0379772882 100644
--- a/doc/Manual.html
+++ b/doc/Manual.html
@@ -1,7 +1,7 @@
 <HTML>
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="24 Feb 2015 version">
+<META NAME="docnumber" CONTENT="5 Mar 2015 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -22,7 +22,7 @@
 
 <CENTER><H3>LAMMPS-ICMS Documentation 
 </H3></CENTER>
-<CENTER><H4>24 Feb 2015 version 
+<CENTER><H4>5 Mar 2015 version 
 </H4></CENTER>
 <H4>Version info: 
 </H4>
diff --git a/doc/Manual.txt b/doc/Manual.txt
index 460c523506..0b3f7e4099 100644
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@@ -1,7 +1,7 @@
 <HEAD>
 <TITLE>LAMMPS-ICMS Users Manual</TITLE>
 <TITLE>LAMMPS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="24 Feb 2015 version">
+<META NAME="docnumber" CONTENT="5 Mar 2015 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -19,7 +19,7 @@
 <H1></H1>
 
 LAMMPS-ICMS Documentation :c,h3
-24 Feb 2015 version :c,h4
+5 Mar 2015 version :c,h4
 
 Version info: :h4
 
diff --git a/doc/Section_commands.html b/doc/Section_commands.html
index c54f1e8e49..a51ad60fda 100644
--- a/doc/Section_commands.html
+++ b/doc/Section_commands.html
@@ -407,12 +407,12 @@ g = GPU, i = USER-INTEL, k = KOKKOS, o = USER-OMP, t = OPT.
 <TR ALIGN="center"><TD ><A HREF = "fix_nh.html">npt (co)</A></TD><TD ><A HREF = "fix_npt_asphere.html">npt/asphere (o)</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere (o)</A></TD><TD ><A HREF = "fix_nve.html">nve (cko)</A></TD><TD ><A HREF = "fix_nve_asphere.html">nve/asphere</A></TD><TD ><A HREF = "fix_nve_asphere_noforce.html">nve/asphere/noforce</A></TD><TD ><A HREF = "fix_nve_body.html">nve/body</A></TD><TD ><A HREF = "fix_nve_limit.html">nve/limit</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "fix_nve_line.html">nve/line</A></TD><TD ><A HREF = "fix_nve_noforce.html">nve/noforce</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere (o)</A></TD><TD ><A HREF = "fix_nve_tri.html">nve/tri</A></TD><TD ><A HREF = "fix_nh.html">nvt (co)</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere (o)</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod (o)</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere (o)</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "fix_oneway.html">oneway</A></TD><TD ><A HREF = "fix_orient_fcc.html">orient/fcc</A></TD><TD ><A HREF = "fix_planeforce.html">planeforce</A></TD><TD ><A HREF = "fix_poems.html">poems</A></TD><TD ><A HREF = "fix_pour.html">pour</A></TD><TD ><A HREF = "fix_press_berendsen.html">press/berendsen</A></TD><TD ><A HREF = "fix_print.html">print</A></TD><TD ><A HREF = "fix_property_atom.html">property/atom</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_qeq_comb.html">qeq/comb (o)</A></TD><TD ><A HREF = "fix_qeq.html">qeq/dynamic</A></TD><TD ><A HREF = "fix_qeq.html">qeq/point</A></TD><TD ><A HREF = "fix_qeq.html">qeq/shielded</A></TD><TD ><A HREF = "fix_qeq.html">qeq/slater</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/bonds</A></TD><TD ><A HREF = "fix_recenter.html">recenter</A></TD><TD ><A HREF = "fix_restrain.html">restrain</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nph (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nph</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/npt</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/small/nve</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nvt</A></TD><TD ><A HREF = "fix_setforce.html">setforce (c)</A></TD><TD ><A HREF = "fix_shake.html">shake (c)</A></TD><TD ><A HREF = "fix_spring.html">spring</A></TD><TD ><A HREF = "fix_spring_rg.html">spring/rg</A></TD><TD ><A HREF = "fix_spring_self.html">spring/self</A></TD><TD ><A HREF = "fix_srd.html">srd</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_store_force.html">store/force</A></TD><TD ><A HREF = "fix_store_state.html">store/state</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen (c)</A></TD><TD ><A HREF = "fix_temp_csvr.html">temp/csvr</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale (c)</A></TD><TD ><A HREF = "fix_tfmc.html">tfmc</A></TD><TD ><A HREF = "fix_thermal_conductivity.html">thermal/conductivity</A></TD><TD ><A HREF = "fix_tmd.html">tmd</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_ttm.html">ttm</A></TD><TD ><A HREF = "fix_tune_kspace.html">tune/kspace</A></TD><TD ><A HREF = "fix_vector.html">vector</A></TD><TD ><A HREF = "fix_viscosity.html">viscosity</A></TD><TD ><A HREF = "fix_viscous.html">viscous (c)</A></TD><TD ><A HREF = "fix_wall.html">wall/colloid</A></TD><TD ><A HREF = "fix_wall_gran.html">wall/gran</A></TD><TD ><A HREF = "fix_wall.html">wall/harmonic</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_wall.html">wall/lj1043</A></TD><TD ><A HREF = "fix_wall.html">wall/lj126</A></TD><TD ><A HREF = "fix_wall.html">wall/lj93</A></TD><TD ><A HREF = "fix_wall_piston.html">wall/piston</A></TD><TD ><A HREF = "fix_wall_reflect.html">wall/reflect</A></TD><TD ><A HREF = "fix_wall_region.html">wall/region</A></TD><TD ><A HREF = "fix_wall_srd.html">wall/srd</A> 
+<TR ALIGN="center"><TD ><A HREF = "fix_qeq_comb.html">qeq/comb (o)</A></TD><TD ><A HREF = "fix_qeq.html">qeq/dynamic</A></TD><TD ><A HREF = "fix_qeq.html">qeq/point</A></TD><TD ><A HREF = "fix_qeq.html">qeq/shielded</A></TD><TD ><A HREF = "fix_qeq.html">qeq/slater</A></TD><TD ><A HREF = "fix_shake.html">rattle</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/bonds</A></TD><TD ><A HREF = "fix_recenter.html">recenter</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_restrain.html">restrain</A></TD><TD ><A HREF = "fix_rigid.html">rigid (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nph (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/npt (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small (o)</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nph</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/small/npt</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nve</A></TD><TD ><A HREF = "fix_rigid.html">rigid/small/nvt</A></TD><TD ><A HREF = "fix_setforce.html">setforce (c)</A></TD><TD ><A HREF = "fix_shake.html">shake (c)</A></TD><TD ><A HREF = "fix_spring.html">spring</A></TD><TD ><A HREF = "fix_spring_rg.html">spring/rg</A></TD><TD ><A HREF = "fix_spring_self.html">spring/self</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_srd.html">srd</A></TD><TD ><A HREF = "fix_store_force.html">store/force</A></TD><TD ><A HREF = "fix_store_state.html">store/state</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen (c)</A></TD><TD ><A HREF = "fix_temp_csvr.html">temp/csvr</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale (c)</A></TD><TD ><A HREF = "fix_tfmc.html">tfmc</A></TD><TD ><A HREF = "fix_thermal_conductivity.html">thermal/conductivity</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_tmd.html">tmd</A></TD><TD ><A HREF = "fix_ttm.html">ttm</A></TD><TD ><A HREF = "fix_tune_kspace.html">tune/kspace</A></TD><TD ><A HREF = "fix_vector.html">vector</A></TD><TD ><A HREF = "fix_viscosity.html">viscosity</A></TD><TD ><A HREF = "fix_viscous.html">viscous (c)</A></TD><TD ><A HREF = "fix_wall.html">wall/colloid</A></TD><TD ><A HREF = "fix_wall_gran.html">wall/gran</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_wall.html">wall/harmonic</A></TD><TD ><A HREF = "fix_wall.html">wall/lj1043</A></TD><TD ><A HREF = "fix_wall.html">wall/lj126</A></TD><TD ><A HREF = "fix_wall.html">wall/lj93</A></TD><TD ><A HREF = "fix_wall_piston.html">wall/piston</A></TD><TD ><A HREF = "fix_wall_reflect.html">wall/reflect</A></TD><TD ><A HREF = "fix_wall_region.html">wall/region</A></TD><TD ><A HREF = "fix_wall_srd.html">wall/srd</A> 
 </TD></TR></TABLE></DIV>
 
 <P>These are additional fix styles in USER packages, which can be used if
@@ -499,7 +499,7 @@ KOKKOS, o = USER-OMP, t = OPT.
 <TR ALIGN="center"><TD ><A HREF = "pair_nm.html">nm/cut (o)</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/cut (o)</A></TD><TD ><A HREF = "pair_nm.html">nm/cut/coul/long (o)</A></TD><TD ><A HREF = "pair_peri.html">peri/eps</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_peri.html">peri/lps (o)</A></TD><TD ><A HREF = "pair_peri.html">peri/pmb (o)</A></TD><TD ><A HREF = "pair_peri.html">peri/ves</A></TD><TD ><A HREF = "pair_reax.html">reax</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_airebo.html">rebo (o)</A></TD><TD ><A HREF = "pair_resquared.html">resquared (go)</A></TD><TD ><A HREF = "pair_snap.html">snap</A></TD><TD ><A HREF = "pair_soft.html">soft (go)</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw (cgo)</A></TD><TD ><A HREF = "pair_table.html">table (gko)</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff (co)</A></TD><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod (o)</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_sw.html">sw (cgio)</A></TD><TD ><A HREF = "pair_table.html">table (gko)</A></TD><TD ><A HREF = "pair_tersoff.html">tersoff (co)</A></TD><TD ><A HREF = "pair_tersoff_mod.html">tersoff/mod (o)</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_tersoff_zbl.html">tersoff/zbl (o)</A></TD><TD ><A HREF = "pair_coul.html">tip4p/cut (o)</A></TD><TD ><A HREF = "pair_coul.html">tip4p/long (o)</A></TD><TD ><A HREF = "pair_tri_lj.html">tri/lj (o)</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "pair_yukawa.html">yukawa (go)</A></TD><TD ><A HREF = "pair_yukawa_colloid.html">yukawa/colloid (go)</A></TD><TD ><A HREF = "pair_zbl.html">zbl (o)</A> 
 </TD></TR></TABLE></DIV>
diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt
index 3138cf9789..fb527f942f 100644
--- a/doc/Section_commands.txt
+++ b/doc/Section_commands.txt
@@ -535,6 +535,7 @@ g = GPU, i = USER-INTEL, k = KOKKOS, o = USER-OMP, t = OPT.
 "qeq/point"_fix_qeq.html,
 "qeq/shielded"_fix_qeq.html,
 "qeq/slater"_fix_qeq.html,
+"rattle"_fix_shake.html,
 "reax/bonds"_fix_reax_bonds.html,
 "recenter"_fix_recenter.html,
 "restrain"_fix_restrain.html,
@@ -813,7 +814,7 @@ KOKKOS, o = USER-OMP, t = OPT.
 "resquared (go)"_pair_resquared.html,
 "snap"_pair_snap.html,
 "soft (go)"_pair_soft.html,
-"sw (cgo)"_pair_sw.html,
+"sw (cgio)"_pair_sw.html,
 "table (gko)"_pair_table.html,
 "tersoff (co)"_pair_tersoff.html,
 "tersoff/mod (o)"_pair_tersoff_mod.html,
diff --git a/doc/Section_start.html b/doc/Section_start.html
index 06e22f8b92..98e33fc5f7 100644
--- a/doc/Section_start.html
+++ b/doc/Section_start.html
@@ -377,10 +377,11 @@ need to build the STUBS library for your platform before making LAMMPS
 itself.  Note that if you are building with src/MAKE/Makefile.serial,
 e.g. by typing "make serial", then the STUBS library is built for you.
 </P>
-<P>To build the STUBS library from the src directory, type "make stubs",
-or from the src/STUBS dir, type "make".  This should create a
-libmpi_stubs.a file suitable for linking to LAMMPS.  If the build
-fails, you will need to edit the STUBS/Makefile for your platform.
+<P>To build the STUBS library from the src directory, type "make
+mpi-stubs", or from the src/STUBS dir, type "make".  This should
+create a libmpi_stubs.a file suitable for linking to LAMMPS.  If the
+build fails, you will need to edit the STUBS/Makefile for your
+platform.
 </P>
 <P>The file STUBS/mpi.c provides a CPU timer function called MPI_Wtime()
 that calls gettimeofday() .  If your system doesn't support
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index e5fe747d41..1eeb81be63 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -371,10 +371,11 @@ need to build the STUBS library for your platform before making LAMMPS
 itself.  Note that if you are building with src/MAKE/Makefile.serial,
 e.g. by typing "make serial", then the STUBS library is built for you.
 
-To build the STUBS library from the src directory, type "make stubs",
-or from the src/STUBS dir, type "make".  This should create a
-libmpi_stubs.a file suitable for linking to LAMMPS.  If the build
-fails, you will need to edit the STUBS/Makefile for your platform.
+To build the STUBS library from the src directory, type "make
+mpi-stubs", or from the src/STUBS dir, type "make".  This should
+create a libmpi_stubs.a file suitable for linking to LAMMPS.  If the
+build fails, you will need to edit the STUBS/Makefile for your
+platform.
 
 The file STUBS/mpi.c provides a CPU timer function called MPI_Wtime()
 that calls gettimeofday() .  If your system doesn't support
diff --git a/doc/fix_shake.html b/doc/fix_shake.html
index c1f62b8ac7..020ef7e870 100644
--- a/doc/fix_shake.html
+++ b/doc/fix_shake.html
@@ -13,13 +13,15 @@
 </H3>
 <H3>fix shake/cuda command 
 </H3>
+<H3>fix rattle command 
+</H3>
 <P><B>Syntax:</B>
 </P>
-<PRE>fix ID group-ID shake tol iter N constraint values ... keyword value ... 
+<PRE>fix ID group-ID style tol iter N constraint values ... keyword value ... 
 </PRE>
 <UL><LI>ID, group-ID are documented in <A HREF = "fix.html">fix</A> command 
 
-<LI>shake = style name of this fix command 
+<LI>style = shake or rattle = style name of this fix command 
 
 <LI>tol = accuracy tolerance of SHAKE solution 
 
@@ -49,24 +51,61 @@
 </P>
 <PRE>fix 1 sub shake 0.0001 20 10 b 4 19 a 3 5 2
 fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31
-fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol 
+fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol
+fix 1 sub rattle 0.0001 20 10 t 5 6 m 1.0 a 31
+fix 1 sub rattle 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol 
 </PRE>
 <P><B>Description:</B>
 </P>
 <P>Apply bond and angle constraints to specified bonds and angles in the
-simulation.  This typically enables a longer timestep.
+simulation by either the SHAKE or RATTLE algorithms.  This typically
+enables a longer timestep.
+</P>
+<P><B>SHAKE vs RATTLE:</B>
+</P>
+<P>The SHAKE algorithm was invented for schemes such as standard Verlet
+timesteppnig, where only the coordinates are integrated and the
+velocities are approximated as finite differences to the trajectories
+(<A HREF = "#Ryckaert">Ryckaert et al. (1977)</A>).  If the velocities are
+integrated explicitly, as with velocity Verlet which is what LAMMPS
+uses as an integration method, a second set of constraining forces is
+required in order to eliminate velocity components along the bonds
+(<A HREF = "#Andersen">Andersen (1983)</A>).
+</P>
+<P>In order to formulate individual constraints for SHAKE and RATTLE,
+focus on a single molecule whose bonds are constrained.  Let Ri and Vi
+be the position and velocity of atom <I>i</I> at time <I>n</I>, for
+<I>i</I>=1,...,<I>N</I>, where <I>N</I> is the number of sites of our reference
+molecule. The distance vector between sites <I>i</I> and <I>j</I> is given by
+</P>
+<CENTER><IMG SRC = "Eqs/fix_rattle_rij.jpg">
+</CENTER>
+<P>The constraints can then be formulated as
+</P>
+<CENTER><IMG SRC = "Eqs/fix_rattle_constraints.jpg">
+</CENTER>
+<P>The SHAKE algorithm satisfies the first condition, i.e. the sites at
+time <I>n+1</I> will have the desired separations Dij immediately after the
+coordinates are integrated.  If we also enforce the second condition,
+the velocity components along the bonds will vanish.  RATTLE satisfies
+both conditions.  As implemented in LAMMPS, fix rattle uses fix shake
+for satisfying the coordinate constraints. Therefore the settings and
+optional keywords are the same for both fixes, and all the information
+below about SHAKE is also relevant for RATTLE.
+</P>
+<P><B>SHAKE:</B>
 </P>
 <P>Each timestep the specified bonds and angles are reset to their
-equilibrium lengths and angular values via the well-known SHAKE
-algorithm.  This is done by applying an additional constraint force so
-that the new positions preserve the desired atom separations.  The
-equations for the additional force are solved via an iterative method
-that typically converges to an accurate solution in a few iterations.
-The desired tolerance (e.g. 1.0e-4 = 1 part in 10000) and maximum # of
-iterations are specified as arguments.  Setting the N argument will
-print statistics to the screen and log file about regarding the
-lengths of bonds and angles that are being constrained.  Small delta
-values mean SHAKE is doing a good job.
+equilibrium lengths and angular values via the SHAKE algorithm
+(<A HREF = "#Ryckaert">Ryckaert et al. (1977)</A>).  This is done by applying an
+additional constraint force so that the new positions preserve the
+desired atom separations.  The equations for the additional force are
+solved via an iterative method that typically converges to an accurate
+solution in a few iterations.  The desired tolerance (e.g. 1.0e-4 = 1
+part in 10000) and maximum # of iterations are specified as arguments.
+Setting the N argument will print statistics to the screen and log
+file about regarding the lengths of bonds and angles that are being
+constrained.  Small delta values mean SHAKE is doing a good job.
 </P>
 <P>In LAMMPS, only small clusters of atoms can be constrained.  This is
 so the constraint calculation for a cluster can be performed by a
@@ -144,24 +183,42 @@ more instructions on how to use the accelerated styles effectively.
 </P>
 <HR>
 
+<P><B>RATTLE:</B>
+</P>
+<P>The velocity constraints lead to a linear system of equations which
+can be solved analytically.  The implementation of the algorithm in
+LAMMPS closely follows (<A HREF = "#Andersen">Andersen (1983)</A>).
+</P>
+<P>IMPORTANT NOTE: The fix rattle command modifies forces and velocities
+and thus should be defined after all other integration fixes in your
+input script.  If you define other fixes that modify velocities or
+forces after fix rattle operates, then fix rattle will not take them
+into account and the overall time integration will typically not
+satisfy the RATTLE constraints.  You can check whether the constraints
+work correctly by setting the value of RATTLE_DEBUG in
+src/fix_rattle.cpp to 1 and recompiling LAMMPS.
+</P>
+<HR>
+
 <P><B>Restart, fix_modify, output, run start/stop, minimize info:</B>
 </P>
-<P>No information about this fix is written to <A HREF = "restart.html">binary restart
+<P>No information about these fixes is written to <A HREF = "restart.html">binary restart
 files</A>.  None of the <A HREF = "fix_modify.html">fix_modify</A> options
-are relevant to this fix.  No global or per-atom quantities are stored
-by this fix for access by various <A HREF = "Section_howto.html#howto_15">output
-commands</A>.  No parameter of this fix can
-be used with the <I>start/stop</I> keywords of the <A HREF = "run.html">run</A> command.
-This fix is not invoked during <A HREF = "minimize.html">energy minimization</A>.
+are relevant to these fixes.  No global or per-atom quantities are
+stored by these fixes for access by various <A HREF = "Section_howto.html#howto_15">output
+commands</A>.  No parameter of these fixes
+can be used with the <I>start/stop</I> keywords of the <A HREF = "run.html">run</A>
+command.  These fixes are not invoked during <A HREF = "minimize.html">energy
+minimization</A>.
 </P>
 <P><B>Restrictions:</B>
 </P>
-<P>This fix is part of the RIGID package.  It is only enabled if LAMMPS
-was built with that package.  See the <A HREF = "Section_start.html#start_3">Making
+<P>These fixes are part of the RIGID package.  They are only enabled if
+LAMMPS was built with that package.  See the <A HREF = "Section_start.html#start_3">Making
 LAMMPS</A> section for more info.
 </P>
-<P>For computational efficiency, there can only be one shake fix defined
-in a simulation.
+<P>For computational efficiency, there can only be one shake or rattle
+fix defined in a simulation.
 </P>
 <P>If you use a tolerance that is too large or a max-iteration count that
 is too small, the constraints will not be enforced very strongly,
@@ -173,4 +230,16 @@ of SHAKE parameters and monitoring the energy versus time.
 </P>
 <P><B>Default:</B> none
 </P>
+<HR>
+
+<A NAME = "Ryckaert"></A>
+
+<P><B>(Ryckaert)</B> J.-P. Ryckaert, G. Ciccotti and H. J. C. Berendsen, 
+Journal of Computational Physics, 23, 327–341 (1977).
+</P>
+<A NAME = "Andersen"></A>
+
+<P><B>(Andersen)</B> H. Andersen,
+Journal of Computational Physics, 52, 24-34 (1983).
+</P>
 </HTML>
diff --git a/doc/fix_shake.txt b/doc/fix_shake.txt
index edcb10ea2a..32c98711b4 100644
--- a/doc/fix_shake.txt
+++ b/doc/fix_shake.txt
@@ -8,13 +8,14 @@
 
 fix shake command :h3
 fix shake/cuda command :h3
+fix rattle command :h3
 
 [Syntax:]
 
-fix ID group-ID shake tol iter N constraint values ... keyword value ... :pre
+fix ID group-ID style tol iter N constraint values ... keyword value ... :pre
 
 ID, group-ID are documented in "fix"_fix.html command :ulb,l
-shake = style name of this fix command :l
+style = shake or rattle = style name of this fix command :l
 tol = accuracy tolerance of SHAKE solution :l
 iter = max # of iterations in each SHAKE solution :l
 N = print SHAKE statistics every this many timesteps (0 = never) :l
@@ -34,24 +35,61 @@ keyword = {mol} :l
 
 fix 1 sub shake 0.0001 20 10 b 4 19 a 3 5 2
 fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31
-fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol :pre
+fix 1 sub shake 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol
+fix 1 sub rattle 0.0001 20 10 t 5 6 m 1.0 a 31
+fix 1 sub rattle 0.0001 20 10 t 5 6 m 1.0 a 31 mol myMol :pre
 
 [Description:]
 
 Apply bond and angle constraints to specified bonds and angles in the
-simulation.  This typically enables a longer timestep.
+simulation by either the SHAKE or RATTLE algorithms.  This typically
+enables a longer timestep.
+
+[SHAKE vs RATTLE:]
+
+The SHAKE algorithm was invented for schemes such as standard Verlet
+timesteppnig, where only the coordinates are integrated and the
+velocities are approximated as finite differences to the trajectories
+("Ryckaert et al. (1977)"_#Ryckaert).  If the velocities are
+integrated explicitly, as with velocity Verlet which is what LAMMPS
+uses as an integration method, a second set of constraining forces is
+required in order to eliminate velocity components along the bonds
+("Andersen (1983)"_#Andersen).
+
+In order to formulate individual constraints for SHAKE and RATTLE,
+focus on a single molecule whose bonds are constrained.  Let Ri and Vi
+be the position and velocity of atom {i} at time {n}, for
+{i}=1,...,{N}, where {N} is the number of sites of our reference
+molecule. The distance vector between sites {i} and {j} is given by
+
+:c,image(Eqs/fix_rattle_rij.jpg)
+
+The constraints can then be formulated as
+
+:c,image(Eqs/fix_rattle_constraints.jpg)
+
+The SHAKE algorithm satisfies the first condition, i.e. the sites at
+time {n+1} will have the desired separations Dij immediately after the
+coordinates are integrated.  If we also enforce the second condition,
+the velocity components along the bonds will vanish.  RATTLE satisfies
+both conditions.  As implemented in LAMMPS, fix rattle uses fix shake
+for satisfying the coordinate constraints. Therefore the settings and
+optional keywords are the same for both fixes, and all the information
+below about SHAKE is also relevant for RATTLE.
+
+[SHAKE:]
 
 Each timestep the specified bonds and angles are reset to their
-equilibrium lengths and angular values via the well-known SHAKE
-algorithm.  This is done by applying an additional constraint force so
-that the new positions preserve the desired atom separations.  The
-equations for the additional force are solved via an iterative method
-that typically converges to an accurate solution in a few iterations.
-The desired tolerance (e.g. 1.0e-4 = 1 part in 10000) and maximum # of
-iterations are specified as arguments.  Setting the N argument will
-print statistics to the screen and log file about regarding the
-lengths of bonds and angles that are being constrained.  Small delta
-values mean SHAKE is doing a good job.
+equilibrium lengths and angular values via the SHAKE algorithm
+("Ryckaert et al. (1977)"_#Ryckaert).  This is done by applying an
+additional constraint force so that the new positions preserve the
+desired atom separations.  The equations for the additional force are
+solved via an iterative method that typically converges to an accurate
+solution in a few iterations.  The desired tolerance (e.g. 1.0e-4 = 1
+part in 10000) and maximum # of iterations are specified as arguments.
+Setting the N argument will print statistics to the screen and log
+file about regarding the lengths of bonds and angles that are being
+constrained.  Small delta values mean SHAKE is doing a good job.
 
 In LAMMPS, only small clusters of atoms can be constrained.  This is
 so the constraint calculation for a cluster can be performed by a
@@ -129,24 +167,42 @@ more instructions on how to use the accelerated styles effectively.
 
 :line
 
+[RATTLE:]
+
+The velocity constraints lead to a linear system of equations which
+can be solved analytically.  The implementation of the algorithm in
+LAMMPS closely follows ("Andersen (1983)"_#Andersen).
+	
+IMPORTANT NOTE: The fix rattle command modifies forces and velocities
+and thus should be defined after all other integration fixes in your
+input script.  If you define other fixes that modify velocities or
+forces after fix rattle operates, then fix rattle will not take them
+into account and the overall time integration will typically not
+satisfy the RATTLE constraints.  You can check whether the constraints
+work correctly by setting the value of RATTLE_DEBUG in
+src/fix_rattle.cpp to 1 and recompiling LAMMPS.
+
+:line
+
 [Restart, fix_modify, output, run start/stop, minimize info:]
 
-No information about this fix is written to "binary restart
+No information about these fixes is written to "binary restart
 files"_restart.html.  None of the "fix_modify"_fix_modify.html options
-are relevant to this fix.  No global or per-atom quantities are stored
-by this fix for access by various "output
-commands"_Section_howto.html#howto_15.  No parameter of this fix can
-be used with the {start/stop} keywords of the "run"_run.html command.
-This fix is not invoked during "energy minimization"_minimize.html.
+are relevant to these fixes.  No global or per-atom quantities are
+stored by these fixes for access by various "output
+commands"_Section_howto.html#howto_15.  No parameter of these fixes
+can be used with the {start/stop} keywords of the "run"_run.html
+command.  These fixes are not invoked during "energy
+minimization"_minimize.html.
 
 [Restrictions:]
 
-This fix is part of the RIGID package.  It is only enabled if LAMMPS
-was built with that package.  See the "Making
+These fixes are part of the RIGID package.  They are only enabled if
+LAMMPS was built with that package.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
 
-For computational efficiency, there can only be one shake fix defined
-in a simulation.
+For computational efficiency, there can only be one shake or rattle
+fix defined in a simulation.
 
 If you use a tolerance that is too large or a max-iteration count that
 is too small, the constraints will not be enforced very strongly,
@@ -157,3 +213,13 @@ of SHAKE parameters and monitoring the energy versus time.
 [Related commands:] none
 
 [Default:] none
+
+:line 
+ 
+:link(Ryckaert)
+[(Ryckaert)] J.-P. Ryckaert, G. Ciccotti and H. J. C. Berendsen, 
+Journal of Computational Physics, 23, 327–341 (1977).
+ 
+:link(Andersen)
+[(Andersen)] H. Andersen,
+Journal of Computational Physics, 52, 24-34 (1983).
diff --git a/doc/pair_sw.html b/doc/pair_sw.html
index 040b4166ea..70611a5328 100644
--- a/doc/pair_sw.html
+++ b/doc/pair_sw.html
@@ -15,6 +15,8 @@
 </H3>
 <H3>pair_style sw/gpu command 
 </H3>
+<H3>pair_style sw/intel command 
+</H3>
 <H3>pair_style sw/omp command 
 </H3>
 <P><B>Syntax:</B>
@@ -164,6 +166,11 @@ by including their suffix, or you can use the <A HREF = "Section_start.html#star
 switch</A> when you invoke LAMMPS, or you can
 use the <A HREF = "suffix.html">suffix</A> command in your input script.
 </P>
+<P>When using the USER-INTEL package with this style, there is an
+additional 5 to 10 percent performance improvement when the
+Stillinger-Weber parameters p and q are set to 4 and 0 respectively.
+These parameters are common for modeling silicon and water.
+</P>
 <P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for
 more instructions on how to use the accelerated styles effectively.
 </P>
diff --git a/doc/pair_sw.txt b/doc/pair_sw.txt
index 40f86682d3..b2f9f8a5f2 100644
--- a/doc/pair_sw.txt
+++ b/doc/pair_sw.txt
@@ -9,6 +9,7 @@
 pair_style sw command :h3
 pair_style sw/cuda command :h3
 pair_style sw/gpu command :h3
+pair_style sw/intel command :h3
 pair_style sw/omp command :h3
 
 [Syntax:]
@@ -158,6 +159,11 @@ by including their suffix, or you can use the "-suffix command-line
 switch"_Section_start.html#start_7 when you invoke LAMMPS, or you can
 use the "suffix"_suffix.html command in your input script.
 
+When using the USER-INTEL package with this style, there is an
+additional 5 to 10 percent performance improvement when the
+Stillinger-Weber parameters p and q are set to 4 and 0 respectively.
+These parameters are common for modeling silicon and water.
+
 See "Section_accelerate"_Section_accelerate.html of the manual for
 more instructions on how to use the accelerated styles effectively.
 
diff --git a/src/MC/fix_atom_swap.cpp b/src/MC/fix_atom_swap.cpp
index 1192dd4776..37264fe185 100644
--- a/src/MC/fix_atom_swap.cpp
+++ b/src/MC/fix_atom_swap.cpp
@@ -739,6 +739,7 @@ void FixAtomSwap::write_restart(FILE *fp)
   int n = 0;
   double list[4];
   list[n++] = random_equal->state();
+  list[n++] = random_unequal->state();
   list[n++] = next_reneighbor;
 
   if (comm->me == 0) {
diff --git a/src/Makefile b/src/Makefile
index 5b995da4ee..0280d3ffea 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -46,7 +46,7 @@ help:
 	@echo 'make -f Makefile.lib machine      build LAMMPS as static library for machine'
 	@echo 'make -f Makefile.shlib machine    build LAMMPS as shared library for machine'
 	@echo 'make -f Makefile.list machine     build LAMMPS from explicit list of files'
-	@echo 'make stubs               build dummy MPI library in STUBS'
+	@echo 'make mpi-stubs           build dummy MPI library in STUBS'
 	@echo 'make install-python      install LAMMPS wrapper in Python'
 	@echo 'make tar                 create lmp_src.tar.gz of src dir and packages'
 	@echo ''
@@ -94,7 +94,7 @@ help:
 
 .DEFAULT:
 	@if [ $@ = "serial" -a ! -f STUBS/libmpi_stubs.a ]; \
-	  then $(MAKE) stubs; fi
+	  then $(MAKE) mpi-stubs; fi
 	@test -f MAKE/Makefile.$@ -o -f MAKE/OPTIONS/Makefile.$@ -o \
 	  -f MAKE/MACHINES/Makefile.$@ -o -f MAKE/MINE/Makefile.$@
 	@if [ ! -d Obj_$@ ]; then mkdir Obj_$@; fi
@@ -144,7 +144,7 @@ makelist:
 
 # Make MPI STUBS library
 
-stubs:
+mpi-stubs:
 	@cd STUBS; $(MAKE) clean; $(MAKE)
 
 # install LAMMPS shared lib and Python wrapper for Python usage
diff --git a/src/RIGID/fix_rattle.cpp b/src/RIGID/fix_rattle.cpp
new file mode 100644
index 0000000000..6a693c6bc3
--- /dev/null
+++ b/src/RIGID/fix_rattle.cpp
@@ -0,0 +1,883 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Peter Wirnsberger (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "math.h"
+#include "stdlib.h"
+#include "string.h"
+#include "stdio.h"
+#include "fix_rattle.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "molecule.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "domain.h"
+#include "force.h"
+#include "bond.h"
+#include "angle.h"
+#include "comm.h"
+#include "group.h"
+#include "fix_respa.h"
+#include "math_const.h"
+#include "math_extra.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+using namespace MathConst;
+using namespace MathExtra;
+
+// set RATTLE_DEBUG = 1 to check constraints at end of timestep
+
+#define RATTLE_DEBUG 0
+#define RATTLE_TEST_VEL true
+#define RATTLE_TEST_POS true
+
+enum{V,VP,XSHAKE,X};
+
+/* ---------------------------------------------------------------------- */
+
+FixRattle::FixRattle(LAMMPS *lmp, int narg, char **arg) :
+  FixShake(lmp, narg, arg)
+{
+  rattle = 1;
+
+  // define timestep for velocity integration
+
+  dtfv = 0.5 * update->dt * force->ftm2v;
+
+  // allocate memory for unconstrained velocity update
+
+  vp = NULL;
+  grow_arrays(atom->nmax);
+
+  // default communication mode
+  // necessary for compatibility with SHAKE
+  // see pack_forward and unpack_forward
+
+  comm_mode = XSHAKE;
+  vflag_post_force = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixRattle::~FixRattle()
+{ 
+  memory->destroy(vp);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixRattle::setmask()
+{
+  int mask = 0;
+  mask |= PRE_NEIGHBOR;
+  mask |= POST_FORCE; 
+  mask |= POST_FORCE_RESPA;
+  mask |= FINAL_INTEGRATE;
+  mask |= FINAL_INTEGRATE_RESPA;
+  if (RATTLE_DEBUG) mask |= END_OF_STEP;          
+  return mask;
+}
+
+/* ----------------------------------------------------------------------
+   initialize RATTLE and check that this is the last final_integrate fix
+------------------------------------------------------------------------- */
+
+void FixRattle::init() {
+
+  // initialise SHAKE first
+
+  FixShake::init();
+
+  // show a warning if any final-integrate fix comes after this one
+
+  int after = 0;
+  int flag = 0;
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(id,modify->fix[i]->id) == 0) after = 1;
+    else if ((modify->fmask[i] & FINAL_INTEGRATE) && after) flag = 1;
+  }
+
+  if (flag && comm->me == 0)
+    error->warning(FLERR,
+                   "Fix rattle should come after all other integration fixes");
+}
+
+/* ----------------------------------------------------------------------
+   This method carries out an unconstrained velocity update first and
+   then applies the velocity corrections directly (v and vp are modified).
+------------------------------------------------------------------------- */
+
+void FixRattle::post_force(int vflag)
+{
+  // remember vflag for the coordinate correction in this->final_integrate
+
+  vflag_post_force = vflag;
+
+  // unconstrained velocity update by half a timestep
+  // similar to FixShake::unconstrained_update()
+
+  update_v_half_nocons();
+
+  // communicate the unconstrained velocities
+
+  if (nprocs > 1) {
+    comm_mode = VP;
+    comm->forward_comm_fix(this);
+  }
+
+  // correct the velocity for each molecule accordingly
+
+  int m;
+  for (int i = 0; i < nlist; i++) {
+    m = list[i];
+    if      (shake_flag[m] == 2)        vrattle2(m);   
+    else if (shake_flag[m] == 3)        vrattle3(m);  
+    else if (shake_flag[m] == 4)        vrattle4(m);  
+    else                                vrattle3angle(m); 
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // remember vflag for the coordinate correction in this->final_integrate
+
+  vflag_post_force = vflag;
+
+  // unconstrained velocity update by half a timestep
+  // similar to FixShake::unconstrained_update()
+
+  update_v_half_nocons_respa(ilevel);
+
+  // communicate the unconstrained velocities
+
+  if (nprocs > 1) {
+    comm_mode = VP;
+    comm->forward_comm_fix(this);
+  }
+
+  // correct the velocity for each molecule accordingly
+
+  int m;
+  for (int i = 0; i < nlist; i++) {
+    m = list[i];
+    if      (shake_flag[m] == 2)        vrattle2(m);
+    else if (shake_flag[m] == 3)        vrattle3(m);
+    else if (shake_flag[m] == 4)        vrattle4(m);
+    else                                vrattle3angle(m);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   let SHAKE calculate the constraining forces for the coordinates
+------------------------------------------------------------------------- */
+
+void FixRattle::final_integrate()
+{
+  comm_mode = XSHAKE;
+  FixShake::post_force(vflag_post_force);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::final_integrate_respa(int ilevel, int iloop)
+{
+  comm_mode = XSHAKE;
+  FixShake::post_force_respa(vflag_post_force, ilevel, iloop);
+}
+
+/* ----------------------------------------------------------------------
+   correct velocities of molecule m with 2 constraints bonds and 1 angle
+------------------------------------------------------------------------- */
+
+void FixRattle::vrattle3angle(int m)
+{
+  tagint i0,i1,i2;
+  int nlist,list[3];
+  double c[3], l[3], a[3][3], r01[3], imass[3],
+         r02[3], r12[3], vp01[3], vp02[3], vp12[3];
+
+  // local atom IDs and constraint distances
+  i0 = atom->map(shake_atom[m][0]);
+  i1 = atom->map(shake_atom[m][1]);
+  i2 = atom->map(shake_atom[m][2]);
+
+  // r01,r02,r12 = distance vec between atoms
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+  MathExtra::sub3(x[i2],x[i1],r12);
+
+  // take into account periodicity
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+  domain->minimum_image(r12);
+
+  // v01,v02,v12 = velocity differences 
+  MathExtra::sub3(vp[i1],vp[i0],vp01);
+  MathExtra::sub3(vp[i2],vp[i0],vp02);
+  MathExtra::sub3(vp[i2],vp[i1],vp12);
+
+  // matrix coeffs and rhs for lamda equations
+  if (rmass) {
+    imass[0] = 1.0/rmass[i0];
+    imass[1] = 1.0/rmass[i1];
+    imass[2] = 1.0/rmass[i2];
+  } else {
+    imass[0] = 1.0/mass[type[i0]];
+    imass[1] = 1.0/mass[type[i1]];
+    imass[2] = 1.0/mass[type[i2]];
+  }
+
+  // setup matrix
+  a[0][0]   =   (imass[1] + imass[0])   * MathExtra::dot3(r01,r01);
+  a[0][1]   =   (imass[0]           )   * MathExtra::dot3(r01,r02);
+  a[0][2]   =   (-imass[1]          )   * MathExtra::dot3(r01,r12);
+  a[1][0]   =   a[0][1];
+  a[1][1]   =   (imass[0] + imass[2])   * MathExtra::dot3(r02,r02); 
+  a[1][2]   =   (imass[2]           )   * MathExtra::dot3(r02,r12);
+  a[2][0]   =   a[0][2];
+  a[2][1]   =   a[1][2];
+  a[2][2]   =   (imass[2] + imass[1])   * MathExtra::dot3(r12,r12);
+
+  // sestup RHS
+  c[0]  = -MathExtra::dot3(vp01,r01);
+  c[1]  = -MathExtra::dot3(vp02,r02);
+  c[2]  = -MathExtra::dot3(vp12,r12);
+
+  // calculate the inverse matrix exactly
+  solve3x3exactly(a,c,l);
+
+  // add corrections to the velocities if processor owns atom
+  if (i0 < nlocal) {
+    for (int k=0; k<3; k++)  
+      v[i0][k]  -=  imass[0]*  (  l[0] * r01[k] + l[1] * r02[k] );
+  } 
+  if (i1 < nlocal) {
+    for (int k=0; k<3; k++)  
+      v[i1][k]  -=  imass[1] * ( -l[0] * r01[k] + l[2] * r12[k] );
+  }
+  if (i2 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i2][k] -=   imass[2] * ( -l[1] * r02[k] - l[2] * r12[k] );
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::vrattle2(int m)
+{
+  tagint    i0, i1;
+  int       nlist,list[2];
+  double    imass[2], r01[3], vp01[3];
+
+  // local atom IDs and constraint distances
+  i0 = atom->map(shake_atom[m][0]);
+  i1 = atom->map(shake_atom[m][1]);
+
+  // r01 = distance vec between atoms, with PBC
+  MathExtra::sub3(x[i1],x[i0],r01);
+  domain->minimum_image(r01);
+
+  // v01 = distance vectors for velocities
+  MathExtra::sub3(vp[i1],vp[i0],vp01);
+
+  // matrix coeffs and rhs for lamda equations
+  if (rmass) {
+    imass[0] = 1.0/rmass[i0];
+    imass[1] = 1.0/rmass[i1];
+  } else {
+    imass[0] = 1.0/mass[type[i0]];
+    imass[1] = 1.0/mass[type[i1]];
+  }
+
+  // Lagrange multiplier: exact solution
+  double l01 = - MathExtra::dot3(r01,vp01) / 
+    (MathExtra::dot3(r01,r01) * (imass[0] + imass[1]));
+
+  // add corrections to the velocities if the process owns this atom
+  if (i0 < nlocal) {
+    for (int k=0; k<3; k++)  
+      v[i0][k] -= imass[0] * l01 * r01[k];
+  } 
+  if (i1 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i1][k] -= imass[1] * (-l01) * r01[k];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::vrattle3(int m)
+{
+  tagint    i0,i1,i2;
+  int       nlist,list[3];
+  double    imass[3], r01[3], r02[3], vp01[3], vp02[3],
+            a[2][2],c[2],l[2];
+
+  // local atom IDs and constraint distances
+  i0 = atom->map(shake_atom[m][0]);
+  i1 = atom->map(shake_atom[m][1]);
+  i2 = atom->map(shake_atom[m][2]);
+
+  // r01,r02 = distance vec between atoms, with PBC
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+ 
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+
+  // vp01,vp02 =  distance vectors between velocities
+  MathExtra::sub3(vp[i1],vp[i0],vp01);
+  MathExtra::sub3(vp[i2],vp[i0],vp02);
+
+  if (rmass) {
+    imass[0] = 1.0/rmass[i0];
+    imass[1] = 1.0/rmass[i1];
+    imass[2] = 1.0/rmass[i2];
+  } else {
+    imass[0] = 1.0/mass[type[i0]];
+    imass[1] = 1.0/mass[type[i1]];
+    imass[2] = 1.0/mass[type[i2]];
+  }
+
+  // setup matrix
+  a[0][0]   =   (imass[1] + imass[0])   * MathExtra::dot3(r01,r01);
+  a[0][1]   =   (imass[0]           )   * MathExtra::dot3(r01,r02);
+  a[1][0]   =   a[0][1];
+  a[1][1]   =   (imass[0] + imass[2])   * MathExtra::dot3(r02,r02);
+
+  // setup RHS
+  c[0]  = - MathExtra::dot3(vp01,r01);
+  c[1]  = - MathExtra::dot3(vp02,r02);
+
+  // calculate the inverse 2x2 matrix exactly
+  solve2x2exactly(a,c,l);
+
+  // add corrections to the velocities if the process owns this atom
+  if (i0 < nlocal) {
+    for (int k=0; k<3; k++)  
+      v[i0][k] -= imass[0] * (  l[0] * r01[k] + l[1] * r02[k] );
+  } 
+  if (i1 < nlocal) 
+    for (int k=0; k<3; k++) {
+      v[i1][k] -= imass[1] * ( -l[0] * r01[k] );
+  }
+  if (i2 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i2][k] -= imass[2] * ( -l[1] * r02[k] );
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::vrattle4(int m)
+{
+  tagint    i0,i1,i2,i3;
+  int       nlist,list[4];
+  double    imass[4], c[3], l[3], a[3][3],
+            r01[3], r02[3], r03[3], vp01[3], vp02[3], vp03[3];
+
+  // local atom IDs and constraint distances
+  i0 = atom->map(shake_atom[m][0]);
+  i1 = atom->map(shake_atom[m][1]);
+  i2 = atom->map(shake_atom[m][2]);
+  i3 = atom->map(shake_atom[m][3]);
+
+  // r01,r02,r12 = distance vec between atoms, with PBC
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+  MathExtra::sub3(x[i3],x[i0],r03);
+
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+  domain->minimum_image(r03);
+
+  // vp01,vp02,vp03 = distance vectors between velocities
+  MathExtra::sub3(vp[i1],vp[i0],vp01);
+  MathExtra::sub3(vp[i2],vp[i0],vp02);
+  MathExtra::sub3(vp[i3],vp[i0],vp03);
+
+  // matrix coeffs and rhs for lamda equations
+  if (rmass) {
+    imass[0] = 1.0/rmass[i0];
+    imass[1] = 1.0/rmass[i1];
+    imass[2] = 1.0/rmass[i2];
+    imass[3] = 1.0/rmass[i3];
+  } else {
+    imass[0] = 1.0/mass[type[i0]];
+    imass[1] = 1.0/mass[type[i1]];
+    imass[2] = 1.0/mass[type[i2]];
+    imass[3] = 1.0/mass[type[i3]];
+  }
+
+  // setup matrix
+  a[0][0]   =   (imass[0] + imass[1])   * MathExtra::dot3(r01,r01);
+  a[0][1]   =   (imass[0]           )   * MathExtra::dot3(r01,r02);
+  a[0][2]   =   (imass[0]           )   * MathExtra::dot3(r01,r03);
+  a[1][0]   =   a[0][1];
+  a[1][1]   =   (imass[0] + imass[2])   * MathExtra::dot3(r02,r02);
+  a[1][2]   =   (imass[0]           )   * MathExtra::dot3(r02,r03);
+  a[2][0]   =   a[0][2];
+  a[2][1]   =   a[1][2];
+  a[2][2]   =   (imass[0] + imass[3])   * MathExtra::dot3(r03,r03);
+
+  // setup RHS
+  c[0]  = - MathExtra::dot3(vp01,r01);
+  c[1]  = - MathExtra::dot3(vp02,r02);
+  c[2]  = - MathExtra::dot3(vp03,r03);
+
+  // calculate the inverse 3x3 matrix exactly
+  solve3x3exactly(a,c,l);
+
+  // add corrections to the velocities if the process owns this atom
+  if (i0 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i0][k] -= imass[0] * (  l[0] * r01[k] + l[1] * r02[k] + l[2] * r03[k]);
+  } 
+  if (i1 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i1][k] -= imass[1] * (-l[0] * r01[k]); 
+  }
+  if (i2 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i2][k] -= imass[2] * ( -l[1] * r02[k]);
+  }
+  if (i3 < nlocal) {
+    for (int k=0; k<3; k++) 
+      v[i3][k] -= imass[3] * ( -l[2] * r03[k]);                 
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::solve2x2exactly(const double a[][2], 
+                                const double c[], double l[])
+{
+  double determ, determinv;
+
+  // calculate the determinant of the matrix
+  determ = a[0][0] * a[1][1] - a[0][1] * a[1][0];
+
+  // check if matrix is actually invertible
+  if (determ == 0.0) error->one(FLERR,"RATTLE determinant = 0.0");
+  determinv = 1.0/determ;
+
+  // Calcualte the solution:  (l01, l02)^T = A^(-1) * c
+  l[0] = determinv * ( a[1][1] * c[0]  - a[0][1] * c[1]);
+  l[1] = determinv * (-a[1][0] * c[0]  + a[0][0] * c[1]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::solve3x3exactly(const double a[][3], 
+                                const double c[], double l[])
+{
+  double ai[3][3];
+  double determ, determinv;
+  
+  // calculate the determinant of the matrix
+  determ = a[0][0]*a[1][1]*a[2][2] + a[0][1]*a[1][2]*a[2][0] + 
+    a[0][2]*a[1][0]*a[2][1] - a[0][0]*a[1][2]*a[2][1] - 
+    a[0][1]*a[1][0]*a[2][2] - a[0][2]*a[1][1]*a[2][0];
+  
+  // check if matrix is actually invertible
+  if (determ == 0.0) error->one(FLERR,"Rattle determinant = 0.0");
+
+  // calculate the inverse 3x3 matrix: A^(-1) = (ai_jk)
+  determinv = 1.0/determ;
+  ai[0][0] =  determinv * (a[1][1]*a[2][2] - a[1][2]*a[2][1]);
+  ai[0][1] = -determinv * (a[0][1]*a[2][2] - a[0][2]*a[2][1]); 
+  ai[0][2] =  determinv * (a[0][1]*a[1][2] - a[0][2]*a[1][1]);
+  ai[1][0] = -determinv * (a[1][0]*a[2][2] - a[1][2]*a[2][0]);
+  ai[1][1] =  determinv * (a[0][0]*a[2][2] - a[0][2]*a[2][0]);
+  ai[1][2] = -determinv * (a[0][0]*a[1][2] - a[0][2]*a[1][0]);
+  ai[2][0] =  determinv * (a[1][0]*a[2][1] - a[1][1]*a[2][0]);
+  ai[2][1] = -determinv * (a[0][0]*a[2][1] - a[0][1]*a[2][0]);
+  ai[2][2] =  determinv * (a[0][0]*a[1][1] - a[0][1]*a[1][0]);
+
+  // calculate the solution:  (l01, l02, l12)^T = A^(-1) * c
+  for (int i=0; i<3; i++) {
+    l[i] = 0;
+    for (int j=0; j<3; j++) 
+      l[i] += ai[i][j] * c[j];
+  }  
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::reset_dt()
+{
+  FixShake::reset_dt();
+  dtfv = 0.5 * update->dt * force->ftm2v;
+}
+
+/* ----------------------------------------------------------------------
+   carry out an unconstrained velocity update (vp is modified)
+------------------------------------------------------------------------- */
+
+void FixRattle::update_v_half_nocons()
+{
+  double dtfvinvm;
+  if (rmass) {
+    for (int i = 0; i < nlocal; i++) {
+      if (shake_flag[i]) {
+        dtfvinvm = dtfv / rmass[i];
+        for (int k=0; k<3; k++)
+          vp[i][k] = v[i][k] + dtfvinvm * f[i][k];
+      }
+      else
+        vp[i][0] = vp[i][1] = vp[i][2] = 0;
+    }
+  }
+  else {
+    for (int i = 0; i < nlocal; i++) {
+      dtfvinvm = dtfv/mass[type[i]];
+      if (shake_flag[i]) {
+        for (int k=0; k<3; k++)
+          vp[i][k] = v[i][k] + dtfvinvm * f[i][k];
+      }
+      else
+        vp[i][0] = vp[i][1] = vp[i][2] = 0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::update_v_half_nocons_respa(int ilevel)
+{
+  // select timestep for current level
+  dtfv = 0.5 * step_respa[ilevel] * force->ftm2v;
+
+  // carry out unconstrained velocity update
+  update_v_half_nocons();
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local atom-based arrays
+------------------------------------------------------------------------- */
+
+double FixRattle::memory_usage()
+{
+  int nmax = atom->nmax;
+  double bytes = FixShake::memory_usage();
+  bytes += nmax*3 * sizeof(double);
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+   allocate local atom-based arrays
+------------------------------------------------------------------------- */
+
+void FixRattle::grow_arrays(int nmax)
+{
+  FixShake::grow_arrays(nmax);
+  memory->destroy(vp);
+  memory->create(vp,nmax,3,"rattle:vp");
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixRattle::pack_forward_comm(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  m = 0;
+
+  switch (comm_mode) {
+    case XSHAKE:
+      m = FixShake::pack_forward_comm(n, list, buf, pbc_flag, pbc);
+      break;
+    case VP:
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = vp[j][0];
+        buf[m++] = vp[j][1];
+        buf[m++] = vp[j][2];
+      }
+      break;
+
+    case V:
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = v[j][0];
+        buf[m++] = v[j][1];
+        buf[m++] = v[j][2];
+      }
+      break;
+
+    case X:
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = x[j][0];
+        buf[m++] = x[j][1];
+        buf[m++] = x[j][2];
+      }
+      break;
+  }
+  return m;
+} 
+
+/* ---------------------------------------------------------------------- */
+
+void FixRattle::unpack_forward_comm(int n, int first, double *buf)
+{
+  int i, m, last;
+  m = 0;
+  last = first + n;
+
+  switch (comm_mode) {
+    case XSHAKE:
+      FixShake::unpack_forward_comm(n, first,buf);
+      break;
+
+    case VP:
+      for (i = first; i < last; i++) {
+        vp[i][0] = buf[m++];
+        vp[i][1] = buf[m++];
+        vp[i][2] = buf[m++];
+      }
+      break;
+
+    case V:
+      for (i = first; i < last; i++) {
+        v[i][0] = buf[m++];
+        v[i][1] = buf[m++];
+        v[i][2] = buf[m++];
+      }
+      break;
+
+    case X:
+      for (i = first; i < last; i++) {
+        x[i][0] = buf[m++];
+        x[i][1] = buf[m++];
+        x[i][2] = buf[m++];
+      }
+      break;
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   wrapper method for end_of_step fixes which modify the coordinates
+------------------------------------------------------------------------- */
+
+void FixRattle::coordinate_constraints_end_of_step() {
+  comm_mode = XSHAKE;
+  FixShake::coordinate_constraints_end_of_step();
+}
+
+
+/* ----------------------------------------------------------------------
+   DEBUGGING methods
+   The functions below allow you to check whether the 
+     coordinate and velocity constraints are satisfied at the
+     end of the timestep
+   only enabled if RATTLE_DEBUG is set to 1 at top of file
+   checkX tests if shakeX and vrattleX worked as expected
+------------------------------------------------------------------------- */
+
+void FixRattle::end_of_step()
+{
+  // communicate velocities and coordinates (x and v)
+  if (nprocs > 1) {
+    comm_mode = V;
+    comm->forward_comm_fix(this); 
+    comm_mode = X;
+    comm->forward_comm_fix(this);
+  }
+  if (!check_constraints(v, RATTLE_TEST_POS, RATTLE_TEST_VEL)) {
+    error->one(FLERR, "RATTLE failed!");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+bool FixRattle::check_constraints(double **v, bool checkr, bool checkv)
+{
+  int m;
+  bool ret = true;
+  int i=0;
+  while (i < nlist && ret) {
+    m = list[i];
+    if      (shake_flag[m] == 2)     ret =   check2(v,m,checkr,checkv); 
+    else if (shake_flag[m] == 3)     ret =   check3(v,m,checkr,checkv); 
+    else if (shake_flag[m] == 4)     ret =   check4(v,m,checkr,checkv); 
+    else                             ret =   check3angle(v,m,checkr,checkv);
+    i++;
+  }
+  return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
+bool FixRattle::check2(double **v, int m, bool checkr, bool checkv)
+{
+  bool      stat;
+  double    r01[3],v01[3];
+  const double tol = tolerance;
+  double bond1 = bond_distance[shake_type[m][0]];
+ 
+  tagint i0 = atom->map(shake_atom[m][0]);
+  tagint i1 = atom->map(shake_atom[m][1]);
+
+  MathExtra::sub3(x[i1],x[i0],r01);
+  domain->minimum_image(r01);
+  MathExtra::sub3(v[i1],v[i0],v01);
+
+  stat = !(checkr && (fabs(sqrt(MathExtra::dot3(r01,r01)) - bond1) > tol));
+  if (!stat)   
+     error->one(FLERR,"Coordinate constraints are not satisfied "
+                "up to desired tolerance");
+
+  stat = !(checkv && (fabs(MathExtra::dot3(r01,v01)) > tol));
+  if (!stat)   
+     error->one(FLERR,"Velocity constraints are not satisfied "
+                "up to desired tolerance");
+  return stat;
+}
+
+/* ---------------------------------------------------------------------- */
+
+bool FixRattle::check3(double **v, int m, bool checkr, bool checkv)
+{
+  bool      stat;
+  tagint    i0,i1,i2;
+  double    r01[3], r02[3], v01[3], v02[3];
+  const double tol = tolerance;
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+ 
+  i0 = atom->map(shake_atom[m][0]);
+  i1 = atom->map(shake_atom[m][1]);
+  i2 = atom->map(shake_atom[m][2]);
+
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+ 
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+
+  MathExtra::sub3(v[i1],v[i0],v01);
+  MathExtra::sub3(v[i2],v[i0],v02);
+
+  stat = !(checkr && (fabs(sqrt(MathExtra::dot3(r01,r01)) - bond1) > tol ||
+                      fabs(sqrt(MathExtra::dot3(r02,r02))-bond2) > tol));
+  if (!stat)   
+     error->one(FLERR,"Coordinate constraints are not satisfied "
+                "up to desired tolerance");
+
+  stat = !(checkv && (fabs(MathExtra::dot3(r01,v01)) > tol || 
+                      fabs(MathExtra::dot3(r02,v02)) > tol));
+  if (!stat)   
+     error->one(FLERR,"Velocity constraints are not satisfied "
+                "up to desired tolerance!");
+  return stat;
+}
+
+/* ---------------------------------------------------------------------- */
+
+bool FixRattle::check4(double **v, int m, bool checkr, bool checkv)
+{
+  bool stat = true;
+  const double tol = tolerance;
+  double r01[3], r02[3], r03[3], v01[3], v02[3], v03[3];
+
+  int i0 = atom->map(shake_atom[m][0]);
+  int i1 = atom->map(shake_atom[m][1]);
+  int i2 = atom->map(shake_atom[m][2]);
+  int i3 = atom->map(shake_atom[m][3]);
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+  double bond3 = bond_distance[shake_type[m][2]];
+
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+  MathExtra::sub3(x[i3],x[i0],r03);
+
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+  domain->minimum_image(r03);
+
+  MathExtra::sub3(v[i1],v[i0],v01);
+  MathExtra::sub3(v[i2],v[i0],v02);
+  MathExtra::sub3(v[i3],v[i0],v03);
+
+  stat = !(checkr && (fabs(sqrt(MathExtra::dot3(r01,r01)) - bond1) > tol || 
+                      fabs(sqrt(MathExtra::dot3(r02,r02))-bond2) > tol || 
+                      fabs(sqrt(MathExtra::dot3(r03,r03))-bond3) > tol));
+  if (!stat)   
+     error->one(FLERR,"Coordinate constraints are not satisfied "
+                "up to desired tolerance!");
+
+  stat = !(checkv && (fabs(MathExtra::dot3(r01,v01)) > tol || 
+                      fabs(MathExtra::dot3(r02,v02)) > tol || 
+                      fabs(MathExtra::dot3(r03,v03)) > tol));
+  if (!stat)   
+     error->one(FLERR,"Velocity constraints are not satisfied "
+                "up to desired tolerance!");
+  return stat;
+}
+
+/* ---------------------------------------------------------------------- */
+
+bool FixRattle::check3angle(double **v, int m, bool checkr, bool checkv)
+{
+  bool stat = true;
+  const double tol = tolerance;
+  double r01[3], r02[3], r12[3], v01[3], v02[3], v12[3];
+
+  int i0 = atom->map(shake_atom[m][0]);
+  int i1 = atom->map(shake_atom[m][1]);
+  int i2 = atom->map(shake_atom[m][2]);
+  double bond1 = bond_distance[shake_type[m][0]];
+  double bond2 = bond_distance[shake_type[m][1]];
+  double bond12 = angle_distance[shake_type[m][2]];
+
+  MathExtra::sub3(x[i1],x[i0],r01);
+  MathExtra::sub3(x[i2],x[i0],r02);
+  MathExtra::sub3(x[i2],x[i1],r12);
+
+  domain->minimum_image(r01);
+  domain->minimum_image(r02);
+  domain->minimum_image(r12);
+
+  MathExtra::sub3(v[i1],v[i0],v01);
+  MathExtra::sub3(v[i2],v[i0],v02);
+  MathExtra::sub3(v[i2],v[i1],v12);
+
+  stat = !(checkr && (fabs(sqrt(MathExtra::dot3(r01,r01)) - bond1) > tol || 
+                      fabs(sqrt(MathExtra::dot3(r02,r02))-bond2) > tol || 
+                      fabs(sqrt(MathExtra::dot3(r12,r12))-bond12) > tol));
+  if (!stat)   
+     error->one(FLERR,"Coordinate constraints are not satisfied "
+                "up to desired tolerance!");
+
+  stat = !(checkv && (fabs(MathExtra::dot3(r01,v01)) > tol || 
+                      fabs(MathExtra::dot3(r02,v02)) > tol || 
+                      fabs(MathExtra::dot3(r12,v12)) > tol));
+  if (!stat)   
+     error->one(FLERR,"Velocity constraints are not satisfied "
+                "up to desired tolerance!");
+  return stat;
+}
diff --git a/src/RIGID/fix_rattle.h b/src/RIGID/fix_rattle.h
new file mode 100644
index 0000000000..84d04caeec
--- /dev/null
+++ b/src/RIGID/fix_rattle.h
@@ -0,0 +1,74 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(rattle,FixRattle)
+
+#else
+
+#ifndef LMP_FIX_RATTLE_H
+#define LMP_FIX_RATTLE_H
+
+#include "fix.h"
+#include "fix_shake.h"
+
+namespace LAMMPS_NS {
+
+class FixRattle : public FixShake {
+ public:
+  double **vp;                // array for unconstrained velocities
+  double dtfv;                // timestep for velocity update
+  int comm_mode;              // mode for communication pack/unpack
+
+  FixRattle(class LAMMPS *, int, char **);
+  ~FixRattle();
+  int setmask();
+  void init();
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void final_integrate();
+  void final_integrate_respa(int,int);
+  void coordinate_constraints_end_of_step();
+
+  double memory_usage();
+  void grow_arrays(int);
+  int pack_forward_comm(int, int *, double *, int, int *);
+  void unpack_forward_comm(int, int, double *);
+  void reset_dt();
+
+ private:
+  void update_v_half_nocons();
+  void update_v_half_nocons_respa(int);
+
+  void vrattle2(int m);
+  void vrattle3(int m);
+  void vrattle4(int m);
+  void vrattle3angle(int m);
+  void solve3x3exactly(const double a[][3], const double c[], double l[]);
+  void solve2x2exactly(const double a[][2], const double c[], double l[]);
+
+  // debugging methosd
+
+  bool check3angle(double ** v, int m, bool checkr, bool checkv);
+  bool check2(double **v, int m, bool checkr, bool checkv);
+  bool check3(double **v, int m, bool checkr, bool checkv);
+  bool check4(double **v, int m, bool checkr, bool checkv);
+  bool check_constraints(double **v, bool checkr, bool checkv);
+  void end_of_step();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/RIGID/fix_shake.cpp b/src/RIGID/fix_shake.cpp
index 0dd6c85d5f..bf41b17a5b 100644
--- a/src/RIGID/fix_shake.cpp
+++ b/src/RIGID/fix_shake.cpp
@@ -205,6 +205,7 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) :
   // SHAKE vs RATTLE
 
   rattle = 0;
+  vflag_post_force = 0;
 
   // identify all SHAKE clusters
 
@@ -432,12 +433,14 @@ void FixShake::setup(int vflag)
   // half timestep constraint on pre-step, full timestep thereafter
 
   if (strstr(update->integrate_style,"verlet")) {
+    respa   = 0;   
     dtv     = update->dt;
     dtfsq   = 0.5 * update->dt * update->dt * force->ftm2v;
     FixShake::post_force(vflag);
     if (!rattle) dtfsq = update->dt * update->dt * force->ftm2v;
 
   } else {
+    respa  = 1;
     dtv = step_respa[0]; 
     dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
     dtf_inner = dtf_innerhalf;
@@ -564,6 +567,10 @@ void FixShake::post_force(int vflag)
     else if (shake_flag[m] == 4) shake4(m);
     else shake3angle(m);
   }
+  
+  // store vflag for coordinate_constraints_end_of_step()
+
+  vflag_post_force = vflag;
 }
 
 /* ----------------------------------------------------------------------
@@ -608,6 +615,10 @@ void FixShake::post_force_respa(int vflag, int ilevel, int iloop)
     else if (shake_flag[m] == 4) shake4(m);
     else shake3angle(m);
   }
+
+  // store vflag for coordinate_constraints_end_of_step()
+
+  vflag_post_force = vflag;
 }
 
 /* ----------------------------------------------------------------------
@@ -2665,3 +2676,29 @@ void *FixShake::extract(const char *str, int &dim)
   }
   return NULL;
 }
+
+
+
+/* ----------------------------------------------------------------------
+   wrapper method for end_of_step fixes which modify the coordinates
+------------------------------------------------------------------------- */
+
+void FixShake::coordinate_constraints_end_of_step() {
+  if (!respa) {
+    dtfsq   = 0.5 * update->dt * update->dt * force->ftm2v;
+    FixShake::post_force(vflag_post_force);
+    if (!rattle) dtfsq = update->dt * update->dt * force->ftm2v;
+  } 
+  else {
+    dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
+    dtf_inner = dtf_innerhalf;
+    // apply correction to all rRESPA levels
+    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
+      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
+      FixShake::post_force_respa(vflag_post_force,ilevel,loop_respa[ilevel]-1);
+      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
+    }
+    if (!rattle) dtf_inner = step_respa[0] * force->ftm2v;
+  }
+}
+
diff --git a/src/RIGID/fix_shake.h b/src/RIGID/fix_shake.h
index 74e09d12d8..049971ca4c 100644
--- a/src/RIGID/fix_shake.h
+++ b/src/RIGID/fix_shake.h
@@ -46,19 +46,22 @@ class FixShake : public Fix {
   virtual int unpack_exchange(int, double *);
   virtual int pack_forward_comm(int, int *, double *, int, int *);
   virtual void unpack_forward_comm(int, int, double *);
+  virtual void coordinate_constraints_end_of_step();
 
   int dof(int);
   virtual void reset_dt();
   void *extract(const char *, int &);
 
  protected:
+  int vflag_post_force;                  // store the vflag of last post_force call
+  int respa;                             // 0 = vel. Verlet, 1 = respa
   int me,nprocs;
   int rattle;                            // 0 = SHAKE, 1 = RATTLE
   double tolerance;                      // SHAKE tolerance
   int max_iter;                          // max # of SHAKE iterations
   int output_every;                      // SHAKE stat output every so often
   bigint next_output;                    // timestep for next output
-
+  
                                          // settings from input command
   int *bond_flag,*angle_flag;            // bond/angle types to constrain
   int *type_flag;                        // constrain bonds to these types
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index cd63199095..c0847a8bf1 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -177,7 +177,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   // set OpenMP threads
   // nomp is user setting, default = 0
   
-#if defined(_OPENMP)
+  #if defined(_OPENMP)
   if (nomp != 0) {
     omp_set_num_threads(nomp);
     comm->nthreads = nomp;
@@ -187,7 +187,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
     nthreads = omp_get_num_threads();
     comm->nthreads = nthreads;
   }
-#endif
+  #endif
 
   // set offload params
 
@@ -357,17 +357,23 @@ void FixIntel::pair_init_check()
     #endif
   }
 
+  int need_tag = 0;
+  if (atom->molecular) need_tag = 1;
+
   // Clear buffers used for pair style
   char kmode[80];
   if (_precision_mode == PREC_MODE_SINGLE) {
     strcpy(kmode, "single");
     get_single_buffers()->free_all_nbor_buffers();
+    get_single_buffers()->need_tag(need_tag);
   } else if (_precision_mode == PREC_MODE_MIXED) {
     strcpy(kmode, "mixed");
     get_mixed_buffers()->free_all_nbor_buffers();
+    get_mixed_buffers()->need_tag(need_tag);
   } else {
     strcpy(kmode, "double");
     get_double_buffers()->free_all_nbor_buffers();
+    get_double_buffers()->need_tag(need_tag);
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
@@ -730,16 +736,16 @@ int FixIntel::set_host_affinity(const int nomp)
     int pragma_size = 1024;
     buf1 = (float*) malloc(sizeof(float)*pragma_size);
 
-    #pragma offload target (mic:0) \
+    #pragma offload target (mic:0) mandatory \
       in(buf1:length(pragma_size) alloc_if(1) free_if(0))	\
       signal(&sig1)
-    {}
+    { buf1[0] = 0.0; }
     #pragma offload_wait target(mic:0) wait(&sig1)
 
-    #pragma offload target (mic:0) \
+    #pragma offload target (mic:0) mandatory \
       out(buf1:length(pragma_size) alloc_if(0) free_if(1))	\
       signal(&sig2)
-    {}
+    { buf1[0] = 1.0; }
     #pragma offload_wait target(mic:0) wait(&sig2)
     free(buf1);
 
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index f7d985b704..cbb0051594 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -366,11 +366,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
       lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
 	out_offset;
       if (eatom) {
+	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[ii].x;
           f[i].y += f_in[ii].y;
           f[i].z += f_in[ii].z;
-          force->pair->eatom[i] += f_in[ii].w;
+          lmp_eatom[i] += f_in[ii].w;
           tor[i].x += f_in[ii+1].x;
           tor[i].y += f_in[ii+1].y;
           tor[i].z += f_in[ii+1].z;
@@ -389,11 +390,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
       }
     } else {
       if (eatom) {
+	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[i].x;
           f[i].y += f_in[i].y;
           f[i].z += f_in[i].z;
-          force->pair->eatom[i] += f_in[i].w;
+          lmp_eatom[i] += f_in[i].w;
         }
       } else {
         for (int i = ifrom; i < ito; i++) {
@@ -478,7 +480,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
 
   start_watch(TIME_OFFLOAD_WAIT);
   #ifdef _LMP_INTEL_OFFLOAD
-  #pragma offload_wait target(mic:_cop) wait(f_in)
+  if (neighbor->ago == 0) {
+    #pragma offload_wait target(mic:_cop) wait(atom->tag, f_in)
+  } else {
+    #pragma offload_wait target(mic:_cop) wait(f_in)
+  }
   #endif
   double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
 
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index 2f9a39d2f2..e430a1ebc4 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -27,6 +27,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
   _list_alloc_atoms = 0;
   _ntypes = 0;
   _off_map_maxlocal = 0;
+  _ccachex = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   _separate_buffers = 0;
   _off_f = 0;
@@ -35,6 +36,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
   _off_map_maxhead = 0;
   _off_list_alloc = false;
   _off_threads = 0;
+  _off_ccache = 0;
   #endif
 }
 
@@ -45,6 +47,7 @@ IntelBuffers<flt_t, acc_t>::~IntelBuffers()
 {
   free_buffers();
   free_all_nbor_buffers();
+  free_ccache();
   set_ntypes(0);
 }
 
@@ -196,14 +199,16 @@ void IntelBuffers<flt_t, acc_t>::_grow_nmax()
     nspecial = lmp->atom->nspecial[0];
     special_length = size * lmp->atom->maxspecial;
     nspecial_length = size * 3;
-    tag_length = size;
   } else {
     special = &_special_holder;
     nspecial = &_nspecial_holder;
     special_length = 1;
     nspecial_length = 1;
-    tag_length = 1;
   }
+  if (_need_tag)
+    tag_length = size;
+  else
+    tag_length = 1;
   int *tag = lmp->atom->tag;
   int *bins = lmp->neighbor->bins;
   #pragma offload_transfer target(mic:_cop) \
@@ -337,8 +342,8 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list, 
-						 const int nlocal,
-						 const int offload_end)
+                                                 const int nlocal,
+                                                 const int offload_end)
 {
   free_nbor_list();
   _list_alloc_atoms = 1.10 * nlocal;
@@ -353,8 +358,8 @@ void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
     if (special_flag != NULL && list_alloc != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         in(special_flag:length(4) alloc_if(1) free_if(0)) \
-	in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
-	nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
+        in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
+        nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
       _off_map_stencil = stencil;
       _off_list_alloc = true;
     }
@@ -378,6 +383,86 @@ void IntelBuffers<flt_t, acc_t>::_grow_stencil(NeighList *list)
 
 /* ---------------------------------------------------------------------- */
 
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_ccache()
+{
+  if (_ccachex) {
+    flt_t *ccachex = _ccachex;
+    flt_t *ccachey = _ccachey;
+    flt_t *ccachez = _ccachez;
+    flt_t *ccachew = _ccachew;
+    int *ccachei = _ccachei;
+    int *ccachej = _ccachej;
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_ccache) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \
+        nocopy(ccachei,ccachej:alloc_if(0) free_if(1))
+    }
+    _off_ccache = 0;
+    #endif
+
+    lmp->memory->destroy(ccachex);
+    lmp->memory->destroy(ccachey);
+    lmp->memory->destroy(ccachez);
+    lmp->memory->destroy(ccachew);
+    lmp->memory->destroy(ccachei);
+    lmp->memory->destroy(ccachej);
+
+    _ccachex = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
+                                           const int nthreads)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_ccachex && off_flag && _off_ccache == 0)
+    free_ccache();
+  #endif
+  if (_ccachex)
+    return;
+
+  const int nsize = get_max_nbors();
+  int esize = MIN(sizeof(int), sizeof(flt_t));
+  IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
+  int nt = MAX(nthreads, _off_threads);
+  const int vsize = _ccache_stride * nt;
+
+  lmp->memory->create(_ccachex, vsize , "_ccachex");
+  lmp->memory->create(_ccachey, vsize, "_ccachey");
+  lmp->memory->create(_ccachez, vsize, "_ccachez");
+  lmp->memory->create(_ccachew, vsize, "_ccachew");
+  lmp->memory->create(_ccachei, vsize, "_ccachei");
+  lmp->memory->create(_ccachej, vsize, "_ccachej");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_flag) {
+    flt_t *ccachex = _ccachex;
+    flt_t *ccachey = _ccachey;
+    flt_t *ccachez = _ccachez;
+    flt_t *ccachew = _ccachew;
+    int *ccachei = _ccachei;
+    int *ccachej = _ccachej;
+
+    if (ccachex != NULL && ccachey !=NULL && ccachez != NULL && 
+	ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
+        nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
+        nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
+    }
+    _off_ccache = 1;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
 {
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 19f10f9559..91dbb790ca 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -49,7 +49,14 @@ class IntelBuffers {
   inline int get_stride(int nall) {
     int stride;
     IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t), 
-			 lmp->atom->torque);
+                         lmp->atom->torque);
+    return stride;
+  }
+
+  template <class stype>
+  inline int get_scalar_stride(const int n) {
+    int stride;
+    IP_PRE_get_stride(stride, n, sizeof(stype), 0);
     return stride;
   }
 
@@ -103,6 +110,16 @@ class IntelBuffers {
     #endif
   }
 
+  void free_ccache();
+  void grow_ccache(const int off_flag, const int nthreads);
+  inline int ccache_stride() { return _ccache_stride; }
+  inline flt_t * get_ccachex() { return _ccachex; }
+  inline flt_t * get_ccachey() { return _ccachey; }
+  inline flt_t * get_ccachez() { return _ccachez; }
+  inline flt_t * get_ccachew() { return _ccachew; }
+  inline int * get_ccachei() { return _ccachei; }
+  inline int * get_ccachej() { return _ccachej; }
+
   inline int get_max_nbors() {
     int mn = lmp->neighbor->oneatom * sizeof(int) /
         (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
@@ -232,6 +249,12 @@ class IntelBuffers {
 	     used_ghost * sizeof(flt_t));
     }
   }
+
+  inline int need_tag() { return _need_tag; }
+  inline void need_tag(const int nt) { _need_tag = nt; }
+  #else
+  inline int need_tag() { return 0; }
+  inline void need_tag(const int nt) { }
   #endif
 
   double memory_usage(const int nthreads);
@@ -255,17 +278,22 @@ class IntelBuffers {
   flt_t **_cutneighsq;
   int _ntypes;
 
+  int _ccache_stride;
+  flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
+  int *_ccachei, *_ccachej;
+
   #ifdef _LMP_INTEL_OFFLOAD
   int _separate_buffers;
   atom_t *_host_x;
   flt_t *_host_q;
   quat_t *_host_quat;
   vec3_acc_t *_off_f;
-  int _off_map_nmax, _off_map_maxhead, _cop;
+  int _off_map_nmax, _off_map_maxhead, _cop, _off_ccache;
   int *_off_map_ilist;
   int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
   int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
   bool _off_list_alloc;
+  int _need_tag;
   #endif
   
   int _buf_size, _buf_local_size;
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 9dfb3a20e0..44534e1324 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -247,7 +247,7 @@ inline double MIC_Wtime() {
       nall = nlocal + nghost;						\
       separate_flag--;							\
       int flength;							\
-      if (NEWTON_PAIR) flength = nall;					\
+      if (newton) flength = nall;					\
       else flength = nlocal;						\
       IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		\
 			   separate_flag);				\
@@ -302,16 +302,40 @@ inline double MIC_Wtime() {
 
 #endif
 
-#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz)	\
-{									\
-  if (vflag == 1) {							\
-    sv0 += ev_pre * delx * delx * fpair;				\
-    sv1 += ev_pre * dely * dely * fpair;				\
-    sv2 += ev_pre * delz * delz * fpair;				\
-    sv3 += ev_pre * delx * dely * fpair;				\
-    sv4 += ev_pre * delx * delz * fpair;				\
-    sv5 += ev_pre * dely * delz * fpair;				\
-  }									\
+#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz)    \
+{                                                                       \
+  if (vflag == 1) {                                                     \
+    sv0 += ev_pre * delx * delx * fpair;                                \
+    sv1 += ev_pre * dely * dely * fpair;                                \
+    sv2 += ev_pre * delz * delz * fpair;                                \
+    sv3 += ev_pre * delx * dely * fpair;                                \
+    sv4 += ev_pre * delx * delz * fpair;                                \
+    sv5 += ev_pre * dely * delz * fpair;                                \
+  }                                                                     \
+}
+
+#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2)	\
+{                                                                       \
+  if (vflag == 1) {                                                     \
+    sv0 += delx * fj[0] + delr2[0] * fk[0];                             \
+    sv1 += dely * fj[1] + delr2[1] * fk[1];				\
+    sv2 += delz * fj[2] + delr2[2] * fk[2];				\
+    sv3 += delx * fj[1] + delr2[0] * fk[1];				\
+    sv4 += delx * fj[2] + delr2[0] * fk[2];				\
+    sv5 += dely * fj[2] + delr2[1] * fk[2];				\
+  }                                                                     \
+}
+
+#define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz)  \
+{                                                                       \
+  if (vflag == 1) {                                                     \
+    sv0 += delx * fj0;							\
+    sv1 += dely * fj1;							\
+    sv2 += delz * fj2;							\
+    sv3 += delx * fj1;							\
+    sv4 += delx * fj2;							\
+    sv5 += dely * fj2;							\
+  }                                                                     \
 }
 
 #define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp
index f4ca6b3a1e..6c3cfc1961 100644
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@@ -203,16 +203,15 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
   const int molecular = atom->molecular;
   int *ns = NULL;
   tagint *s = NULL;
-  int tag_size, special_size;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = nall;
   if (molecular) {
     s = atom->special[0];
     ns = atom->nspecial[0];
-    tag_size = nall;
     special_size = aend;
   } else {
     s = &buffers->_special_holder;
     ns = &buffers->_nspecial_holder;
-    tag_size = 0;
     special_size = 0;
   }
   const tagint * _noalias const special = s;
@@ -286,7 +285,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
     in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(numneigh)
+    signal(tag)
   #endif
   {
     #ifdef __MIC__
@@ -604,16 +603,15 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
   const int molecular = atom->molecular;
   int *ns = NULL;
   tagint *s = NULL;
-  int tag_size, special_size;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
   if (molecular) {
     s = atom->special[0];
     ns = atom->nspecial[0];
-    tag_size = e_nall;
     special_size = aend;
   } else {
     s = &buffers->_special_holder;
     ns = &buffers->_nspecial_holder;
-    tag_size = 0;
     special_size = 0;
   }
   const tagint * _noalias const special = s;
@@ -687,7 +685,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
     in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(numneigh)
+    signal(tag)
   #endif
   {
     #ifdef __MIC__
@@ -1049,16 +1047,15 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
   const int molecular = atom->molecular;
   int *ns = NULL;
   tagint *s = NULL;
-  int tag_size, special_size;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
   if (molecular) {
     s = atom->special[0];
     ns = atom->nspecial[0];
-    tag_size = e_nall;
     special_size = aend;
   } else {
     s = &buffers->_special_holder;
     ns = &buffers->_nspecial_holder;
-    tag_size = 0;
     special_size = 0;
   }
   const tagint * _noalias const special = s;
@@ -1132,7 +1129,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
     in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(numneigh)
+    signal(tag)
   #endif
   {
     #ifdef __MIC__
@@ -1344,3 +1341,417 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
     #endif
   }
 }
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_bin_intel(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nlocal;
+  list->gnum = 0;
+
+  // Get fix for intel stuff
+  FixIntel *fix = static_cast<FixIntel *>(fix_intel);
+
+  const int off_end = fix->offload_end_neighbor();
+  int host_start = fix->host_start_neighbor();;
+  int offload_noghost = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->full_host_list()) host_start = 0;
+  offload_noghost = fix->offload_noghost();
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    if (offload_noghost) {
+      fbi<float,double,1>(1, list, fix->get_mixed_buffers(),
+                          0, off_end, fix);
+      fbi<float,double,1>(0, list, fix->get_mixed_buffers(),
+                          host_start, nlocal, fix, off_end);
+    } else {
+      fbi<float,double,0>(1, list, fix->get_mixed_buffers(),
+                          0, off_end, fix);
+      fbi<float,double,0>(0, list, fix->get_mixed_buffers(),
+                          host_start, nlocal, fix);
+    }
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    if (offload_noghost) {
+      fbi<double,double,1>(1, list, fix->get_double_buffers(),
+                           0, off_end, fix);
+      fbi<double,double,1>(0, list, fix->get_double_buffers(),
+                           host_start, nlocal, fix, off_end);
+    } else {
+      fbi<double,double,0>(1, list, fix->get_double_buffers(),
+                           0, off_end, fix);
+      fbi<double,double,0>(0, list, fix->get_double_buffers(),
+                           host_start, nlocal, fix);
+    }
+  } else {
+    if (offload_noghost) {
+      fbi<float,float,1>(1, list, fix->get_single_buffers(), 0, off_end, fix);
+      fbi<float,float,1>(0, list, fix->get_single_buffers(),
+                         host_start, nlocal, fix, off_end);
+    } else {
+      fbi<float,float,0>(1, list, fix->get_single_buffers(), 0, off_end, fix);
+      fbi<float,float,0>(0, list, fix->get_single_buffers(),
+                         host_start, nlocal, fix);
+    }
+  }
+}
+
+template <class flt_t, class acc_t, int offload_noghost>
+void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
+                    const int astart, const int aend, void *fix_in,
+                    const int offload_end) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall]=biga;
+
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
+                                sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x(), buffers->get_atombin());
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+  }
+  const int pad_width = pad;
+
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
+  int nall_t = nall;
+  if (offload_noghost && offload) nall_t = atom->nlocal;
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL;
+  tagint *s = NULL;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    special_size = 0;
+  }
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const tagint * _noalias const tag = atom->tag;
+
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * _noalias const stencil = list->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  tagint * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+  int * _noalias const atombin = buffers->get_atombin();
+
+  // Make sure dummy coordinates to eliminate loop remainder not within cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi[0]);
+    const flt_t dy = (INTEL_BIGP - bboxhi[1]);
+    const flt_t dz = (INTEL_BIGP - bboxhi[2]);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+        "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const special_flag = this->special_flag;
+  const int * _noalias const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(atombin:length(aend) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(tag)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh, overflow)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
+      #endif
+
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        n = 0;
+        n2 = maxnbors;
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        const tagint itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        // loop over all atoms in surrounding bins in stencil including self
+        // skip i = j
+
+        ibin = atombin[i];
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            if (i == j) continue;
+
+            if (offload_noghost) {
+              if (j < nlocal) {
+                if (i < offload_end) continue;
+              } else if (offload) continue;
+            }
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+	      const int jtag = tag[j];
+              int flist = 0;
+	      if (itag > jtag) {
+		if ((itag+jtag) % 2 == 0) flist = 1;
+	      } else if (itag < jtag) {
+		if ((itag+jtag) % 2 == 1) flist = 1;
+	      } else {
+		if (x[j].z < ztmp) flist = 1;
+		else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
+		else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp) 
+		  flist = 1;
+	      }
+	      if (flist)
+		neighptr[n2++] = j;
+	      else
+		neighptr[n++] = j;
+	      
+              #ifdef _LMP_INTEL_OFFLOAD
+	      if (j < nlocal) {
+                if (j < lmin) lmin = j;
+                if (j > lmax) lmax = j;
+              } else {
+                if (j < gmin) gmin = j;
+                if (j > gmax) gmax = j;
+              }
+              #endif
+            }
+          }
+        }
+        ilist[i] = i;
+
+        cnumneigh[i] = ct;
+        if (n > maxnbors) *overflow = 1;
+	atombin[i] = n;
+        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        numneigh[i] = n;
+        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        neighptr += n;
+        if (ct + n + maxnbors > list_size) {
+          *overflow = 1;
+          ct = (ifrom + tid) * maxnbors;
+        }
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+      }
+
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      }
+      #endif
+
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            ofind_special(which, special, nspecial, i, tag[j],
+                          special_flag);
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j >= nlocal) {
+              if (j == e_nall)
+                jlist[jj] = nall_offset;
+              else if (which > 0)
+                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+              else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          int jj = 0;
+          for (jj = 0; jj < jnum; jj++) {
+            if (jlist[jj] >= nlocal) {
+	      if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	      else jlist[jj] -= ghost_offset;
+	    }
+          }
+        }
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+                                    fix->host_used_local(),
+                                    fix->host_min_ghost(),
+                                    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
new file mode 100755
index 0000000000..ebd626b5f7
--- /dev/null
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -0,0 +1,703 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_sw_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "force.h"
+#include "comm.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "memory.h"
+#include "error.h"
+#include "modify.h"
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+
+#define FC_PACKED0_T typename ForceConst<flt_t>::fc_packed0
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
+#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
+
+#define MAXLINE 1024
+#define DELTA 4
+
+/* ---------------------------------------------------------------------- */
+
+PairSWIntel::PairSWIntel(LAMMPS *lmp) : PairSW(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairSWIntel::~PairSWIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSWIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairSWIntel::compute(int eflag, int vflag,
+                          IntelBuffers<flt_t,acc_t> *buffers,
+                          const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag, vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, ago);
+    }
+
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (_spq) {
+    if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+      } else {
+	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+      }
+    } else {
+      eval<1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+      eval<1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+    }
+  } else {
+   if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+      } else {
+	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+      }
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
+void PairSWIntel::eval(const int offload, const int vflag,
+                       IntelBuffers<flt_t,acc_t> *buffers,
+                       const ForceConst<flt_t> &fc, const int astart, 
+		       const int aend, const int pad_width)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const numneighhalf = buffers->get_atombin();
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+
+  const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
+  const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
+  const FC_PACKED2_T * _noalias const p2e = fc.p2e[0];
+  const FC_PACKED3_T * _noalias const p3 = fc.p3[0][0];
+
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  double *timer_compute = fix->off_watch_pair();
+  int *overflow = fix->get_off_overflow_flag();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(p2,p2f,p2e,p3:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
+    in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      // loop over full neighbor list of my atoms
+
+      for (int i = iifrom; i < iito; ++i) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+
+        const int ptr_off = itype * ntypes;
+        const FC_PACKED0_T * _noalias const p2i = p2 + ptr_off;
+        const FC_PACKED1_T * _noalias const p2fi = p2f + ptr_off;
+        const FC_PACKED2_T * _noalias const p2ei = p2e + ptr_off;
+        const FC_PACKED3_T * _noalias const p3i = p3 + ptr_off*ntypes;
+
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+	const int jnumhalf = numneighhalf[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+        fxtmp = fytmp = fztmp = (acc_t)0.0;
+        if (EVFLAG) {
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        }
+
+	int ejnum = 0, ejnumhalf = 0;
+        for (int jj = 0; jj < jnum; jj++) {
+          int j = jlist[jj];
+          j &= NEIGHMASK;
+          const flt_t delx = x[j].x - xtmp;
+          const flt_t dely = x[j].y - ytmp;
+          const flt_t delz = x[j].z - ztmp;
+          const int jtype = x[j].w;
+          const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
+          if (rsq1 < p2i[jtype].cutsq) {
+	    tdelx[ejnum] = delx;
+	    tdely[ejnum] = dely;
+	    tdelz[ejnum] = delz;
+	    trsq[ejnum] = rsq1;
+	    tj[ejnum] = j;
+	    tjtype[ejnum] = jtype;
+	    ejnum++;
+	    if (jj < jnumhalf) ejnumhalf++;
+	  }
+	}
+	int ejnum_pad = ejnum;
+	while ( (ejnum_pad % pad_width) != 0) {
+	  tdelx[ejnum_pad] = (flt_t)0.0;
+	  tdely[ejnum_pad] = (flt_t)0.0;
+	  tdelz[ejnum_pad] = (flt_t)0.0;
+	  trsq[ejnum_pad] = (flt_t)1.0;
+	  tj[ejnum_pad] = nall;
+	  tjtype[ejnum_pad] = 0;
+	  ejnum_pad++;
+	}
+
+        #if defined(__INTEL_COMPILER)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+				 sv0, sv1, sv2, sv3, sv4, sv5)
+	#endif
+        for (int jj = 0; jj < ejnum_pad; jj++) {
+          acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
+          fjxtmp = fjytmp = fjztmp = (acc_t)0.0;
+          if (EFLAG) fjtmp = (acc_t)0.0;
+
+          const flt_t delx = tdelx[jj];
+          const flt_t dely = tdely[jj];
+          const flt_t delz = tdelz[jj];
+          const int jtype = tjtype[jj];
+          const flt_t rsq1 = trsq[jj];
+
+          const flt_t r1 = sqrt(rsq1); 
+          const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
+          const flt_t rainv1 = (flt_t)1.0 / (r1 - p2fi[jtype].cut);
+	  
+	  // two-body interactions, skip half of them
+	  flt_t rp, rq;
+	  if (SPQ == 1) {
+	    rp = r1 * r1;
+	    rp *= rp;
+	    rp = (flt_t)1.0 / rp;
+	    rq = (flt_t)1.0;
+	  } else {
+	    rp = pow(r1, -p2fi[jtype].powerp);
+	    rq = pow(r1, -p2fi[jtype].powerq);
+	  }
+	  const flt_t rainvsq = rainv1 * rainv1 * r1;
+	  flt_t expsrainv = exp(p2fi[jtype].sigma * rainv1);
+	  if (jj >= ejnumhalf) expsrainv = (flt_t)0.0;
+	  const flt_t fpair = (p2fi[jtype].c1 * rp - p2fi[jtype].c2 * rq +
+			       (p2fi[jtype].c3 * rp -p2fi[jtype].c4 * rq) * 
+			       rainvsq) * expsrainv * rinvsq1;
+
+	  fxtmp -= delx * fpair;
+	  fytmp -= dely * fpair;
+	  fztmp -= delz * fpair;
+	  fjxtmp += delx * fpair;
+	  fjytmp += dely * fpair;
+	  fjztmp += delz * fpair;
+
+	  if (EVFLAG) {
+	    if (EFLAG) {
+	      flt_t evdwl;
+	      evdwl = (p2ei[jtype].c5 * rp - p2ei[jtype].c6 * rq) * 
+	    	      expsrainv;
+	      sevdwl += evdwl;
+	      if (eatom) {
+		fwtmp += (acc_t)0.5 * evdwl;
+		fjtmp += (acc_t)0.5 * evdwl;
+	      }
+	    }
+	    IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair,
+				 -delx, -dely, -delz);
+	  }
+
+	  /*---------------------------------------------*/
+
+          flt_t gsrainv1 = p2i[jtype].sigma_gamma * rainv1;
+          flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
+          flt_t expgsrainv1 = exp(gsrainv1);
+	  const int joffset = jtype * ntypes;
+
+          for (int kk = 0; kk < ejnum; kk++) {
+            flt_t delr2[3];
+            delr2[0] = tdelx[kk];
+            delr2[1] = tdely[kk];
+            delr2[2] = tdelz[kk];
+	    const int ktype = tjtype[kk];
+            const flt_t rsq2 = trsq[kk];
+
+	    const flt_t r2 = sqrt(rsq2);
+	    const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
+	    const flt_t rainv2 = (flt_t)1.0 / (r2 - p2i[ktype].cut);
+	    const flt_t gsrainv2 = p2i[ktype].sigma_gamma * rainv2;
+	    const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
+	    const flt_t expgsrainv2 = exp(gsrainv2);
+
+	    const flt_t rinv12 = (flt_t)1.0 / (r1 * r2);
+	    const flt_t cs = (delx * delr2[0] + dely * delr2[1] + 
+                              delz * delr2[2]) * rinv12;
+	    const flt_t delcs = cs - p3i[joffset + ktype].costheta;
+	    const flt_t delcssq = delcs*delcs;
+
+	    flt_t kfactor;
+	    if (jj == kk) kfactor = (flt_t)0.0;
+	    else kfactor = (flt_t)1.0;
+
+	    const flt_t facexp = expgsrainv1*expgsrainv2*kfactor;
+	    const flt_t facrad = p3i[joffset + ktype].lambda_epsilon * 
+	                         facexp * delcssq;
+	    const flt_t frad1 = facrad*gsrainvsq1;
+	    const flt_t frad2 = facrad*gsrainvsq2;
+	    const flt_t facang = p3i[joffset + ktype].lambda_epsilon2 * 
+	                         facexp * delcs;
+	    const flt_t facang12 = rinv12*facang;
+	    const flt_t csfacang = cs*facang;
+	    const flt_t csfac1 = rinvsq1*csfacang;
+
+	    const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12;
+	    const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12;
+	    const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12;
+
+	    fxtmp -= fjx;
+	    fytmp -= fjy;
+	    fztmp -= fjz;
+	    fjxtmp += fjx;
+	    fjytmp += fjy;
+	    fjztmp += fjz;
+	      
+	    if (EVFLAG) {
+	      if (EFLAG) {
+	        const flt_t evdwl = facrad * (flt_t)0.5;
+		sevdwl += evdwl;
+		if (eatom) {
+		  fwtmp += (acc_t)0.33333333 * evdwl;
+		  fjtmp += (acc_t)0.33333333 * facrad;
+		}
+	      }
+	      IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz, 
+				     delx, dely, delz);
+	    }
+	  } // for kk
+	  const int j = tj[jj];
+          f[j].x += fjxtmp;
+          f[j].y += fjytmp;
+          f[j].z += fjztmp;
+          if (EFLAG)
+	    if (eatom) f[j].w += fjtmp;
+        } // for jj
+	
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall,
+                             nlocal, minlocal, nthreads, f_start, f_stride,
+                             x);
+    } // end omp
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSWIntel::allocate()
+{
+  PairSW::allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSWIntel::init_style()
+{
+  PairSW::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+  map[0] = map[1];
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+    fix->get_mixed_buffers()->need_tag(1);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    pack_force_const(force_const_double, fix->get_double_buffers());
+    fix->get_double_buffers()->need_tag(1);
+  } else {
+    pack_force_const(force_const_single, fix->get_single_buffers());
+    fix->get_single_buffers()->need_tag(1);
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_noghost())
+    error->all(FLERR,"The 'ghost no' option cannot be used with sw/intel.");
+  #endif
+
+  #if defined(__INTEL_COMPILER)
+  if (__INTEL_COMPILER_BUILD_DATE < 20141023)
+    error->all(FLERR, "Intel compiler versions before "
+	       "15 Update 1 not supported for sw/intel");
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                   IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads);
+  _ccache_stride = buffers->ccache_stride();
+
+  int tp1 = atom->ntypes + 1;
+  fc.set_ntypes(tp1,memory,_cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  _spq = 1;
+  for (int ii = 0; ii < tp1; ii++) {
+    int i = map[ii];
+    for (int jj = 0; jj < tp1; jj++) {
+      int j = map[jj];
+      if (i < 0 || j < 0 || ii == 0 || jj == 0) {
+	fc.p2[ii][jj].cutsq = 0;
+	fc.p2[ii][jj].cut = 0;
+	fc.p2[ii][jj].sigma_gamma = 0;
+	fc.p2f[ii][jj].cut = 0;
+	fc.p2f[ii][jj].powerp = 0;
+	fc.p2f[ii][jj].powerq = 0;
+	fc.p2f[ii][jj].sigma = 0;
+	fc.p2f[ii][jj].c1 = 0;
+	fc.p2f[ii][jj].c2 = 0;
+	fc.p2f[ii][jj].c3 = 0;
+	fc.p2f[ii][jj].c4 = 0;
+	fc.p2e[ii][jj].c5 = 0;
+	fc.p2e[ii][jj].c6 = 0;
+      } else {
+	int ijparam = elem2param[i][j][j];
+	fc.p2[ii][jj].cutsq = params[ijparam].cutsq;
+	fc.p2[ii][jj].cut = params[ijparam].cut;
+	fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma;
+	fc.p2f[ii][jj].cut = params[ijparam].cut;
+	fc.p2f[ii][jj].powerp = params[ijparam].powerp;
+	fc.p2f[ii][jj].powerq = params[ijparam].powerq;
+	fc.p2f[ii][jj].sigma = params[ijparam].sigma;
+	fc.p2f[ii][jj].c1 = params[ijparam].c1;
+	fc.p2f[ii][jj].c2 = params[ijparam].c2;
+	fc.p2f[ii][jj].c3 = params[ijparam].c3;
+	fc.p2f[ii][jj].c4 = params[ijparam].c4;
+	fc.p2e[ii][jj].c5 = params[ijparam].c5;
+	fc.p2e[ii][jj].c6 = params[ijparam].c6;
+
+	double cutcut = params[ijparam].cut * params[ijparam].cut;
+	if (params[ijparam].cutsq >= cutcut)
+	  fc.p2[ii][jj].cutsq *= 0.98;
+
+	if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0)
+	  _spq = 0;
+      }
+
+      for (int kk = 0; kk < tp1; kk++) {
+        int k = map[kk];
+	if (i < 0 || j < 0 || k < 0  || ii == 0 || jj == 0 || kk == 0) {
+	  fc.p3[ii][jj][kk].costheta = 0;
+	  fc.p3[ii][jj][kk].lambda_epsilon = 0;
+	  fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
+	} else {
+	  int ijkparam = elem2param[i][j][k];
+	  fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
+	  fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
+	  fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2;
+	}
+      }
+    }
+  }
+
+  _host_pad = 1;
+  _offload_pad = 1;
+
+  if (INTEL_NBOR_PAD > 1)
+    _host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  FC_PACKED0_T *op2 = fc.p2[0];
+  FC_PACKED1_T *op2f = fc.p2f[0];
+  FC_PACKED2_T *op2e = fc.p2e[0];
+  FC_PACKED3_T *op3 = fc.p3[0][0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  int tp1cu = tp1sq * tp1;
+  if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
+      ocutneighsq != NULL) {
+    #pragma offload_transfer target(mic:_cop) \
+      in(op2,op2f,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
+      in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
+      in(ocutneighsq: length(tp1sq))
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                Memory *memory,
+                                                const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      fc_packed0 *op2 = p2[0];
+      fc_packed1 *op2f = p2f[0];
+      fc_packed2 *op2e = p2e[0];
+      fc_packed3 *op3 = p3[0][0];
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
+          _cop >= 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(op2, op2f, op2e, op3: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      _memory->destroy(op2);
+      _memory->destroy(op2f);
+      _memory->destroy(op2e);
+      _memory->destroy(op3);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(p2,ntypes,ntypes,"fc.p2");
+      memory->create(p2f,ntypes,ntypes,"fc.p2f");
+      memory->create(p2e,ntypes,ntypes,"fc.p2e");
+      memory->create(p3,ntypes,ntypes,ntypes,"fc.p3");
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      fc_packed0 *op2 = p2[0];
+      fc_packed1 *op2f = p2f[0];
+      fc_packed2 *op2e = p2e[0];
+      fc_packed3 *op3 = p3[0][0];
+      int tp1sq = ntypes * ntypes;
+      int tp1cu = tp1sq * ntypes;
+      if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
+          cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(op2,op2f,op2e: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(op3: length(tp1cu) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h
new file mode 100755
index 0000000000..5bd54c5f49
--- /dev/null
+++ b/src/USER-INTEL/pair_sw_intel.h
@@ -0,0 +1,111 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(sw/intel,PairSWIntel)
+
+#else
+
+#ifndef LMP_PAIR_SW_INTEL_H
+#define LMP_PAIR_SW_INTEL_H
+
+#include "pair_sw.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairSWIntel : public PairSW {
+ public:
+  PairSWIntel(class LAMMPS *);
+  virtual ~PairSWIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+
+ protected:
+  FixIntel *fix;
+  int _cop;
+  template <class flt_t> class ForceConst;
+
+  virtual void allocate();
+
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, 
+	    const int astart, const int aend, const int pad_width);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  int _ccache_stride, _host_pad, _offload_pad, _spq;
+
+  // ----------------------------------------------------------------------
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct {
+      flt_t cutsq, cut, sigma_gamma, pad;
+    } fc_packed0;
+    typedef struct {
+      flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
+    } fc_packed1;
+    typedef struct {
+      flt_t c5, c6;
+    } fc_packed2;
+    typedef struct {
+      flt_t costheta, lambda_epsilon, lambda_epsilon2, pad;
+    } fc_packed3;
+
+    fc_packed0 **p2;
+    fc_packed1 **p2f;
+    fc_packed2 **p2e;
+    fc_packed3 ***p3;
+
+    ForceConst() : _ntypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: The 'ghost no' option cannot be used with sw/intel.
+
+Self-explanatory.
+
+E: Intel compiler versions before 15 Update 1 not supported for sw/intel.
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-MISC/fix_ttm_mod.cpp b/src/USER-MISC/fix_ttm_mod.cpp
index dcbca2debc..d66e49d6a2 100644
--- a/src/USER-MISC/fix_ttm_mod.cpp
+++ b/src/USER-MISC/fix_ttm_mod.cpp
@@ -33,6 +33,7 @@
 #include "random_mars.h"
 #include "memory.h"
 #include "error.h"
+#include "citeme.h"
 #include "math_const.h"
 
 using namespace LAMMPS_NS;
@@ -41,11 +42,34 @@ using namespace MathConst;
 
 #define MAXLINE 1024
 
+static const char cite_fix_ttm_mod[] =
+  "fix ttm/mod command:\n\n"
+  "@article{Pisarev2014,\n"
+  "author = {Pisarev, V. V. and Starikov, S. V.},\n"
+  "title = {{Atomistic simulation of ion track formation in UO2.}},\n"
+  "journal = {J.~Phys.:~Condens.~Matter},\n"
+  "volume = {26},\n"
+  "number = {47},\n"
+  "pages = {475401},\n"
+  "year = {2014}\n"
+  "}\n\n"
+  "@article{Norman2013,\n"
+  "author = {Norman, G. E. and Starikov, S. V. and Stegailov, V. V. and Saitov, I. M. and Zhilyaev, P. A.},\n"
+  "title = {{Atomistic Modeling of Warm Dense Matter in the Two-Temperature State}},\n"
+  "journal = {Contrib.~Plasm.~Phys.},\n"
+  "number = {2},\n"
+  "volume = {53},\n"
+  "pages = {129--139},\n"
+  "year = {2013}\n"
+  "}\n\n";
+
 /* ---------------------------------------------------------------------- */
 
 FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   Fix(lmp, narg, arg)
 {
+  if (lmp->citeme) lmp->citeme->add(cite_fix_ttm_mod);
+  
   if (narg < 9) error->all(FLERR,"Illegal fix ttm/mod command");
   vector_flag = 1;
   size_vector = 2;
@@ -55,16 +79,16 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   restart_peratom = 1;
   restart_global = 1;
   seed = atoi(arg[3]);
-  nxnodes = atoi(arg[4]);
-  nynodes = atoi(arg[5]);
-  nznodes = atoi(arg[6]);
-  fpr = fopen(arg[7],"r");
+  fpr_2 = fopen(arg[4],"r");
+  nxnodes = atoi(arg[5]);
+  nynodes = atoi(arg[6]);
+  nznodes = atoi(arg[7]);
+  fpr = fopen(arg[8],"r");
   if (fpr == NULL) {
     char str[128];
     sprintf(str,"Cannot open file %s",arg[7]);
     error->one(FLERR,str);
   }
-  fpr_2 = fopen(arg[8],"r");
   if (fpr == NULL) {
     char str[128];
     sprintf(str,"Cannot open file %s",arg[8]);
@@ -233,6 +257,7 @@ FixTTMMod::FixTTMMod(LAMMPS *lmp, int narg, char **arg) :
   memory->create(sum_mass_vsq_all,nxnodes,nynodes,nznodes,
                  "ttm/mod:sum_mass_vsq_all");
   memory->create(T_electron_old,nxnodes,nynodes,nznodes,"ttm/mod:T_electron_old");
+  memory->create(T_electron_first,nxnodes,nynodes,nznodes,"ttm/mod:T_electron_first");
   memory->create(T_electron,nxnodes,nynodes,nznodes,"ttm/mod:T_electron");
   memory->create(net_energy_transfer,nxnodes,nynodes,nznodes,
                  "ttm/mod:net_energy_transfer");
@@ -585,89 +610,113 @@ void FixTTMMod::end_of_step()
   // required this MD step to maintain a stable explicit solve
   int num_inner_timesteps = 1;
   double inner_dt = update->dt;
-  double stability_criterion = 1.0 -
-    2.0*inner_dt/el_specific_heat *
-    (el_thermal_conductivity*(1.0/dx/dx + 1.0/dy/dy + 1.0/dz/dz));
-  if (stability_criterion < 0.0) {
-    inner_dt = 0.25*el_specific_heat /
-      (el_thermal_conductivity*(1.0/dx/dx + 1.0/dy/dy + 1.0/dz/dz));
-  }
-  num_inner_timesteps = static_cast<unsigned int>(update->dt/inner_dt) + 1;
-  inner_dt = update->dt/double(num_inner_timesteps);
-  if (num_inner_timesteps > 1000000)
-    error->warning(FLERR,"Too many inner timesteps in fix ttm",0);
-  for (int ith_inner_timestep = 0; ith_inner_timestep < num_inner_timesteps;
-       ith_inner_timestep++) {
-    for (int ixnode = 0; ixnode < nxnodes; ixnode++)
+  double stability_criterion = 0.0;
+  
+  for (int ixnode = 0; ixnode < nxnodes; ixnode++)
+	for (int iynode = 0; iynode < nynodes; iynode++)
+	  for (int iznode = 0; iznode < nznodes; iznode++)
+		T_electron_first[ixnode][iynode][iznode] =
+		  T_electron[ixnode][iynode][iznode];
+  do {
+	for (int ixnode = 0; ixnode < nxnodes; ixnode++)
       for (int iynode = 0; iynode < nynodes; iynode++)
         for (int iznode = 0; iznode < nznodes; iznode++)
-          T_electron_old[ixnode][iynode][iznode] =
-            T_electron[ixnode][iynode][iznode];
-    // compute new electron T profile
-    duration = duration + inner_dt;
-    for (int ixnode = 0; ixnode < nxnodes; ixnode++)
-      for (int iynode = 0; iynode < nynodes; iynode++)
-        for (int iznode = 0; iznode < nznodes; iznode++) {
-          int right_xnode = ixnode + 1;
-          int right_ynode = iynode + 1;
-          int right_znode = iznode + 1;
-          if (right_xnode == nxnodes) right_xnode = 0;
-          if (right_ynode == nynodes) right_ynode = 0;
-          if (right_znode == nznodes) right_znode = 0;
-          int left_xnode = ixnode - 1;
-          int left_ynode = iynode - 1;
-          int left_znode = iznode - 1;
-          if (left_xnode == -1) left_xnode = nxnodes - 1;
-          if (left_ynode == -1) left_ynode = nynodes - 1;
-          if (left_znode == -1) left_znode = nznodes - 1;
-          double skin_layer_d = double(skin_layer);
-          double ixnode_d = double(ixnode);
-          double surface_d = double(t_surface_l);
-          mult_factor = 0.0;
-          if (duration < width){
-            if (ixnode >= t_surface_l) mult_factor = (intensity/(dx*skin_layer_d))*exp((-1.0)*(ixnode_d - surface_d)/skin_layer_d);
-          }
-          if (ixnode < t_surface_l) net_energy_transfer_all[ixnode][iynode][iznode] = 0.0;
-          double cr_vac = 1;
-          if (T_electron_old[ixnode][iynode][iznode] == 0) cr_vac = 0;
-          double cr_v_l_x = 1;
-          if (T_electron_old[left_xnode][iynode][iznode] == 0) cr_v_l_x = 0;
-          double cr_v_r_x = 1;
-          if (T_electron_old[right_xnode][iynode][iznode] == 0) cr_v_r_x = 0;
-          double cr_v_l_y = 1;
-          if (T_electron_old[ixnode][left_ynode][iznode] == 0) cr_v_l_y = 0;
-          double cr_v_r_y = 1;
-          if (T_electron_old[ixnode][right_ynode][iznode] == 0) cr_v_r_y = 0;
-          double cr_v_l_z = 1;
-          if (T_electron_old[ixnode][iynode][left_znode] == 0) cr_v_l_z = 0;
-          double cr_v_r_z = 1;
-          if (T_electron_old[ixnode][iynode][right_znode] == 0) cr_v_r_z = 0;
-          if (cr_vac != 0) {
-            T_electron[ixnode][iynode][iznode] =
-              T_electron_old[ixnode][iynode][iznode] +
-              inner_dt/el_properties(T_electron_old[ixnode][iynode][iznode]).el_heat_capacity *
-              ((cr_v_r_x*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[right_xnode][iynode][iznode]/2.0).el_thermal_conductivity*
-                (T_electron_old[right_xnode][iynode][iznode]-T_electron_old[ixnode][iynode][iznode])/dx -
-                cr_v_l_x*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[left_xnode][iynode][iznode]/2.0).el_thermal_conductivity*
-                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[left_xnode][iynode][iznode])/dx)/dx +
-               (cr_v_r_y*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][right_ynode][iznode]/2.0).el_thermal_conductivity*
-                (T_electron_old[ixnode][right_ynode][iznode]-T_electron_old[ixnode][iynode][iznode])/dy -
-                cr_v_l_y*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][left_ynode][iznode]/2.0).el_thermal_conductivity*
-                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[ixnode][left_ynode][iznode])/dy)/dy +
-               (cr_v_r_z*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][iynode][right_znode]/2.0).el_thermal_conductivity*
-                (T_electron_old[ixnode][iynode][right_znode]-T_electron_old[ixnode][iynode][iznode])/dz -
-                cr_v_l_z*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][iynode][left_znode]/2.0).el_thermal_conductivity*
-                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[ixnode][iynode][left_znode])/dz)/dz);
-            T_electron[ixnode][iynode][iznode]+=inner_dt/el_properties(T_electron[ixnode][iynode][iznode]).el_heat_capacity*
-              (mult_factor -
-               net_energy_transfer_all[ixnode][iynode][iznode]/del_vol);
-          }
-          else T_electron[ixnode][iynode][iznode] =
-                 T_electron_old[ixnode][iynode][iznode];
-          if ((T_electron[ixnode][iynode][iznode] > 0.0) && (T_electron[ixnode][iynode][iznode] < electron_temperature_min))
-            T_electron[ixnode][iynode][iznode] = T_electron[ixnode][iynode][iznode] + 0.5*(electron_temperature_min - T_electron[ixnode][iynode][iznode]);
-        }
-  }
+          T_electron[ixnode][iynode][iznode] =
+            T_electron_first[ixnode][iynode][iznode];
+	  
+	  stability_criterion = 1.0 -
+	    2.0*inner_dt/el_specific_heat *
+	    (el_thermal_conductivity*(1.0/dx/dx + 1.0/dy/dy + 1.0/dz/dz));
+	  if (stability_criterion < 0.0) {
+	    inner_dt = 0.25*el_specific_heat /
+	      (el_thermal_conductivity*(1.0/dx/dx + 1.0/dy/dy + 1.0/dz/dz));
+	  }
+	  num_inner_timesteps = static_cast<unsigned int>(update->dt/inner_dt) + 1;
+	  inner_dt = update->dt/double(num_inner_timesteps);
+	  if (num_inner_timesteps > 1000000)
+	    error->warning(FLERR,"Too many inner timesteps in fix ttm",0);
+	  for (int ith_inner_timestep = 0; ith_inner_timestep < num_inner_timesteps;
+	       ith_inner_timestep++) {
+	    for (int ixnode = 0; ixnode < nxnodes; ixnode++)
+	      for (int iynode = 0; iynode < nynodes; iynode++)
+	        for (int iznode = 0; iznode < nznodes; iznode++)
+	          T_electron_old[ixnode][iynode][iznode] =
+	            T_electron[ixnode][iynode][iznode];
+	    // compute new electron T profile
+	    duration = duration + inner_dt;
+	    for (int ixnode = 0; ixnode < nxnodes; ixnode++)
+	      for (int iynode = 0; iynode < nynodes; iynode++)
+	        for (int iznode = 0; iznode < nznodes; iznode++) {
+	          int right_xnode = ixnode + 1;
+	          int right_ynode = iynode + 1;
+	          int right_znode = iznode + 1;
+	          if (right_xnode == nxnodes) right_xnode = 0;
+	          if (right_ynode == nynodes) right_ynode = 0;
+	          if (right_znode == nznodes) right_znode = 0;
+	          int left_xnode = ixnode - 1;
+	          int left_ynode = iynode - 1;
+	          int left_znode = iznode - 1;
+	          if (left_xnode == -1) left_xnode = nxnodes - 1;
+	          if (left_ynode == -1) left_ynode = nynodes - 1;
+	          if (left_znode == -1) left_znode = nznodes - 1;
+	          double skin_layer_d = double(skin_layer);
+	          double ixnode_d = double(ixnode);
+	          double surface_d = double(t_surface_l);
+	          mult_factor = 0.0;
+	          if (duration < width){
+	            if (ixnode >= t_surface_l) mult_factor = (intensity/(dx*skin_layer_d))*exp((-1.0)*(ixnode_d - surface_d)/skin_layer_d);
+	          }
+	          if (ixnode < t_surface_l) net_energy_transfer_all[ixnode][iynode][iznode] = 0.0;
+	          double cr_vac = 1;
+	          if (T_electron_old[ixnode][iynode][iznode] == 0) cr_vac = 0;
+	          double cr_v_l_x = 1;
+	          if (T_electron_old[left_xnode][iynode][iznode] == 0) cr_v_l_x = 0;
+	          double cr_v_r_x = 1;
+	          if (T_electron_old[right_xnode][iynode][iznode] == 0) cr_v_r_x = 0;
+	          double cr_v_l_y = 1;
+	          if (T_electron_old[ixnode][left_ynode][iznode] == 0) cr_v_l_y = 0;
+	          double cr_v_r_y = 1;
+	          if (T_electron_old[ixnode][right_ynode][iznode] == 0) cr_v_r_y = 0;
+	          double cr_v_l_z = 1;
+	          if (T_electron_old[ixnode][iynode][left_znode] == 0) cr_v_l_z = 0;
+	          double cr_v_r_z = 1;
+	          if (T_electron_old[ixnode][iynode][right_znode] == 0) cr_v_r_z = 0;
+	          if (cr_vac != 0) {
+	            T_electron[ixnode][iynode][iznode] =
+	              T_electron_old[ixnode][iynode][iznode] +
+	              inner_dt/el_properties(T_electron_old[ixnode][iynode][iznode]).el_heat_capacity *
+	              ((cr_v_r_x*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[right_xnode][iynode][iznode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[right_xnode][iynode][iznode]-T_electron_old[ixnode][iynode][iznode])/dx -
+	                cr_v_l_x*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[left_xnode][iynode][iznode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[left_xnode][iynode][iznode])/dx)/dx +
+	               (cr_v_r_y*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][right_ynode][iznode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[ixnode][right_ynode][iznode]-T_electron_old[ixnode][iynode][iznode])/dy -
+	                cr_v_l_y*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][left_ynode][iznode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[ixnode][left_ynode][iznode])/dy)/dy +
+	               (cr_v_r_z*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][iynode][right_znode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[ixnode][iynode][right_znode]-T_electron_old[ixnode][iynode][iznode])/dz -
+	                cr_v_l_z*el_properties(T_electron_old[ixnode][iynode][iznode]/2.0+T_electron_old[ixnode][iynode][left_znode]/2.0).el_thermal_conductivity*
+	                (T_electron_old[ixnode][iynode][iznode]-T_electron_old[ixnode][iynode][left_znode])/dz)/dz);
+	            T_electron[ixnode][iynode][iznode]+=inner_dt/el_properties(T_electron[ixnode][iynode][iznode]).el_heat_capacity*
+	              (mult_factor -
+	               net_energy_transfer_all[ixnode][iynode][iznode]/del_vol);
+	          }
+	          else T_electron[ixnode][iynode][iznode] =
+	                 T_electron_old[ixnode][iynode][iznode];
+	          if ((T_electron[ixnode][iynode][iznode] > 0.0) && (T_electron[ixnode][iynode][iznode] < electron_temperature_min))
+	            T_electron[ixnode][iynode][iznode] = T_electron[ixnode][iynode][iznode] + 0.5*(electron_temperature_min - T_electron[ixnode][iynode][iznode]);
+	          
+	          if (el_properties(T_electron[ixnode][iynode][iznode]).el_thermal_conductivity > el_thermal_conductivity)
+	            el_thermal_conductivity = el_properties(T_electron[ixnode][iynode][iznode]).el_thermal_conductivity;
+	          if ((T_electron[ixnode][iynode][iznode] > 0.0) && (el_properties(T_electron[ixnode][iynode][iznode]).el_heat_capacity < el_specific_heat))
+                el_specific_heat = el_properties(T_electron[ixnode][iynode][iznode]).el_heat_capacity;
+	        }
+	  }
+	  stability_criterion = 1.0 -
+	    2.0*inner_dt/el_specific_heat *
+	    (el_thermal_conductivity*(1.0/dx/dx + 1.0/dy/dy + 1.0/dz/dz));
+	    	  
+	} while (stability_criterion < 0.0);
   // output nodal temperatures for current timestep
   if ((nfileevery) && !(update->ntimestep % nfileevery)) {
     // compute atomic Ta for each grid point
diff --git a/src/accelerator_intel.h b/src/accelerator_intel.h
index 2f0179eb4b..ad856e41e3 100644
--- a/src/accelerator_intel.h
+++ b/src/accelerator_intel.h
@@ -34,7 +34,7 @@
   template <class flt_t, class acc_t>
   void bin_atoms(void *, int *);
 
-template <class flt_t, class acc_t, int>
+  template <class flt_t, class acc_t, int>
   void hbni(const int, NeighList *, void *, const int, const int, void *,
 	    const int offload_end = 0);
   template <class flt_t, class acc_t>
@@ -42,10 +42,14 @@ template <class flt_t, class acc_t, int>
   template <class flt_t, class acc_t, int>
   void hbnti(const int, NeighList *, void *, const int, const int, void *,
 	     const int offload_end = 0);
+  template <class flt_t, class acc_t, int>
+  void fbi(const int, NeighList *, void *, const int, const int, void *,
+	   const int offload_end = 0);
 
   void half_bin_no_newton_intel(class NeighList *);
   void half_bin_newton_intel(class NeighList *);
   void half_bin_newton_tri_intel(class NeighList *);
+  void full_bin_intel(class NeighList *);
 
 #endif /* !LMP_INSIDE_NEIGHBOR_H */
 
@@ -58,6 +62,7 @@ template <class flt_t, class acc_t, int>
   void half_bin_no_newton_intel(class NeighList *) {}
   void half_bin_newton_intel(class NeighList *) {}
   void half_bin_newton_tri_intel(class NeighList *) {}
+  void full_bin_intel(class NeighList *) {}
 
 #endif
 
diff --git a/src/fix_respa.h b/src/fix_respa.h
index d65587cdc9..7c7bd53a61 100644
--- a/src/fix_respa.h
+++ b/src/fix_respa.h
@@ -50,7 +50,6 @@ class FixRespa : public Fix {
 
 #endif
 #endif
-
 /* ERROR/WARNING messages:
 
 */
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index db4cbad693..8567d74297 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -1187,8 +1187,10 @@ void Neighbor::choose_build(int index, NeighRequest *rq)
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_nsq_ghost_omp;
       } else if (style == BIN) {
-        if (rq->ghost == 0) pb = &Neighbor::full_bin_omp;
-        else if (includegroup)
+        if (rq->ghost == 0) {
+	  if (rq->intel) pb = &Neighbor::full_bin_intel;
+	  else pb = &Neighbor::full_bin_omp;
+        } else if (includegroup)
           error->all(FLERR,
                      "Neighbor include group not allowed with ghost neighbors");
         else pb = &Neighbor::full_bin_ghost_omp;
diff --git a/src/velocity.cpp b/src/velocity.cpp
index 0ea6b01673..ba1bc06bd0 100644
--- a/src/velocity.cpp
+++ b/src/velocity.cpp
@@ -98,6 +98,28 @@ void Velocity::command(int narg, char **arg)
   else if (style == RAMP) options(narg-8,&arg[8]);
   else if (style == ZERO) options(narg-3,&arg[3]);
 
+  // special cases where full init and border communication must be done first
+  // for CREATE/SET if compute temp/cs is used
+  // for ZERO if fix rigid/small is used
+  // b/c methods invoked in the compute/fix perform forward/reverse comm
+
+  int initcomm = 0;
+  if (style == ZERO && rfix >= 0 && 
+      strcmp(modify->fix[rfix]->style,"rigid/small") == 0) initcomm = 1;
+  if ((style == CREATE || style == SET) && temperature &&
+      strcmp(temperature->style,"temp/cs2") == 0) initcomm = 1;
+
+  if (initcomm) {
+    lmp->init();
+    if (domain->triclinic) domain->x2lamda(atom->nlocal);
+    domain->pbc();
+    domain->reset_box();
+    comm->setup();
+    comm->exchange();
+    comm->borders();
+    if (domain->triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+  }
+
   // initialize velocities based on style
   // create() invoked differently, so can be called externally
 
@@ -670,37 +692,27 @@ void Velocity::ramp(int narg, char **arg)
 
 /* ----------------------------------------------------------------------
    zero linear or angular momentum of a group
-   if using rigid/small requires init of entire system since
-      its methods perform forward/reverse comm,
-      comm::init needs neighbor::init needs pair::init needs kspace::init, etc
-      also requires setup_pre_neighbor call to setup bodies
 ------------------------------------------------------------------------- */
 
 void Velocity::zero(int narg, char **arg)
 {
   if (strcmp(arg[0],"linear") == 0) {
     if (rfix < 0) zero_momentum();
-    else {
-      if (strcmp(modify->fix[rfix]->style,"rigid/small") == 0) {
-        lmp->init();
-        modify->fix[rfix]->setup_pre_neighbor();
-        modify->fix[rfix]->zero_momentum();
-      } else if (strstr(modify->fix[rfix]->style,"rigid")) {
-        modify->fix[rfix]->zero_momentum();
-      } else error->all(FLERR,"Velocity rigid used with non-rigid fix-ID");
-    }
+    else if (strcmp(modify->fix[rfix]->style,"rigid/small") == 0) {
+      modify->fix[rfix]->setup_pre_neighbor();
+      modify->fix[rfix]->zero_momentum();
+    } else if (strstr(modify->fix[rfix]->style,"rigid")) {
+      modify->fix[rfix]->zero_momentum();
+    } else error->all(FLERR,"Velocity rigid used with non-rigid fix-ID");
 
   } else if (strcmp(arg[0],"angular") == 0) {
     if (rfix < 0) zero_rotation();
-    else {
-      if (strcmp(modify->fix[rfix]->style,"rigid/small") == 0) {
-        lmp->init();
-        modify->fix[rfix]->setup_pre_neighbor();
-        modify->fix[rfix]->zero_rotation();
-      } else if (strstr(modify->fix[rfix]->style,"rigid")) {
-        modify->fix[rfix]->zero_rotation();
-      } else error->all(FLERR,"Velocity rigid used with non-rigid fix-ID");
-    }
+    else if (strcmp(modify->fix[rfix]->style,"rigid/small") == 0) {
+      modify->fix[rfix]->setup_pre_neighbor();
+      modify->fix[rfix]->zero_rotation();
+    } else if (strstr(modify->fix[rfix]->style,"rigid")) {
+      modify->fix[rfix]->zero_rotation();
+    } else error->all(FLERR,"Velocity rigid used with non-rigid fix-ID");
 
   } else error->all(FLERR,"Illegal velocity command");
 }
diff --git a/src/version.h b/src/version.h
index b648b07436..33e0a6a6d2 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "24 Feb 2015"
+#define LAMMPS_VERSION "5 Mar 2015"