diff --git a/doc/Eqs/box_inverse.jpg b/doc/Eqs/box_inverse.jpg
new file mode 100644
index 0000000000..dabcfb7b5d
Binary files /dev/null and b/doc/Eqs/box_inverse.jpg differ
diff --git a/doc/Eqs/box_inverse.tex b/doc/Eqs/box_inverse.tex
new file mode 100644
index 0000000000..68994f2868
--- /dev/null
+++ b/doc/Eqs/box_inverse.tex
@@ -0,0 +1,14 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+\begin{eqnarray*}
+{\rm lx} &=& a \\
+{\rm xy} &=& b \cos{\gamma}  \\
+{\rm xz} &=& c \cos{\beta}\\
+{\rm ly}^2 &=&   b^2 - {\rm xy}^2 \\
+{\rm yz} &=& \frac{b c \cos{\alpha} - {\rm xy}\,{\rm xz}}{\rm ly} \\
+{\rm lz}^2 &=&  c^2 - {\rm xz}^2 - {\rm yz}^2 \\
+\end{eqnarray*}
+
+\end{document}
diff --git a/doc/Section_commands.html b/doc/Section_commands.html
index 5f996268de..039367f5ce 100644
--- a/doc/Section_commands.html
+++ b/doc/Section_commands.html
@@ -311,19 +311,20 @@ default LAMMPS build.  These dependencies are listed as Restrictions
 in the command's documentation.
 </P>
 <DIV ALIGN=center><TABLE  BORDER=1 >
-<TR ALIGN="center"><TD ><A HREF = "angle_coeff.html">angle_coeff</A></TD><TD ><A HREF = "angle_style.html">angle_style</A></TD><TD ><A HREF = "atom_modify.html">atom_modify</A></TD><TD ><A HREF = "atom_style.html">atom_style</A></TD><TD ><A HREF = "bond_coeff.html">bond_coeff</A></TD><TD ><A HREF = "bond_style.html">bond_style</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "boundary.html">boundary</A></TD><TD ><A HREF = "change_box.html">change_box</A></TD><TD ><A HREF = "clear.html">clear</A></TD><TD ><A HREF = "communicate.html">communicate</A></TD><TD ><A HREF = "compute.html">compute</A></TD><TD ><A HREF = "compute_modify.html">compute_modify</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "create_atoms.html">create_atoms</A></TD><TD ><A HREF = "create_box.html">create_box</A></TD><TD ><A HREF = "delete_atoms.html">delete_atoms</A></TD><TD ><A HREF = "delete_bonds.html">delete_bonds</A></TD><TD ><A HREF = "dielectric.html">dielectric</A></TD><TD ><A HREF = "dihedral_coeff.html">dihedral_coeff</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "dihedral_style.html">dihedral_style</A></TD><TD ><A HREF = "dimension.html">dimension</A></TD><TD ><A HREF = "displace_atoms.html">displace_atoms</A></TD><TD ><A HREF = "displace_box.html">displace_box</A></TD><TD ><A HREF = "dump.html">dump</A></TD><TD ><A HREF = "dump_modify.html">dump_modify</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "echo.html">echo</A></TD><TD ><A HREF = "fix.html">fix</A></TD><TD ><A HREF = "fix_modify.html">fix_modify</A></TD><TD ><A HREF = "group.html">group</A></TD><TD ><A HREF = "if.html">if</A></TD><TD ><A HREF = "improper_coeff.html">improper_coeff</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "improper_style.html">improper_style</A></TD><TD ><A HREF = "include.html">include</A></TD><TD ><A HREF = "jump.html">jump</A></TD><TD ><A HREF = "kspace_modify.html">kspace_modify</A></TD><TD ><A HREF = "kspace_style.html">kspace_style</A></TD><TD ><A HREF = "label.html">label</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "lattice.html">lattice</A></TD><TD ><A HREF = "log.html">log</A></TD><TD ><A HREF = "mass.html">mass</A></TD><TD ><A HREF = "minimize.html">minimize</A></TD><TD ><A HREF = "min_modify.html">min_modify</A></TD><TD ><A HREF = "min_style.html">min_style</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "neb.html">neb</A></TD><TD ><A HREF = "neigh_modify.html">neigh_modify</A></TD><TD ><A HREF = "neighbor.html">neighbor</A></TD><TD ><A HREF = "newton.html">newton</A></TD><TD ><A HREF = "next.html">next</A></TD><TD ><A HREF = "pair_coeff.html">pair_coeff</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "pair_modify.html">pair_modify</A></TD><TD ><A HREF = "pair_style.html">pair_style</A></TD><TD ><A HREF = "pair_write.html">pair_write</A></TD><TD ><A HREF = "prd.html">prd</A></TD><TD ><A HREF = "print.html">print</A></TD><TD ><A HREF = "processors.html">processors</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "read_data.html">read_data</A></TD><TD ><A HREF = "read_restart.html">read_restart</A></TD><TD ><A HREF = "region.html">region</A></TD><TD ><A HREF = "replicate.html">replicate</A></TD><TD ><A HREF = "reset_timestep.html">reset_timestep</A></TD><TD ><A HREF = "restart.html">restart</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "run.html">run</A></TD><TD ><A HREF = "run_style.html">run_style</A></TD><TD ><A HREF = "set.html">set</A></TD><TD ><A HREF = "shell.html">shell</A></TD><TD ><A HREF = "special_bonds.html">special_bonds</A></TD><TD ><A HREF = "tad.html">tad</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "temper.html">temper</A></TD><TD ><A HREF = "thermo.html">thermo</A></TD><TD ><A HREF = "thermo_modify.html">thermo_modify</A></TD><TD ><A HREF = "thermo_style.html">thermo_style</A></TD><TD ><A HREF = "timestep.html">timestep</A></TD><TD ><A HREF = "uncompute.html">uncompute</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "undump.html">undump</A></TD><TD ><A HREF = "unfix.html">unfix</A></TD><TD ><A HREF = "units.html">units</A></TD><TD ><A HREF = "variable.html">variable</A></TD><TD ><A HREF = "velocity.html">velocity</A></TD><TD ><A HREF = "write_restart.html">write_restart</A> 
+<TR ALIGN="center"><TD ><A HREF = "accelerator.html">accelerator</A></TD><TD ><A HREF = "angle_coeff.html">angle_coeff</A></TD><TD ><A HREF = "angle_style.html">angle_style</A></TD><TD ><A HREF = "atom_modify.html">atom_modify</A></TD><TD ><A HREF = "atom_style.html">atom_style</A></TD><TD ><A HREF = "bond_coeff.html">bond_coeff</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "bond_style.html">bond_style</A></TD><TD ><A HREF = "boundary.html">boundary</A></TD><TD ><A HREF = "change_box.html">change_box</A></TD><TD ><A HREF = "clear.html">clear</A></TD><TD ><A HREF = "communicate.html">communicate</A></TD><TD ><A HREF = "compute.html">compute</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "compute_modify.html">compute_modify</A></TD><TD ><A HREF = "create_atoms.html">create_atoms</A></TD><TD ><A HREF = "create_box.html">create_box</A></TD><TD ><A HREF = "delete_atoms.html">delete_atoms</A></TD><TD ><A HREF = "delete_bonds.html">delete_bonds</A></TD><TD ><A HREF = "dielectric.html">dielectric</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "dihedral_coeff.html">dihedral_coeff</A></TD><TD ><A HREF = "dihedral_style.html">dihedral_style</A></TD><TD ><A HREF = "dimension.html">dimension</A></TD><TD ><A HREF = "displace_atoms.html">displace_atoms</A></TD><TD ><A HREF = "displace_box.html">displace_box</A></TD><TD ><A HREF = "dump.html">dump</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "dump_modify.html">dump_modify</A></TD><TD ><A HREF = "echo.html">echo</A></TD><TD ><A HREF = "fix.html">fix</A></TD><TD ><A HREF = "fix_modify.html">fix_modify</A></TD><TD ><A HREF = "group.html">group</A></TD><TD ><A HREF = "if.html">if</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "improper_coeff.html">improper_coeff</A></TD><TD ><A HREF = "improper_style.html">improper_style</A></TD><TD ><A HREF = "include.html">include</A></TD><TD ><A HREF = "jump.html">jump</A></TD><TD ><A HREF = "kspace_modify.html">kspace_modify</A></TD><TD ><A HREF = "kspace_style.html">kspace_style</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "label.html">label</A></TD><TD ><A HREF = "lattice.html">lattice</A></TD><TD ><A HREF = "log.html">log</A></TD><TD ><A HREF = "mass.html">mass</A></TD><TD ><A HREF = "minimize.html">minimize</A></TD><TD ><A HREF = "min_modify.html">min_modify</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "min_style.html">min_style</A></TD><TD ><A HREF = "neb.html">neb</A></TD><TD ><A HREF = "neigh_modify.html">neigh_modify</A></TD><TD ><A HREF = "neighbor.html">neighbor</A></TD><TD ><A HREF = "newton.html">newton</A></TD><TD ><A HREF = "next.html">next</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "pair_coeff.html">pair_coeff</A></TD><TD ><A HREF = "pair_modify.html">pair_modify</A></TD><TD ><A HREF = "pair_style.html">pair_style</A></TD><TD ><A HREF = "pair_write.html">pair_write</A></TD><TD ><A HREF = "prd.html">prd</A></TD><TD ><A HREF = "print.html">print</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "processors.html">processors</A></TD><TD ><A HREF = "read_data.html">read_data</A></TD><TD ><A HREF = "read_restart.html">read_restart</A></TD><TD ><A HREF = "region.html">region</A></TD><TD ><A HREF = "replicate.html">replicate</A></TD><TD ><A HREF = "reset_timestep.html">reset_timestep</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "restart.html">restart</A></TD><TD ><A HREF = "run.html">run</A></TD><TD ><A HREF = "run_style.html">run_style</A></TD><TD ><A HREF = "set.html">set</A></TD><TD ><A HREF = "shell.html">shell</A></TD><TD ><A HREF = "special_bonds.html">special_bonds</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "tad.html">tad</A></TD><TD ><A HREF = "temper.html">temper</A></TD><TD ><A HREF = "thermo.html">thermo</A></TD><TD ><A HREF = "thermo_modify.html">thermo_modify</A></TD><TD ><A HREF = "thermo_style.html">thermo_style</A></TD><TD ><A HREF = "timestep.html">timestep</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "uncompute.html">uncompute</A></TD><TD ><A HREF = "undump.html">undump</A></TD><TD ><A HREF = "unfix.html">unfix</A></TD><TD ><A HREF = "units.html">units</A></TD><TD ><A HREF = "variable.html">variable</A></TD><TD ><A HREF = "velocity.html">velocity</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "write_restart.html">write_restart</A> 
 </TD></TR></TABLE></DIV>
 
 <HR>
@@ -336,14 +337,15 @@ of each style or click on the style itself for a full description:
 <DIV ALIGN=center><TABLE  BORDER=1 >
 <TR ALIGN="center"><TD ><A HREF = "fix_adapt.html">adapt</A></TD><TD ><A HREF = "fix_addforce.html">addforce</A></TD><TD ><A HREF = "fix_aveforce.html">aveforce</A></TD><TD ><A HREF = "fix_ave_atom.html">ave/atom</A></TD><TD ><A HREF = "fix_ave_correlate.html">ave/correlate</A></TD><TD ><A HREF = "fix_ave_histo.html">ave/histo</A></TD><TD ><A HREF = "fix_ave_spatial.html">ave/spatial</A></TD><TD ><A HREF = "fix_ave_time.html">ave/time</A></TD></TR>
 <TR ALIGN="center"><TD ><A HREF = "fix_bond_break.html">bond/break</A></TD><TD ><A HREF = "fix_bond_create.html">bond/create</A></TD><TD ><A HREF = "fix_bond_swap.html">bond/swap</A></TD><TD ><A HREF = "fix_box_relax.html">box/relax</A></TD><TD ><A HREF = "fix_deform.html">deform</A></TD><TD ><A HREF = "fix_deposit.html">deposit</A></TD><TD ><A HREF = "fix_drag.html">drag</A></TD><TD ><A HREF = "fix_dt_reset.html">dt/reset</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_efield.html">efield</A></TD><TD ><A HREF = "fix_enforce2d.html">enforce2d</A></TD><TD ><A HREF = "fix_evaporate.html">evaporate</A></TD><TD ><A HREF = "fix_external.html">external</A></TD><TD ><A HREF = "fix_freeze.html">freeze</A></TD><TD ><A HREF = "fix_gravity.html">gravity</A></TD><TD ><A HREF = "fix_heat.html">heat</A></TD><TD ><A HREF = "fix_indent.html">indent</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_langevin.html">langevin</A></TD><TD ><A HREF = "fix_lineforce.html">lineforce</A></TD><TD ><A HREF = "fix_momentum.html">momentum</A></TD><TD ><A HREF = "fix_move.html">move</A></TD><TD ><A HREF = "fix_msst.html">msst</A></TD><TD ><A HREF = "fix_neb.html">neb</A></TD><TD ><A HREF = "fix_nh.html">nph</A></TD><TD ><A HREF = "fix_nph_asphere.html">nph/asphere</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_nph_sphere.html">nph/sphere</A></TD><TD ><A HREF = "fix_nh.html">npt</A></TD><TD ><A HREF = "fix_npt_asphere.html">npt/asphere</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere</A></TD><TD ><A HREF = "fix_nve.html">nve</A></TD><TD ><A HREF = "fix_nve_asphere.html">nve/asphere</A></TD><TD ><A HREF = "fix_nve_limit.html">nve/limit</A></TD><TD ><A HREF = "fix_nve_noforce.html">nve/noforce</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_nve_sphere.html">nve/sphere</A></TD><TD ><A HREF = "fix_nh.html">nvt</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere</A></TD><TD ><A HREF = "fix_orient_fcc.html">orient/fcc</A></TD><TD ><A HREF = "fix_planeforce.html">planeforce</A></TD><TD ><A HREF = "fix_poems.html">poems</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_pour.html">pour</A></TD><TD ><A HREF = "fix_press_berendsen.html">press/berendsen</A></TD><TD ><A HREF = "fix_print.html">print</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/bonds</A></TD><TD ><A HREF = "fix_recenter.html">recenter</A></TD><TD ><A HREF = "fix_rigid.html">rigid</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nve</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/nvt</A></TD><TD ><A HREF = "fix_setforce.html">setforce</A></TD><TD ><A HREF = "fix_shake.html">shake</A></TD><TD ><A HREF = "fix_spring.html">spring</A></TD><TD ><A HREF = "fix_spring_rg.html">spring/rg</A></TD><TD ><A HREF = "fix_spring_self.html">spring/self</A></TD><TD ><A HREF = "fix_srd.html">srd</A></TD><TD ><A HREF = "fix_store_force.html">store/force</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_store_state.html">store/state</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale</A></TD><TD ><A HREF = "fix_thermal_conductivity.html">thermal/conductivity</A></TD><TD ><A HREF = "fix_tmd.html">tmd</A></TD><TD ><A HREF = "fix_ttm.html">ttm</A></TD><TD ><A HREF = "fix_viscosity.html">viscosity</A></TD><TD ><A HREF = "fix_viscous.html">viscous</A></TD></TR>
-<TR ALIGN="center"><TD ><A HREF = "fix_wall.html">wall/colloid</A></TD><TD ><A HREF = "fix_wall_gran.html">wall/gran</A></TD><TD ><A HREF = "fix_wall.html">wall/harmonic</A></TD><TD ><A HREF = "fix_wall.html">wall/lj126</A></TD><TD ><A HREF = "fix_wall.html">wall/lj93</A></TD><TD ><A HREF = "fix_wall_reflect.html">wall/reflect</A></TD><TD ><A HREF = "fix_wall_region.html">wall/region</A></TD><TD ><A HREF = "fix_wall_srd.html">wall/srd</A> 
+<TR ALIGN="center"><TD ><A HREF = "fix_efield.html">efield</A></TD><TD ><A HREF = "fix_enforce2d.html">enforce2d</A></TD><TD ><A HREF = "fix_evaporate.html">evaporate</A></TD><TD ><A HREF = "fix_external.html">external</A></TD><TD ><A HREF = "fix_freeze.html">freeze</A></TD><TD ><A HREF = "fix_gpu.html">gpu</A></TD><TD ><A HREF = "fix_gravity.html">gravity</A></TD><TD ><A HREF = "fix_heat.html">heat</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_indent.html">indent</A></TD><TD ><A HREF = "fix_langevin.html">langevin</A></TD><TD ><A HREF = "fix_lineforce.html">lineforce</A></TD><TD ><A HREF = "fix_momentum.html">momentum</A></TD><TD ><A HREF = "fix_move.html">move</A></TD><TD ><A HREF = "fix_msst.html">msst</A></TD><TD ><A HREF = "fix_neb.html">neb</A></TD><TD ><A HREF = "fix_nh.html">nph</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_nph_asphere.html">nph/asphere</A></TD><TD ><A HREF = "fix_nph_sphere.html">nph/sphere</A></TD><TD ><A HREF = "fix_nh.html">npt</A></TD><TD ><A HREF = "fix_npt_asphere.html">npt/asphere</A></TD><TD ><A HREF = "fix_npt_sphere.html">npt/sphere</A></TD><TD ><A HREF = "fix_nve.html">nve</A></TD><TD ><A HREF = "fix_nve_asphere.html">nve/asphere</A></TD><TD ><A HREF = "fix_nve_limit.html">nve/limit</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_nve_noforce.html">nve/noforce</A></TD><TD ><A HREF = "fix_nve_sphere.html">nve/sphere</A></TD><TD ><A HREF = "fix_nh.html">nvt</A></TD><TD ><A HREF = "fix_nvt_asphere.html">nvt/asphere</A></TD><TD ><A HREF = "fix_nvt_sllod.html">nvt/sllod</A></TD><TD ><A HREF = "fix_nvt_sphere.html">nvt/sphere</A></TD><TD ><A HREF = "fix_orient_fcc.html">orient/fcc</A></TD><TD ><A HREF = "fix_planeforce.html">planeforce</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_poems.html">poems</A></TD><TD ><A HREF = "fix_pour.html">pour</A></TD><TD ><A HREF = "fix_press_berendsen.html">press/berendsen</A></TD><TD ><A HREF = "fix_print.html">print</A></TD><TD ><A HREF = "fix_qeq_comb.html">qeq/comb</A></TD><TD ><A HREF = "fix_reax_bonds.html">reax/bonds</A></TD><TD ><A HREF = "fix_recenter.html">recenter</A></TD><TD ><A HREF = "fix_rigid.html">rigid</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_rigid.html">rigid/nve</A></TD><TD ><A HREF = "fix_rigid.html">rigid/nvt</A></TD><TD ><A HREF = "fix_setforce.html">setforce</A></TD><TD ><A HREF = "fix_shake.html">shake</A></TD><TD ><A HREF = "fix_spring.html">spring</A></TD><TD ><A HREF = "fix_spring_rg.html">spring/rg</A></TD><TD ><A HREF = "fix_spring_self.html">spring/self</A></TD><TD ><A HREF = "fix_srd.html">srd</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_store_force.html">store/force</A></TD><TD ><A HREF = "fix_store_state.html">store/state</A></TD><TD ><A HREF = "fix_temp_berendsen.html">temp/berendsen</A></TD><TD ><A HREF = "fix_temp_rescale.html">temp/rescale</A></TD><TD ><A HREF = "fix_thermal_conductivity.html">thermal/conductivity</A></TD><TD ><A HREF = "fix_tmd.html">tmd</A></TD><TD ><A HREF = "fix_ttm.html">ttm</A></TD><TD ><A HREF = "fix_viscosity.html">viscosity</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_viscous.html">viscous</A></TD><TD ><A HREF = "fix_wall.html">wall/colloid</A></TD><TD ><A HREF = "fix_wall_gran.html">wall/gran</A></TD><TD ><A HREF = "fix_wall.html">wall/harmonic</A></TD><TD ><A HREF = "fix_wall.html">wall/lj126</A></TD><TD ><A HREF = "fix_wall.html">wall/lj93</A></TD><TD ><A HREF = "fix_wall_reflect.html">wall/reflect</A></TD><TD ><A HREF = "fix_wall_region.html">wall/region</A></TD></TR>
+<TR ALIGN="center"><TD ><A HREF = "fix_wall_srd.html">wall/srd</A> 
 </TD></TR></TABLE></DIV>
 
 <P>These are fix styles contributed by users, which can be used if
diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt
index 1c58401303..a8bcd7a79b 100644
--- a/doc/Section_commands.txt
+++ b/doc/Section_commands.txt
@@ -307,6 +307,7 @@ included when LAMMPS was built.  Not all packages are included in a
 default LAMMPS build.  These dependencies are listed as Restrictions
 in the command's documentation.
 
+"accelerator"_accelerator.html,
 "angle_coeff"_angle_coeff.html,
 "angle_style"_angle_style.html,
 "atom_modify"_atom_modify.html,
@@ -414,6 +415,7 @@ of each style or click on the style itself for a full description:
 "evaporate"_fix_evaporate.html,
 "external"_fix_external.html,
 "freeze"_fix_freeze.html,
+"gpu"_fix_gpu.html,
 "gravity"_fix_gravity.html,
 "heat"_fix_heat.html,
 "indent"_fix_indent.html,
diff --git a/doc/Section_howto.html b/doc/Section_howto.html
index adab09b06b..655adf939a 100644
--- a/doc/Section_howto.html
+++ b/doc/Section_howto.html
@@ -825,6 +825,10 @@ factors (xy,xz,yz) is as follows:
 </P>
 <CENTER><IMG SRC = "Eqs/box.jpg">
 </CENTER>
+<P>The inverse relationship can be written as follows:
+</P>
+<CENTER><IMG SRC = "Eqs/box_inverse.jpg">
+</CENTER>
 <P>As discussed on the <A HREF = "dump.html">dump</A> command doc page, when the BOX
 BOUNDS for a snapshot is written to a dump file for a triclinic box,
 an orthogonal bounding box which encloses the triclinic simulation box
@@ -1158,7 +1162,7 @@ discussed below, it can be referenced via the following bracket
 notation, where ID in this case is the ID of a compute.  The leading
 "c_" would be replaced by "f_" for a fix, or "v_" for a variable:
 </P>
-<DIV ALIGN=center><TABLE  BORDER=1 >
+<DIV ALIGN=center><TABLE  WIDTH="0%"  BORDER=1 >
 <TR><TD >c_ID </TD><TD > entire scalar, vector, or array</TD></TR>
 <TR><TD >c_ID[I] </TD><TD > one element of vector, one column of array</TD></TR>
 <TR><TD >c_ID[I][J] </TD><TD > one element of array 
@@ -1352,7 +1356,7 @@ data and scalar/vector/array data.
 input, that could be an element of a vector or array.  Likewise a
 vector input could be a column of an array.
 </P>
-<DIV ALIGN=center><TABLE  BORDER=1 >
+<DIV ALIGN=center><TABLE  WIDTH="0%"  BORDER=1 >
 <TR><TD >Command</TD><TD > Input</TD><TD > Output</TD><TD ></TD></TR>
 <TR><TD ><A HREF = "thermo_style.html">thermo_style custom</A></TD><TD > global scalars</TD><TD > screen, log file</TD><TD ></TD></TR>
 <TR><TD ><A HREF = "dump.html">dump custom</A></TD><TD > per-atom vectors</TD><TD > dump file</TD><TD ></TD></TR>
diff --git a/doc/Section_start.html b/doc/Section_start.html
index 08287e3377..55e6bb352d 100644
--- a/doc/Section_start.html
+++ b/doc/Section_start.html
@@ -787,7 +787,8 @@ more processors or setup a smaller problem.
 which may be used in any order.  Either the full word or the
 one-letter abbreviation can be used:
 </P>
-<UL><LI>-echo or -e
+<UL><LI>-accelerator or -a
+<LI>-echo or -e
 <LI>-partition or -p
 <LI>-in or -i
 <LI>-log or -l
@@ -800,6 +801,27 @@ one-letter abbreviation can be used:
 </PRE>
 <P>Here are the details on the options:
 </P>
+<PRE>-accelerator style 
+</PRE>
+<P>Use accelerated variants of various styles if they exist.  The style
+can be <I>opt</I> or <I>gpu</I> or <I>cuda</I>.  The variant styles are part of
+optional packages that LAMMPS can be built with, as described above in
+<A HREF = "#2_3">Section 2.3</A>.  Also see the <A HREF = "accelerator.html">accelerator</A>
+command doc page.  The "opt" style corresponds to the OPT package, the
+"gpu" style to the GPU package, and the "cuda" style to the USER-CUDA
+package.  For example, all of the packages provide a <A HREF = "pair_lj.html">pair_style
+lj/cut</A> variant, with style names lj/cut/opt or
+lj/cut/gpu or lj/cut/cuda.
+</P>
+<P>These accelerated styles can be specified explicitly in your input
+script, e.g. pair_style lj/cut/gpu.  If the -accelerator switch is
+used, you do not need to modify your input script.  The accelerator
+suffix (opt,gpu,cuda) is automatically appended when the style is
+created for atom, pair, fix, compute, and integrate styles.  If an
+accelerated version does not exist, the standard version is created.
+See the <A HREF = "accelerator.html">accelerator</A> command for info on how to
+temporarily turn off this option.
+</P>
 <PRE>-echo style 
 </PRE>
 <P>Set the style of command echoing.  The style can be <I>none</I> or <I>screen</I>
@@ -994,12 +1016,11 @@ processing units (GPUs).  We plan to add more over time.  Currently,
 they only support NVIDIA GPU cards.  To use them you need to install
 certain NVIDIA CUDA software on your system:
 </P>
-<UL><LI>Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go
-<LI>to http://www.nvidia.com/object/cuda_get.html Install a driver and
-<LI>toolkit appropriate for your system (SDK is not necessary) Follow the
-<LI>instructions in README in lammps/lib/gpu to build the library.  Run
-<LI>lammps/lib/gpu/nvc_get_devices to list supported devices and
-<LI>properties 
+<UL><LI>Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
+<LI>Go to http://www.nvidia.com/object/cuda_get.html
+<LI>Install a driver and toolkit appropriate for your system (SDK is not necessary)
+<LI>Follow the instructions in README in lammps/lib/gpu to build the library
+<LI>Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties 
 </UL>
 <H4>GPU configuration 
 </H4>
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index fbdd015ab4..ea0462ae9a 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -777,6 +777,7 @@ At run time, LAMMPS recognizes several optional command-line switches
 which may be used in any order.  Either the full word or the
 one-letter abbreviation can be used:
 
+-accelerator or -a
 -echo or -e
 -partition or -p
 -in or -i
@@ -790,6 +791,27 @@ mpirun -np 16 lmp_ibm -var f tmp.out -log my.log -screen none < in.alloy :pre
 
 Here are the details on the options:
 
+-accelerator style :pre
+
+Use accelerated variants of various styles if they exist.  The style
+can be {opt} or {gpu} or {cuda}.  The variant styles are part of
+optional packages that LAMMPS can be built with, as described above in
+"Section 2.3"_#2_3.  Also see the "accelerator"_accelerator.html
+command doc page.  The "opt" style corresponds to the OPT package, the
+"gpu" style to the GPU package, and the "cuda" style to the USER-CUDA
+package.  For example, all of the packages provide a "pair_style
+lj/cut"_pair_lj.html variant, with style names lj/cut/opt or
+lj/cut/gpu or lj/cut/cuda.
+
+These accelerated styles can be specified explicitly in your input
+script, e.g. pair_style lj/cut/gpu.  If the -accelerator switch is
+used, you do not need to modify your input script.  The accelerator
+suffix (opt,gpu,cuda) is automatically appended when the style is
+created for atom, pair, fix, compute, and integrate styles.  If an
+accelerated version does not exist, the standard version is created.
+See the "accelerator"_accelerator.html command for info on how to
+temporarily turn off this option.
+
 -echo style :pre
 
 Set the style of command echoing.  The style can be {none} or {screen}
@@ -984,12 +1006,11 @@ processing units (GPUs).  We plan to add more over time.  Currently,
 they only support NVIDIA GPU cards.  To use them you need to install
 certain NVIDIA CUDA software on your system:
 
-Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go
-to http://www.nvidia.com/object/cuda_get.html Install a driver and
-toolkit appropriate for your system (SDK is not necessary) Follow the
-instructions in README in lammps/lib/gpu to build the library.  Run
-lammps/lib/gpu/nvc_get_devices to list supported devices and
-properties :ul
+Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
+Go to http://www.nvidia.com/object/cuda_get.html
+Install a driver and toolkit appropriate for your system (SDK is not necessary)
+Follow the instructions in README in lammps/lib/gpu to build the library
+Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul
 
 GPU configuration :h4
 
diff --git a/doc/accelerator.html b/doc/accelerator.html
new file mode 100644
index 0000000000..ff6eca3ec0
--- /dev/null
+++ b/doc/accelerator.html
@@ -0,0 +1,89 @@
+<HTML>
+<CENTER><A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> - <A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<H3>accelerator command 
+</H3>
+<P><B>Syntax:</B>
+</P>
+<PRE>accelerator style args 
+</PRE>
+<UL><LI>style = <I>off</I> or <I>on</I> or <I>cuda</I> 
+
+<LI>args = 0 or more args specific to the style 
+
+<PRE>  <I>off</I> args = none
+  <I>on</I> args = none
+  <I>cuda</I> args = to be determined 
+</PRE>
+
+</UL>
+<P><B>Examples:</B>
+</P>
+<PRE>accelerator off
+accelerator on
+accelerator cuda blah 
+</PRE>
+<P><B>Description:</B>
+</P>
+<P>Alter settings for use of accelerated versions of various styles.
+LAMMPS can be built with optional packages which provide accelerated
+versions of specific <A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>,
+<A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, and <A HREF = "run_style.html">integrate</A>
+styles.
+</P>
+<P>These are the relevant packages:
+</P>
+<UL><LI>OPT = a handful of pair styles, cache-optimized for faster CPU performance
+<LI>GPU = a handful of pair styles and the PPPM kspace_style, optimized to run on one or more GPUs or multicore CPU/GPU nodes
+<LI>USER-CUDA = a collection of atom, pair, fix, compute, and integrate styles, optimized to run on one or more NVIDIA GPUs 
+</UL>
+<P>See <A HREF = "Section_start.html#2_3">this section</A> of the manual for
+instructions on how to build LAMMPS with any of these packages.
+</P>
+<P>These styles can be specified explicitly in your input script,
+e.g. <A HREF = "pair_lj.html">pair_style lj/cut/gpu</A>.  If the -accelerator
+command-line switch is used, you do not need to modify your input
+script, as discussed in <A HREF = "Section_start.html#2_6">this section</A> of the
+manual.  The command-line suffix (opt,gpu,cuda) is automatically
+appended when the style is created for atom, pair, fix, compute, and
+integrate styles.  If an accelerated version does not exist, the
+standard version is created.
+</P>
+<P>If the -accelerator command-line switch is used, you may wish to
+disable it for one or more input script commands, so that the standard
+version of the style is used instead of the accelerated one.  This can
+be useful for performance testing or debugging.
+</P>
+<P>The <I>off</I> style allows you to do this.  The -accelerator
+command-line switch is effectively turned off until another
+accelerator command is used with the <I>on</I> style.  The <I>on</I> style can
+only be used if the -accelerator command-line switch was used.
+</P>
+<P>The <I>cuda</I> style invokes options associated with the use of the
+USER-CUDA package.  These will be described when the USER-CUDA package
+is released with LAMMPS.
+</P>
+<P><B>Restrictions:</B>
+</P>
+<P>The <I>cuda</I> style can only be invoked if LAMMPS was built with the
+USER-CUDA package.  See the <A HREF = "Section_start.html#2_3">Making LAMMPS</A>
+section for more info.
+</P>
+<P>Obviously, you must have GPU hardware and associated software to
+build LAMMPS with GPU support.
+</P>
+<P><B>Related commands:</B>
+</P>
+<P><A HREF = "fix_gpu.html">fix gpu</A>
+</P>
+<P><B>Default:</B> none
+</P>
+</HTML>
diff --git a/doc/accelerator.txt b/doc/accelerator.txt
new file mode 100644
index 0000000000..6f93f33a45
--- /dev/null
+++ b/doc/accelerator.txt
@@ -0,0 +1,81 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+accelerator command :h3
+
+[Syntax:]
+
+accelerator style args :pre
+
+style = {off} or {on} or {cuda} :ulb,l
+args = 0 or more args specific to the style :l
+  {off} args = none
+  {on} args = none
+  {cuda} args = to be determined :pre
+:ule
+
+[Examples:]
+
+accelerator off
+accelerator on
+accelerator cuda blah :pre
+
+[Description:]
+
+Alter settings for use of accelerated versions of various styles.
+LAMMPS can be built with optional packages which provide accelerated
+versions of specific "atom"_atom_style.html, "pair"_pair_style.html,
+"fix"_fix.html, "compute"_compute.html, and "integrate"_run_style.html
+styles.
+
+These are the relevant packages:
+
+OPT = a handful of pair styles, cache-optimized for faster CPU performance
+GPU = a handful of pair styles and the PPPM kspace_style, optimized to run on one or more GPUs or multicore CPU/GPU nodes
+USER-CUDA = a collection of atom, pair, fix, compute, and integrate styles, optimized to run on one or more NVIDIA GPUs :ul
+
+See "this section"_Section_start.html#2_3 of the manual for
+instructions on how to build LAMMPS with any of these packages.
+
+These styles can be specified explicitly in your input script,
+e.g. "pair_style lj/cut/gpu"_pair_lj.html.  If the -accelerator
+command-line switch is used, you do not need to modify your input
+script, as discussed in "this section"_Section_start.html#2_6 of the
+manual.  The command-line suffix (opt,gpu,cuda) is automatically
+appended when the style is created for atom, pair, fix, compute, and
+integrate styles.  If an accelerated version does not exist, the
+standard version is created.
+
+If the -accelerator command-line switch is used, you may wish to
+disable it for one or more input script commands, so that the standard
+version of the style is used instead of the accelerated one.  This can
+be useful for performance testing or debugging.
+
+The {off} style allows you to do this.  The -accelerator
+command-line switch is effectively turned off until another
+accelerator command is used with the {on} style.  The {on} style can
+only be used if the -accelerator command-line switch was used.
+
+The {cuda} style invokes options associated with the use of the
+USER-CUDA package.  These will be described when the USER-CUDA package
+is released with LAMMPS.
+
+[Restrictions:]
+
+The {cuda} style can only be invoked if LAMMPS was built with the
+USER-CUDA package.  See the "Making LAMMPS"_Section_start.html#2_3
+section for more info.
+
+Obviously, you must have GPU hardware and associated software to
+build LAMMPS with GPU support.
+
+[Related commands:]
+
+"fix gpu"_fix_gpu.html
+
+[Default:] none
diff --git a/doc/compute_group_group.html b/doc/compute_group_group.html
index 57e867136a..bb665ca6f7 100644
--- a/doc/compute_group_group.html
+++ b/doc/compute_group_group.html
@@ -49,7 +49,7 @@ section</A> for an overview of LAMMPS output
 options.
 </P>
 <P>Both the scalar and vector values calculated by this compute are
-"extensive"., The scalar value will be in energy <A HREF = "units.html">units</A>.
+"extensive".  The scalar value will be in energy <A HREF = "units.html">units</A>.
 The vector values will be in force <A HREF = "units.html">units</A>.
 </P>
 <P><B>Restrictions:</B>
diff --git a/doc/compute_group_group.txt b/doc/compute_group_group.txt
index 71bfcf6450..5a77bcd561 100644
--- a/doc/compute_group_group.txt
+++ b/doc/compute_group_group.txt
@@ -46,7 +46,7 @@ section"_Section_howto.html#4_15 for an overview of LAMMPS output
 options.
 
 Both the scalar and vector values calculated by this compute are
-"extensive"., The scalar value will be in energy "units"_units.html.
+"extensive".  The scalar value will be in energy "units"_units.html.
 The vector values will be in force "units"_units.html.
 
 [Restrictions:]
diff --git a/doc/fix_adapt.html b/doc/fix_adapt.html
index 837f898199..c26a9b2a39 100644
--- a/doc/fix_adapt.html
+++ b/doc/fix_adapt.html
@@ -54,7 +54,7 @@
 </P>
 <PRE>fix 1 all adapt 1 pair soft a 1 1 v_prefactor
 fix 1 all adapt 1 pair soft a 2* 3 v_prefactor
-fix 1 all adapt 1 pair lj/cut epsilon * * v_scale1 coul/cut pre 3 3 v_scale2 scale yes reset yes
+fix 1 all adapt 1 pair lj/cut epsilon * * v_scale1 coul/cut scale 3 3 v_scale2 scale yes reset yes
 fix 1 all adapt 10 atom diameter v_size 
 </PRE>
 <P><B>Description:</B>
diff --git a/doc/fix_adapt.txt b/doc/fix_adapt.txt
index f281c2cf01..3c6ecad3eb 100644
--- a/doc/fix_adapt.txt
+++ b/doc/fix_adapt.txt
@@ -41,7 +41,7 @@ keyword = {scale} or {reset} :l
 
 fix 1 all adapt 1 pair soft a 1 1 v_prefactor
 fix 1 all adapt 1 pair soft a 2* 3 v_prefactor
-fix 1 all adapt 1 pair lj/cut epsilon * * v_scale1 coul/cut pre 3 3 v_scale2 scale yes reset yes
+fix 1 all adapt 1 pair lj/cut epsilon * * v_scale1 coul/cut scale 3 3 v_scale2 scale yes reset yes
 fix 1 all adapt 10 atom diameter v_size :pre
 
 [Description:]
diff --git a/doc/fix_gpu.html b/doc/fix_gpu.html
index f71a8e8a4a..3560baf21f 100644
--- a/doc/fix_gpu.html
+++ b/doc/fix_gpu.html
@@ -45,39 +45,38 @@ specified for a run or an error will be generated. The fix will not have an
 effect on any LAMMPS computations that do not use GPU acceleration, so there 
 should not be any problems with specifying this fix first in input scripts.
 </P>
-<P><I>mode</I> specifies where neighbor list calculations will be performed.
-If <I>mode</I> is force, neighbor list calculation is performed on the
-CPU. If <I>mode</I> is force/neigh, neighbor list calculation is 
-performed on the GPU. GPU neighbor list calculation currently cannot be
-used with a triclinic box. GPU neighbor list calculation currently
-cannot be used with <A HREF = "pair_hybrid.html">hybrid</A> pair styles.
-GPU neighbor lists are not compatible with styles that are not GPU-enabled.
-When a non-GPU enabled style requires a neighbor list, it will also be
-built using CPU routines. In these cases, it will typically be more efficient
-to only use CPU neighbor list builds.
+<P>The <I>mode</I> setting specifies where neighbor list calculations will be
+performed.  If <I>mode</I> is force, neighbor list calculation is performed
+on the CPU. If <I>mode</I> is force/neigh, neighbor list calculation is
+performed on the GPU. GPU neighbor list calculation currently cannot
+be used with a triclinic box. GPU neighbor list calculation currently
+cannot be used with <A HREF = "pair_hybrid.html">hybrid</A> pair styles.  GPU
+neighbor lists are not compatible with styles that are not
+GPU-enabled.  When a non-GPU enabled style requires a neighbor list,
+it will also be built using CPU routines. In these cases, it will
+typically be more efficient to only use CPU neighbor list builds.
 </P>
-<P><I>first</I> and <I>last</I> specify the GPUs that will be used for simulation.
-On each node, the GPU IDs in the inclusive range from <I>first</I> to <I>last</I> will
-be used.
+<P>The <I>first</I> and <I>last</I> settings specify the GPUs that will be used for
+simulation.  On each node, the GPU IDs in the inclusive range from
+<I>first</I> to <I>last</I> will be used.
 </P>
-<P><I>split</I> can be used for load balancing force calculation work between
-CPU and GPU cores in GPU-enabled pair styles. If 0<<I>split</I><1.0, 
-a fixed fraction of particles is offloaded to the GPU while force calculation
-for the other particles occurs simulataneously on the CPU. If <I>split</I><0,
-the optimal fraction (based on CPU and GPU timings) is calculated
-every 25 timesteps. If <I>split</I>=1.0, all force calculations for 
-GPU accelerated pair styles are performed
-on the GPU. In this case, <A HREF = "pair_hybrid.html">hybrid</A>, 
-<A HREF = "bond_style.html">bond</A>, <A HREF = "angle_style.html">angle</A>, 
-<A HREF = "dihedral_style.html">dihedral</A>, <A HREF = "improper_style.html">improper</A>, 
-and <A HREF = "kspace_style.html">long-range</A> calculations can be performed on the CPU 
-while the GPU is performing force calculations for the GPU-enabled pair
-style.
+<P>The <I>split</I> setting can be used for load balancing force calculation
+work between CPU and GPU cores in GPU-enabled pair styles. If
+0<<I>split</I><1.0, a fixed fraction of particles is offloaded to the GPU
+while force calculation for the other particles occurs simultaneously
+on the CPU. If <I>split</I><0, the optimal fraction (based on CPU and GPU
+timings) is calculated every 25 timesteps. If <I>split</I>=1.0, all force
+calculations for GPU accelerated pair styles are performed on the
+GPU. In this case, <A HREF = "pair_hybrid.html">hybrid</A>, <A HREF = "bond_style.html">bond</A>,
+<A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>,
+<A HREF = "improper_style.html">improper</A>, and <A HREF = "kspace_style.html">long-range</A>
+calculations can be performed on the CPU while the GPU is performing
+force calculations for the GPU-enabled pair style.
 </P>
-<P>In order to use GPU acceleration, a GPU enabled style must be
-selected in the input script in addition to this fix. Currently,
-this is limited to a few <A HREF = "pair_style.html">pair styles</A> and
-the PPPM <A HREF = "kspace_style.html">kspace style</A>.
+<P>In order to use GPU acceleration, a GPU enabled style must be selected
+in the input script in addition to this fix. Currently, this is
+limited to a few <A HREF = "pair_style.html">pair styles</A> and the PPPM <A HREF = "kspace_style.html">kspace
+style</A>.
 </P>
 <P>More details about these settings and various possible hardware
 configuration are in <A HREF = "Section_start.html#2_8">this section</A> of the
@@ -85,6 +84,10 @@ manual.
 </P>
 <P><B>Restart, fix_modify, output, run start/stop, minimize info:</B>
 </P>
+<P>This fix is part of the "gpu" package.  It is only enabled if LAMMPS
+was built with that package.  See the <A HREF = "Section_start.html#2_3">Making
+LAMMPS</A> section for more info.
+</P>
 <P>No information about this fix is written to <A HREF = "restart.html">binary restart
 files</A>.  None of the <A HREF = "fix_modify.html">fix_modify</A> options
 are relevant to this fix.
@@ -98,7 +101,8 @@ the <A HREF = "run.html">run</A> command.
 <I>mode</I> should not be used with a triclinic box or <A HREF = "pair_hybrid.html">hybrid</A>
 pair styles.
 </P>
-<P><I>split</I> must be positive when using <A HREF = "pair_hybrid.html">hybrid</A> pair styles.
+<P>The <I>split</I> setting must be positive when using
+<A HREF = "pair_hybrid.html">hybrid</A> pair styles.
 </P>
 <P>Currently, group-ID must be all.
 </P>
diff --git a/doc/fix_gpu.txt b/doc/fix_gpu.txt
index df8fbadb8f..e4cd41f1de 100644
--- a/doc/fix_gpu.txt
+++ b/doc/fix_gpu.txt
@@ -36,39 +36,38 @@ specified for a run or an error will be generated. The fix will not have an
 effect on any LAMMPS computations that do not use GPU acceleration, so there 
 should not be any problems with specifying this fix first in input scripts.
 
-{mode} specifies where neighbor list calculations will be performed.
-If {mode} is force, neighbor list calculation is performed on the
-CPU. If {mode} is force/neigh, neighbor list calculation is 
-performed on the GPU. GPU neighbor list calculation currently cannot be
-used with a triclinic box. GPU neighbor list calculation currently
-cannot be used with "hybrid"_pair_hybrid.html pair styles.
-GPU neighbor lists are not compatible with styles that are not GPU-enabled.
-When a non-GPU enabled style requires a neighbor list, it will also be
-built using CPU routines. In these cases, it will typically be more efficient
-to only use CPU neighbor list builds.
+The {mode} setting specifies where neighbor list calculations will be
+performed.  If {mode} is force, neighbor list calculation is performed
+on the CPU. If {mode} is force/neigh, neighbor list calculation is
+performed on the GPU. GPU neighbor list calculation currently cannot
+be used with a triclinic box. GPU neighbor list calculation currently
+cannot be used with "hybrid"_pair_hybrid.html pair styles.  GPU
+neighbor lists are not compatible with styles that are not
+GPU-enabled.  When a non-GPU enabled style requires a neighbor list,
+it will also be built using CPU routines. In these cases, it will
+typically be more efficient to only use CPU neighbor list builds.
 
-{first} and {last} specify the GPUs that will be used for simulation.
-On each node, the GPU IDs in the inclusive range from {first} to {last} will
-be used.
+The {first} and {last} settings specify the GPUs that will be used for
+simulation.  On each node, the GPU IDs in the inclusive range from
+{first} to {last} will be used.
 
-{split} can be used for load balancing force calculation work between
-CPU and GPU cores in GPU-enabled pair styles. If 0<{split}<1.0, 
-a fixed fraction of particles is offloaded to the GPU while force calculation
-for the other particles occurs simulataneously on the CPU. If {split}<0,
-the optimal fraction (based on CPU and GPU timings) is calculated
-every 25 timesteps. If {split}=1.0, all force calculations for 
-GPU accelerated pair styles are performed
-on the GPU. In this case, "hybrid"_pair_hybrid.html, 
-"bond"_bond_style.html, "angle"_angle_style.html, 
-"dihedral"_dihedral_style.html, "improper"_improper_style.html, 
-and "long-range"_kspace_style.html calculations can be performed on the CPU 
-while the GPU is performing force calculations for the GPU-enabled pair
-style.
+The {split} setting can be used for load balancing force calculation
+work between CPU and GPU cores in GPU-enabled pair styles. If
+0<{split}<1.0, a fixed fraction of particles is offloaded to the GPU
+while force calculation for the other particles occurs simultaneously
+on the CPU. If {split}<0, the optimal fraction (based on CPU and GPU
+timings) is calculated every 25 timesteps. If {split}=1.0, all force
+calculations for GPU accelerated pair styles are performed on the
+GPU. In this case, "hybrid"_pair_hybrid.html, "bond"_bond_style.html,
+"angle"_angle_style.html, "dihedral"_dihedral_style.html,
+"improper"_improper_style.html, and "long-range"_kspace_style.html
+calculations can be performed on the CPU while the GPU is performing
+force calculations for the GPU-enabled pair style.
 
-In order to use GPU acceleration, a GPU enabled style must be
-selected in the input script in addition to this fix. Currently,
-this is limited to a few "pair styles"_pair_style.html and
-the PPPM "kspace style"_kspace_style.html.
+In order to use GPU acceleration, a GPU enabled style must be selected
+in the input script in addition to this fix. Currently, this is
+limited to a few "pair styles"_pair_style.html and the PPPM "kspace
+style"_kspace_style.html.
 
 More details about these settings and various possible hardware
 configuration are in "this section"_Section_start.html#2_8 of the
@@ -76,6 +75,10 @@ manual.
 
 [Restart, fix_modify, output, run start/stop, minimize info:]
 
+This fix is part of the "gpu" package.  It is only enabled if LAMMPS
+was built with that package.  See the "Making
+LAMMPS"_Section_start.html#2_3 section for more info.
+
 No information about this fix is written to "binary restart
 files"_restart.html.  None of the "fix_modify"_fix_modify.html options
 are relevant to this fix.
@@ -89,7 +92,8 @@ The fix must be the first fix specified for a given run. The force/neigh
 {mode} should not be used with a triclinic box or "hybrid"_pair_hybrid.html
 pair styles.
 
-{split} must be positive when using "hybrid"_pair_hybrid.html pair styles.
+The {split} setting must be positive when using
+"hybrid"_pair_hybrid.html pair styles.
 
 Currently, group-ID must be all.
 
diff --git a/examples/reax/control.reax_c.tatb b/examples/reax/control.reax_c.tatb
index 13171ebe4a..38c8fce313 100644
--- a/examples/reax/control.reax_c.tatb
+++ b/examples/reax/control.reax_c.tatb
@@ -11,11 +11,12 @@ thb_cutoff              0.001 ! cutoff value for three body interactions
 q_err                   1e-6  ! average per atom error norm allowed in GMRES convergence
 
 geo_format              0    ! 0: xyz, 1: pdb, 2: bgf
-write_freq              0    ! write trajectory after so many steps
+write_freq              25   ! write trajectory after so many steps
 traj_compress           0    ! 0: no compression  1: uses zlib to compress trajectory output
 traj_title              TATB ! (no white spaces)
 atom_info               0    ! 0: no atom info, 1: print basic atom info in the trajectory file
 atom_forces             0    ! 0: basic atom format, 1: print force on each atom in the trajectory file
 atom_velocities         0    ! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
-bond_info               0    ! 0: do not print bonds, 1: print bonds in the trajectory file
+bond_info               1    ! 0: do not print bonds, 1: print bonds in the trajectory file
 angle_info              0    ! 0: do not print angles, 1: print angles in the trajectory file 
+
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index d69a00a817..1777187010 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -20,7 +20,11 @@
 CUDA_HOME = /usr/local/cuda
 NVCC = nvcc
 
+# newer CUDA
 CUDA_ARCH = -arch=sm_13
+# older CUDA
+#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
+
 CUDA_PRECISION = -D_SINGLE_SINGLE
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
diff --git a/lib/gpu/README b/lib/gpu/README
index a60d43064a..73a51fc391 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -33,13 +33,17 @@ NOTE: Installation of the CUDA SDK is not required.
 
 Current pair styles supporting GPU acceleration:
 
-  1. lj/cut/gpu
-  2. lj/cut/coul/cut/gpu
-  3. lj/cut/coul/long/gpu
-  4. lj96/cut/gpu
-  5. gayberne/gpu
-  6. cmm/cg/gpu
-  7. cmm/cg/coul/long/gpu
+  1. lj/cut
+  2. lj96/cut
+  3. lj/expand
+  4. lj/cut/coul/cut
+  5. lj/cut/coul/long
+  6. lj/charmm/coul/long
+  7. morse
+  8. cg/cmm
+  9. cg/cmm/coul/long
+ 10. gayberne
+ 11. pppm
 
                      MULTIPLE LAMMPS PROCESSES
                      
@@ -52,12 +56,12 @@ LAMMPS user manual for details on running with GPU acceleration.
 
                     BUILDING AND PRECISION MODES
 
-To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUD,
-CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
-be set based on the compute capability of your GPU. This can be verified by
-running the nvc_get_devices executable after the build is complete.
-Additionally, the GPU package must be installed and compiled for LAMMPS.
-This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.
+To build, edit the CUDA_ARCH, CUDA_PRECISION, and CUDA_HOME variables in one of
+the Makefiles. CUDA_ARCH should be set based on the compute capability of 
+your GPU. This can be verified by running the nvc_get_devices executable after
+the build is complete. Additionally, the GPU package must be installed and
+compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
+LAMMPS makefile.
 
 Please note that the GPU library accesses the CUDA driver library directly,
 so it needs to be linked not only to the CUDA runtime library (libcudart.so)
@@ -74,6 +78,10 @@ the CUDA_PRECISION variable:
   CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
   CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
 
+NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
+      You will get the error "GPU library not compiled for this accelerator."
+      when attempting to run PPPM on a GPU with compute capability 1.0.
+
 NOTE: Double precision is only supported on certain GPUs (with
       compute capability>=1.3).
       
@@ -83,15 +91,17 @@ NOTE: For Tesla and other graphics cards with compute capability>=1.3,
 NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.
 
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
-      package has been installed before installing the GPU package in LAMMPS.
+      package has been installed.
 
 NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
-      installed if the USER-CG-CMM package has been installed before
-      installing the GPU package in LAMMPS.
+      installed if the USER-CG-CMM package has been installed.
 
-NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu style will only be 
-      installed if the KSPACE package has been installed before installing
-      the GPU package in LAMMPS.
+NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and
+      pppm/gpu/double styles will only be installed if the KSPACE package has
+      been installed.
+
+NOTE: The lj/charmm/coul/long/gpu pair style will only be installed if the
+      MOLECULE package has been installed.
 
                       EXAMPLE BUILD PROCESS
                     
@@ -105,7 +115,3 @@ make yes-asphere
 make yes-kspace
 make yes-gpu
 make linux
-
-------------------------------------------------------------------------
-Last merge with gpulammps: r561 on 2010-11-12
-------------------------------------------------------------------------
diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu
index 08cc31ed7f..f99e7f06ac 100644
--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef CMM_GPU_KERNEL
 #define CMM_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index 5153cb5016..a47a9267a1 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -18,38 +18,6 @@
 #ifndef CMML_GPU_KERNEL
 #define CMML_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu
index 63ce924581..dfdc7af3cd 100644
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@@ -18,40 +18,6 @@
 #ifndef CRML_GPU_KERNEL
 #define CRML_GPU_KERNEL
 
-#define MAX_BIO_SHARED_TYPES 128
-
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -94,6 +60,40 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#define MAX_BIO_SHARED_TYPES 128
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/ellipsoid_nbor.cu b/lib/gpu/ellipsoid_nbor.cu
index ef38962c23..67f9e631b4 100644
--- a/lib/gpu/ellipsoid_nbor.cu
+++ b/lib/gpu/ellipsoid_nbor.cu
@@ -16,16 +16,6 @@
 #ifndef ELLIPSOID_NBOR_H
 #define ELLIPSOID_NBOR_H
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#else
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -42,6 +32,16 @@
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#else
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#endif
+
 // ---------------------------------------------------------------------------
 // Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
 // -- Only unpack neighbors matching the specified inclusive range of forms
diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h
index 5c45dc3a87..65a51b5f04 100644
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@@ -33,6 +33,14 @@
 #define MEM_THREADS 32
 #endif
 
+#ifdef CUDA_PRE_THREE
+struct __builtin_align__(16) _double4
+{
+  double x, y, z, w;
+};
+typedef struct _double4 double4;
+#endif
+
 #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
 #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
 #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu
index 3fc6a2f308..1de9a8a7bf 100644
--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJ96_GPU_KERNEL
 #define LJ96_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu
index 75f36446f7..9ef698cd09 100644
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJ_GPU_KERNEL
 #define LJ_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/lj_expand_gpu_kernel.cu b/lib/gpu/lj_expand_gpu_kernel.cu
index 2d09b4d941..26fbefacf8 100644
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJE_GPU_KERNEL
 #define LJE_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu
index 44a607588a..ad1e530712 100644
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef LJC_GPU_KERNEL
 #define LJC_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -85,6 +61,30 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index 7be7a86114..ddde1dec32 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -18,38 +18,6 @@
 #ifndef LJCL_GPU_KERNEL
 #define LJCL_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
-#define EWALD_F (numtyp)1.12837917
-#define EWALD_P (numtyp)0.3275911
-#define A1 (numtyp)0.254829592
-#define A2 (numtyp)-0.284496736
-#define A3 (numtyp)1.421413741
-#define A4 (numtyp)-1.453152027
-#define A5 (numtyp)1.061405429
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -93,6 +61,38 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#define EWALD_F (numtyp)1.12837917
+#define EWALD_P (numtyp)0.3275911
+#define A1 (numtyp)0.254829592
+#define A2 (numtyp)-0.284496736
+#define A3 (numtyp)1.421413741
+#define A4 (numtyp)-1.453152027
+#define A5 (numtyp)1.061405429
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/morse_gpu_kernel.cu b/lib/gpu/morse_gpu_kernel.cu
index 0a89aae070..8832f58c64 100644
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@@ -18,30 +18,6 @@
 #ifndef MORSE_GPU_KERNEL
 #define MORSE_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp2 double2
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp2 float2
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "nv_kernel_def.h"
@@ -75,6 +51,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 #define SBBITS 30
 #define NEIGHMASK 0x3FFFFFFF
 __inline int sbmask(int j) { return j >> SBBITS & 3; }
diff --git a/lib/gpu/pair_gpu_atom_kernel.cu b/lib/gpu/pair_gpu_atom_kernel.cu
index 2d1a6ba85f..ab79ac6e9c 100644
--- a/lib/gpu/pair_gpu_atom_kernel.cu
+++ b/lib/gpu/pair_gpu_atom_kernel.cu
@@ -15,6 +15,13 @@
    Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
 ------------------------------------------------------------------------- */
 
+#ifdef NV_KERNEL
+#include "geryon/ucl_nv_kernel.h"
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#endif
+
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
 #define numtyp4 double4
@@ -23,13 +30,6 @@
 #define numtyp4 float4
 #endif
 
-#ifdef NV_KERNEL
-#include "geryon/ucl_nv_kernel.h"
-#else
-#pragma OPENCL EXTENSION cl_khr_fp64: enable
-#define GLOBAL_ID_X get_global_id(0)
-#endif
-
 __kernel void kernel_cast_x(__global numtyp4 *x_type, __global double *x,
                             __global int *type, const int nall) {
   int ii=GLOBAL_ID_X;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index d5906b10e5..165d202832 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -549,8 +549,9 @@ int PairGPUDeviceT::compile_kernels() {
   k_info.run(&d_gpu_lib_data.begin());
   ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
   
+  _ptx_arch=static_cast<double>(h_gpu_lib_data[0])/100.0;
   #ifndef USE_OPENCL
-  if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
+  if (_ptx_arch>gpu->arch())
     return -4;
   #endif
 
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 1e7e15e6a8..52b35cfcf2 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -226,6 +226,8 @@ class PairGPUDevice {
   inline int block_bio_pair() const { return _block_bio_pair; }
   /// Return the maximum number of atom types for shared mem with "bio" styles
   inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Architecture gpu code compiled for (returns 0 for OpenCL)
+  inline double ptx_arch() const { return _ptx_arch; }
 
   // -------------------- SHARED DEVICE ROUTINES -------------------- 
   // Perform asynchronous zero of integer array 
@@ -281,6 +283,7 @@ class PairGPUDevice {
   int _gpu_mode, _first_device, _last_device, _nthreads;
   double _particle_split;
   double _cpu_full;
+  double _ptx_arch;
 
   int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
   int _pppm_max_spline, _pppm_block;
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index c04e784de8..fe1862d051 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -18,27 +18,6 @@
 #ifndef PPPM_GPU_KERNEL
 #define PPPM_GPU_KERNEL
 
-#ifdef _DOUBLE_DOUBLE
-#define numtyp double
-#define numtyp4 double4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifdef _SINGLE_DOUBLE
-#define numtyp float
-#define numtyp4 float4
-#define acctyp double
-#define acctyp4 double4
-#endif
-
-#ifndef numtyp
-#define numtyp float
-#define numtyp4 float4
-#define acctyp float
-#define acctyp4 float4
-#endif
-
 #ifdef NV_KERNEL
 
 #include "geryon/ucl_nv_kernel.h"
@@ -67,6 +46,12 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+// Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
+// generated at runtime with use of pppm/gpu
+#if (__CUDA_ARCH__ < 110)
+#define atom_add(x,y) 0
+#endif
+
 #else
 
 #pragma OPENCL EXTENSION cl_khr_fp64: enable
@@ -85,6 +70,27 @@ __inline float fetch_q(const int& i, const float *q)
 
 #endif
 
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
 // Maximum order for spline
 #define PPPM_MAX_SPLINE 8
 // Thread block size for PPPM kernels
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 521b3b1e46..2f7b35d051 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -66,7 +66,11 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
     flag=-5;
     return 0;
   }
-  
+  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
+    flag=-4;
+    return 0;
+  }
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/src/ASPHERE/Install.sh b/src/ASPHERE/Install.sh
index 7de96460f3..3faf9fe241 100644
--- a/src/ASPHERE/Install.sh
+++ b/src/ASPHERE/Install.sh
@@ -1,7 +1,4 @@
 # Install/unInstall package files in LAMMPS
-# for unInstall, also unInstall/Install GPU package if installed
-#   so it will remove GPU files that depend on ASPHERE files,
-#   then replace others
 
 if (test $1 = 1) then
 
@@ -25,10 +22,6 @@ if (test $1 = 1) then
   cp pair_gayberne.h ..
   cp pair_resquared.h ..
 
-  if (test -e ../pair_lj_cut_gpu.h) then
-    cd ../GPU; /bin/sh Install.sh 1
-  fi
-
 elif (test $1 = 0) then
 
   rm ../compute_erotate_asphere.cpp
@@ -51,8 +44,4 @@ elif (test $1 = 0) then
   rm ../pair_gayberne.h
   rm ../pair_resquared.h
 
-  if (test -e ../pair_gayberne_gpu.h) then
-    cd ../GPU; /bin/sh Install.sh 0; /bin/sh Install.sh 1
-  fi
-
 fi
diff --git a/src/ASPHERE/pair_resquared.cpp b/src/ASPHERE/pair_resquared.cpp
index 8241e1a6c7..6831f7bd20 100755
--- a/src/ASPHERE/pair_resquared.cpp
+++ b/src/ASPHERE/pair_resquared.cpp
@@ -503,8 +503,8 @@ void PairRESquared::precompute_i(const int i,RE2Vars &ws)
   int *ellipsoid = atom->ellipsoid;
   AtomVecEllipsoid::Bonus *bonus = avec->bonus;
   MathExtra::quat_to_mat_trans(bonus[ellipsoid[i]].quat,ws.A);
-  MathExtra::transpose_times_diag3(ws.A,well[atom->type[i]],ws.aTe);
-  MathExtra::transpose_times_diag3(ws.A,shape2[atom->type[i]],aTs);
+  MathExtra::transpose_diag3(ws.A,well[atom->type[i]],ws.aTe);
+  MathExtra::transpose_diag3(ws.A,shape2[atom->type[i]],aTs);
   MathExtra::diag_times3(shape2[atom->type[i]],ws.A,ws.sa);
   MathExtra::times3(aTs,ws.A,ws.gamma);
   MathExtra::rotation_generator_x(ws.A,ws.lA[0]);
@@ -885,7 +885,7 @@ double PairRESquared::resquared_lj(const int i, const int j,
   scorrect[0] = scorrect[0] * scorrect[0] / 2.0;
   scorrect[1] = scorrect[1] * scorrect[1] / 2.0;
   scorrect[2] = scorrect[2] * scorrect[2] / 2.0;
-  MathExtra::transpose_times_diag3(wi.A,scorrect,aTs);
+  MathExtra::transpose_diag3(wi.A,scorrect,aTs);
   MathExtra::times3(aTs,wi.A,gamma);
   for (int ii=0; ii<3; ii++)
     MathExtra::times3(aTs,wi.lA[ii],lAtwo[ii]);
diff --git a/src/CLASS2/pair_lj_class2.h b/src/CLASS2/pair_lj_class2.h
index f2c5a22841..764b8b8c92 100644
--- a/src/CLASS2/pair_lj_class2.h
+++ b/src/CLASS2/pair_lj_class2.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairLJClass2 : public Pair {
  public:
   PairLJClass2(class LAMMPS *);
-  ~PairLJClass2();
+  virtual ~PairLJClass2();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -38,7 +38,7 @@ class PairLJClass2 : public Pair {
   void read_restart_settings(FILE *);
   double single(int, int, int, int, double, double, double, double &);
 
- private:
+ protected:
   double cut_global;
   double **cut;
   double **epsilon,**sigma;
diff --git a/src/CLASS2/pair_lj_class2_coul_cut.h b/src/CLASS2/pair_lj_class2_coul_cut.h
index 0fd36104dc..12fabc3f1a 100644
--- a/src/CLASS2/pair_lj_class2_coul_cut.h
+++ b/src/CLASS2/pair_lj_class2_coul_cut.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairLJClass2CoulCut : public Pair {
  public:
   PairLJClass2CoulCut(class LAMMPS *);
-  ~PairLJClass2CoulCut();
+  virtual ~PairLJClass2CoulCut();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -39,7 +39,7 @@ class PairLJClass2CoulCut : public Pair {
   void read_restart_settings(FILE *);
   double single(int, int, int, int, double, double, double, double &);
 
- private:
+ protected:
   double cut_lj_global,cut_coul_global;
   double **cut_lj,**cut_ljsq;
   double **cut_coul,**cut_coulsq;
diff --git a/src/CLASS2/pair_lj_class2_coul_long.h b/src/CLASS2/pair_lj_class2_coul_long.h
index 154d67412c..d1b19f84cf 100644
--- a/src/CLASS2/pair_lj_class2_coul_long.h
+++ b/src/CLASS2/pair_lj_class2_coul_long.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairLJClass2CoulLong : public Pair {
  public:
   PairLJClass2CoulLong(class LAMMPS *);
-  ~PairLJClass2CoulLong();
+  virtual ~PairLJClass2CoulLong();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -40,7 +40,7 @@ class PairLJClass2CoulLong : public Pair {
   double single(int, int, int, int, double, double, double, double &);
   void *extract(char *, int &);
 
- private:
+ protected:
   double cut_lj_global;
   double **cut_lj,**cut_ljsq;
   double cut_coul,cut_coulsq;
diff --git a/src/Depend.sh b/src/Depend.sh
new file mode 100644
index 0000000000..f6f3d43d08
--- /dev/null
+++ b/src/Depend.sh
@@ -0,0 +1,36 @@
+# Depend.sh = refresh dependent (child) packages after a parent package changes
+# run from src with $1 = 1 (parent installed) or $1 = 0 (parent uninstalled)
+# a child is touched only if already installed, detected via one of its files
+# on uninstall the child is removed, then re-installed without parent-only files
+
+if (test $1 = 1) then
+
+  if (test -e pair_lj_cut_opt.h) then
+    cd OPT; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e pair_lj_cut_gpu.h) then
+    cd GPU; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e cg_cmm_parms.h) then    # name as copied by USER-CG-CMM/Install.sh
+    cd USER-CG-CMM; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e pair_lj_cut_cuda.h) then
+    cd USER-CUDA; /bin/sh Install.sh 1; cd ..
+  fi
+
+elif (test $1 = 0) then
+
+  if (test -e pair_lj_cut_opt.h) then
+    cd OPT; /bin/sh Install.sh 0; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e pair_lj_cut_gpu.h) then
+    cd GPU; /bin/sh Install.sh 0; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e cg_cmm_parms.h) then    # name as copied by USER-CG-CMM/Install.sh
+    cd USER-CG-CMM; /bin/sh Install.sh 0; /bin/sh Install.sh 1; cd ..
+  fi
+  if (test -e pair_lj_cut_cuda.h) then
+    cd USER-CUDA; /bin/sh Install.sh 0; /bin/sh Install.sh 1; cd ..
+  fi
+
+fi
diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh
index 202b4c956d..a6c1547dbb 100644
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@@ -1,7 +1,6 @@
 # Install/unInstall package files in LAMMPS
-# edit Makefile.package to include/exclude GPU library
-# do not copy gayberne files if non-GPU version does not exist
-# do not copy charmm files if non-GPU version does not exist
+# edit Makefile.package to include/exclude GPU info
+# do not install child files if parent does not exist
 
 if (test $1 = 1) then
 
@@ -10,17 +9,9 @@ if (test $1 = 1) then
     sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package
     sed -i -e 's|^PKG_PATH =[ \t]*|&-L../../lib/gpu |' ../Makefile.package
     sed -i -e 's|^PKG_LIB =[ \t]*|&-lgpu |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(gpu_SYSINC) |' ../Makefile.package
     sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package
-  fi
-  
-  if (test -e ../pppm.cpp) then
-    cp pppm_gpu.cpp ..
-    cp pppm_gpu_single.cpp ..
-    cp pppm_gpu_double.cpp ..
-    cp pppm_gpu.h ..
-    cp pppm_gpu_single.h ..
-    cp pppm_gpu_double.h ..
+    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(gpu_SYSPATH) |' ../Makefile.package
   fi
   
   if (test -e ../pair_gayberne.cpp) then
@@ -54,12 +45,24 @@ if (test $1 = 1) then
     cp pair_cg_cmm_coul_msm_gpu.h ..
   fi
 
+  if (test -e ../pppm.cpp) then
+    cp pppm_gpu.cpp ..
+    cp pppm_gpu_single.cpp ..
+    cp pppm_gpu_double.cpp ..
+    cp pppm_gpu.h ..
+    cp pppm_gpu_single.h ..
+    cp pppm_gpu_double.h ..
+  fi
+
   cp pair_lj_cut_gpu.cpp ..
   cp pair_morse_gpu.cpp ..
   cp pair_lj96_cut_gpu.cpp ..
   cp pair_lj_expand_gpu.cpp ..
   cp pair_lj_cut_coul_cut_gpu.cpp ..
   cp pair_lj_cut_tgpu.cpp ..
+
+  cp fix_gpu.cpp ..
+
   cp pair_lj_cut_gpu.h ..
   cp pair_morse_gpu.h ..
   cp pair_lj96_cut_gpu.h ..
@@ -67,7 +70,6 @@ if (test $1 = 1) then
   cp pair_lj_cut_coul_cut_gpu.h ..
   cp pair_lj_cut_tgpu.h ..
   
-  cp fix_gpu.cpp ..
   cp fix_gpu.h ..
   cp gpu_extra.h ..
 
@@ -98,6 +100,7 @@ elif (test $1 = 0) then
   rm ../pair_cg_cmm_coul_long_gpu.cpp
   rm ../pair_cg_cmm_coul_msm.cpp
   rm ../pair_cg_cmm_coul_msm_gpu.cpp
+
   rm ../fix_gpu.cpp
   rm ../pair_omp_gpu.cpp
 
@@ -118,6 +121,7 @@ elif (test $1 = 0) then
   rm ../pair_cg_cmm_coul_long_gpu.h
   rm ../pair_cg_cmm_coul_msm.h
   rm ../pair_cg_cmm_coul_msm_gpu.h
+
   rm ../fix_gpu.h
   rm ../gpu_extra.h
   rm ../pair_omp_gpu.h
diff --git a/src/GRANULAR/fix_pour.h b/src/GRANULAR/fix_pour.h
index 5d03a6dc40..d279f54046 100644
--- a/src/GRANULAR/fix_pour.h
+++ b/src/GRANULAR/fix_pour.h
@@ -28,6 +28,7 @@ class FixPour : public Fix {
   friend class PairGranHertzHistory;
   friend class PairGranHooke;
   friend class PairGranHookeHistory;
+  friend class PairGranHookeCuda;
 
  public:
   FixPour(class LAMMPS *, int, char **);
diff --git a/src/GRANULAR/pair_gran_hooke_history.h b/src/GRANULAR/pair_gran_hooke_history.h
index a27b6ce916..2bc2cf63cc 100644
--- a/src/GRANULAR/pair_gran_hooke_history.h
+++ b/src/GRANULAR/pair_gran_hooke_history.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairGranHookeHistory : public Pair {
  public:
   PairGranHookeHistory(class LAMMPS *);
-  ~PairGranHookeHistory();
+  virtual ~PairGranHookeHistory();
   virtual void compute(int, int);
   virtual void settings(int, char **);
   void coeff(int, char **);
diff --git a/src/KSPACE/Install.sh b/src/KSPACE/Install.sh
index efecdf33f8..4db0f3e010 100644
--- a/src/KSPACE/Install.sh
+++ b/src/KSPACE/Install.sh
@@ -1,7 +1,4 @@
 # Install/unInstall package files in LAMMPS
-# for unInstall, also unInstall/Install OPT package if installed
-#   so it will remove OPT files that depend on KSPACE files,
-#   then replace others
 
 if (test $1 = 1) then
 
@@ -63,8 +60,4 @@ elif (test $1 = 0) then
   rm ../remap.h
   rm ../remap_wrap.h
 
-  if (test -e ../pair_lj_charmm_coul_long_opt.h) then
-    cd ../OPT; sh Install.sh 0; sh Install.sh 1
-  fi
-
 fi
diff --git a/src/KSPACE/pair_born_coul_long.h b/src/KSPACE/pair_born_coul_long.h
index dfd77c4bb6..9cbeb9deb0 100644
--- a/src/KSPACE/pair_born_coul_long.h
+++ b/src/KSPACE/pair_born_coul_long.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairBornCoulLong : public Pair {
  public:
   PairBornCoulLong(class LAMMPS *);
-  ~PairBornCoulLong();
+  virtual ~PairBornCoulLong();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -40,7 +40,7 @@ class PairBornCoulLong : public Pair {
   double single(int, int, int, int, double, double, double, double &);
   void *extract(char *, int &);
 
- private:
+ protected:
   double cut_lj_global;
   double **cut_lj,**cut_ljsq;
   double cut_coul,cut_coulsq;
diff --git a/src/KSPACE/pair_buck_coul_long.h b/src/KSPACE/pair_buck_coul_long.h
index ada634a11b..44b82d99de 100644
--- a/src/KSPACE/pair_buck_coul_long.h
+++ b/src/KSPACE/pair_buck_coul_long.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairBuckCoulLong : public Pair {
  public:
   PairBuckCoulLong(class LAMMPS *);
-  ~PairBuckCoulLong();
+  virtual ~PairBuckCoulLong();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -40,7 +40,7 @@ class PairBuckCoulLong : public Pair {
   double single(int, int, int, int, double, double, double, double &);
   void *extract(char *, int &);
 
- private:
+ protected:
   double cut_lj_global;
   double **cut_lj,**cut_ljsq;
   double cut_coul,cut_coulsq;
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index 3577d6124c..f0ee75d12a 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PPPM : public KSpace {
  public:
   PPPM(class LAMMPS *, int, char **);
-  ~PPPM();
+  virtual ~PPPM();
   void init();
   void setup();
   void compute(int, int);
diff --git a/src/MAKE/Makefile.g++ b/src/MAKE/Makefile.g++
index 147ac4f388..25144f903a 100755
--- a/src/MAKE/Makefile.g++
+++ b/src/MAKE/Makefile.g++
@@ -38,7 +38,7 @@ MPI_LIB =	-lmpich -lpthread
 # PATH = path for FFT library
 # LIB = name of FFT library
 
-FFT_INC =       -DFFT_FFTW
+FFT_INC =       -DFFT_FFTW 
 FFT_PATH = 
 FFT_LIB =	-lfftw
 
@@ -51,21 +51,30 @@ JPG_INC =
 JPG_PATH = 	
 JPG_LIB =	
 
-# additional system libraries needed by LAMMPS package libraries
+# additional system settings needed by LAMMPS package libraries
 # these settings are IGNORED if the corresponding LAMMPS package
 #   (e.g. gpu, meam) is NOT included in the LAMMPS build
-# SYSLIB = names of libraries
-# SYSPATH = paths of libraries
+# SYSINC = settings to compile with
+# SYSLIB = libraries to link with
+# SYSPATH = paths to libraries
+
+gpu_SYSINC =
+meam_SYSINC =
+reax_SYSINC =
+user-atc_SYSINC =
+user-cuda_SYSINC = -I/usr/local/cuda/include -DCUDA -DCUDA_ARCH=20 -DFFT_CUFFT
 
 gpu_SYSLIB =       -lcudart -lcuda
 meam_SYSLIB =      -lgfortran
 reax_SYSLIB =      -lgfortran
 user-atc_SYSLIB =  -lblas -llapack
+user-cuda_SYSLIB = -lcudart -lcuda
 
 gpu_SYSPATH =      -L/usr/local/cuda/lib64
 meam_SYSPATH =
 reax_SYSPATH =
 user-atc_SYSPATH = 	
+user-cuda_SYSPATH = 	
 
 # ---------------------------------------------------------------------
 # build rules and dependencies
@@ -73,7 +82,7 @@ user-atc_SYSPATH =
 
 include	Makefile.package
 
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC)
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
 EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
 EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
 
diff --git a/src/MANYBODY/Install.sh b/src/MANYBODY/Install.sh
index 0817b08cb2..12a551a6f8 100644
--- a/src/MANYBODY/Install.sh
+++ b/src/MANYBODY/Install.sh
@@ -1,7 +1,4 @@
 # Install/unInstall package files in LAMMPS
-# for unInstall, also unInstall/Install OPT package if installed
-#   so it will remove OPT files that depend on MANYBODY files,
-#   then replace others
 
 if (test $1 = 1) then
 
@@ -27,10 +24,6 @@ if (test $1 = 1) then
   cp pair_tersoff.h ..
   cp pair_tersoff_zbl.h ..
 
-  if (test -e ../pair_lj_cut_opt.h) then
-    cd ../OPT; sh Install.sh 1
-  fi
-
 elif (test $1 = 0) then
 
   rm ../fix_qeq_comb.cpp
@@ -55,8 +48,4 @@ elif (test $1 = 0) then
   rm ../pair_tersoff.h
   rm ../pair_tersoff_zbl.h
 
-  if (test -e ../pair_eam_opt.h) then
-    cd ../OPT; sh Install.sh 0; sh Install.sh 1
-  fi
-
 fi
diff --git a/src/MEAM/Install.sh b/src/MEAM/Install.sh
index e3c9d623f3..a2ff3ad029 100644
--- a/src/MEAM/Install.sh
+++ b/src/MEAM/Install.sh
@@ -1,5 +1,5 @@
 # Install/unInstall package files in LAMMPS
-# edit Makefile.package to include/exclude MEAM library
+# edit Makefile.package to include/exclude MEAM info
 
 if (test $1 = 1) then
 
@@ -9,8 +9,9 @@ if (test $1 = 1) then
     sed -i -e 's|^PKG_INC =[ \t]*|&-I../../lib/meam |' ../Makefile.package
     sed -i -e 's|^PKG_PATH =[ \t]*|&-L../../lib/meam |' ../Makefile.package
     sed -i -e 's|^PKG_LIB =[ \t]*|&-lmeam |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(meam_SYSPATH) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(meam_SYSINC) |' ../Makefile.package
     sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(meam_SYSLIB) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(meam_SYSPATH) |' ../Makefile.package
   fi
 
   cp pair_meam.cpp ..
diff --git a/src/MOLECULE/atom_vec_angle.h b/src/MOLECULE/atom_vec_angle.h
index daf18f2198..def4f67743 100644
--- a/src/MOLECULE/atom_vec_angle.h
+++ b/src/MOLECULE/atom_vec_angle.h
@@ -27,23 +27,24 @@ namespace LAMMPS_NS {
 class AtomVecAngle : public AtomVec {
  public:
   AtomVecAngle(class LAMMPS *, int, char **);
+  virtual ~AtomVecAngle() {}
   void grow(int);
   void grow_reset();
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
   int pack_reverse(int, int, double *);
   void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
+  virtual int pack_border(int, int *, double *, int, int *);
+  virtual int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
+  virtual void unpack_border(int, int, double *);
+  virtual void unpack_border_vel(int, int, double *);
   int unpack_border_hybrid(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
+  virtual int pack_exchange(int, double *);
+  virtual int unpack_exchange(double *);
   int size_restart();
   int pack_restart(int, double *);
   int unpack_restart(double *);
@@ -52,7 +53,7 @@ class AtomVecAngle : public AtomVec {
   int data_atom_hybrid(int, char **);
   bigint memory_usage();
 
- private:
+ protected:
   int *tag,*type,*mask,*image;
   double **x,**v,**f;
   int *molecule;
diff --git a/src/MOLECULE/atom_vec_full.h b/src/MOLECULE/atom_vec_full.h
index 408cb518ff..8749c183de 100644
--- a/src/MOLECULE/atom_vec_full.h
+++ b/src/MOLECULE/atom_vec_full.h
@@ -27,23 +27,24 @@ namespace LAMMPS_NS {
 class AtomVecFull : public AtomVec {
  public:
   AtomVecFull(class LAMMPS *, int, char **);
+  virtual ~AtomVecFull() {}
   void grow(int);
   void grow_reset();
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
   int pack_reverse(int, int, double *);
   void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
+  virtual int pack_border(int, int *, double *, int, int *);
+  virtual int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
+  virtual void unpack_border(int, int, double *);
+  virtual void unpack_border_vel(int, int, double *);
   int unpack_border_hybrid(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
+  virtual int pack_exchange(int, double *);
+  virtual int unpack_exchange(double *);
   int size_restart();
   int pack_restart(int, double *);
   int unpack_restart(double *);
@@ -52,7 +53,7 @@ class AtomVecFull : public AtomVec {
   int data_atom_hybrid(int, char **);
   bigint memory_usage();
 
- private:
+ protected:
   int *tag,*type,*mask,*image;
   double **x,**v,**f;
   double *q;
diff --git a/src/MOLECULE/pair_lj_charmm_coul_charmm.h b/src/MOLECULE/pair_lj_charmm_coul_charmm.h
index 970e03805c..2b3dae2274 100644
--- a/src/MOLECULE/pair_lj_charmm_coul_charmm.h
+++ b/src/MOLECULE/pair_lj_charmm_coul_charmm.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairLJCharmmCoulCharmm : public Pair {
  public:
   PairLJCharmmCoulCharmm(class LAMMPS *);
-  ~PairLJCharmmCoulCharmm();
+  virtual ~PairLJCharmmCoulCharmm();
   virtual void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
diff --git a/src/Makefile b/src/Makefile
index 7473b0e112..1812e979b9 100755
--- a/src/Makefile
+++ b/src/Makefile
@@ -149,7 +149,7 @@ yes-%:
 	  echo "Package $(@:yes-%=%) does not exist"; \
 	else \
 	  echo "Installing package $(@:yes-%=%)"; \
-	  cd $(YESDIR); $(SHELL) Install.sh 1; \
+	  cd $(YESDIR); $(SHELL) Install.sh 1; cd ..; $(SHELL) Depend.sh 1; \
 	fi;
 
 no-%:
@@ -157,7 +157,7 @@ no-%:
 	  echo "Package $(@:no-%=%) does not exist"; \
 	else \
 	  echo "Uninstalling package $(@:no-%=%), ignore errors"; \
-	  cd $(NODIR); $(SHELL) Install.sh 0; cd ..; \
+	  cd $(NODIR); $(SHELL) Install.sh 0; cd ..; $(SHELL) Depend.sh 0; \
         fi;
 
 # status = list differences between src and package files
diff --git a/src/Makefile.package.empty b/src/Makefile.package.empty
index fbee509ca6..ef54f586dc 100644
--- a/src/Makefile.package.empty
+++ b/src/Makefile.package.empty
@@ -5,5 +5,6 @@ PKG_INC =
 PKG_PATH =  
 PKG_LIB =   
 
-PKG_SYSPATH = 
+PKG_SYSINC = 
 PKG_SYSLIB =  
+PKG_SYSPATH = 
diff --git a/src/OPT/Install.sh b/src/OPT/Install.sh
index 4b12d0396a..37cedbb506 100644
--- a/src/OPT/Install.sh
+++ b/src/OPT/Install.sh
@@ -1,5 +1,5 @@
 # Install/unInstall package files in LAMMPS
-# do not copy eam and charmm files if non-OPT versions do not exist
+# do not install child files if parent does not exist
 
 if (test $1 = 1) then
 
diff --git a/src/OPT/pair_eam_opt.h b/src/OPT/pair_eam_opt.h
index ddc074b316..d2318981f3 100644
--- a/src/OPT/pair_eam_opt.h
+++ b/src/OPT/pair_eam_opt.h
@@ -193,6 +193,7 @@ void PairEAMOpt::eval()
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
+      j &= NEIGHMASK;
 
       double delx = xtmp - xx[j].x;
       double dely = ytmp - xx[j].y;
@@ -269,7 +270,8 @@ void PairEAMOpt::eval()
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
-      
+      j &= NEIGHMASK;
+
       double delx = xtmp - xx[j].x;
       double dely = ytmp - xx[j].y;
       double delz = ztmp - xx[j].z;
diff --git a/src/OPT/pair_lj_charmm_coul_long_opt.h b/src/OPT/pair_lj_charmm_coul_long_opt.h
index 583f2d57e0..9cf5f8477b 100644
--- a/src/OPT/pair_lj_charmm_coul_long_opt.h
+++ b/src/OPT/pair_lj_charmm_coul_long_opt.h
@@ -66,7 +66,7 @@ void PairLJCharmmCoulLongOpt::eval()
     double _pad[2];
   } fast_alpha_t;
   
-  int i,j,ii,jj,inum,jnum,itype,jtype,itable;
+  int i,j,ii,jj,inum,jnum,itype,jtype,itable,sbindex;
   double fraction,table;
   double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
   double grij,expm2,prefactor,t,erfc;
@@ -132,8 +132,9 @@ void PairLJCharmmCoulLongOpt::eval()
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
-      
-      if (j <= NEIGHMASK) {
+      sbindex = sbmask(j);
+
+      if (sbindex == 0) {
 	double delx = xtmp - xx[j].x;
 	double dely = ytmp - xx[j].y;
 	double delz = ztmp - xx[j].z;
@@ -219,8 +220,8 @@ void PairLJCharmmCoulLongOpt::eval()
 	}
 
       } else {
-	factor_lj = special_lj[sbmask(j)];
-	factor_coul = special_coul[sbmask(j)];
+	factor_lj = special_lj[sbindex];
+	factor_coul = special_coul[sbindex];
 	j &= NEIGHMASK;
 
 	double delx = xtmp - xx[j].x;
diff --git a/src/OPT/pair_lj_cut_opt.h b/src/OPT/pair_lj_cut_opt.h
index dce90b0249..ae7ac7d7af 100644
--- a/src/OPT/pair_lj_cut_opt.h
+++ b/src/OPT/pair_lj_cut_opt.h
@@ -54,7 +54,8 @@ void PairLJCutOpt::eval()
     double _pad[2];
   } fast_alpha_t;
   
-  int i,j,ii,jj,inum,jnum,itype,jtype;
+  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
+  double factor_lj;
   double evdwl = 0.0;
   
   double** __restrict__ x = atom->x;
@@ -106,9 +107,9 @@ void PairLJCutOpt::eval()
     
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
-      double factor_lj;
+      sbindex = sbmask(j);
 
-      if (j <= NEIGHMASK) {
+      if (sbindex == 0) {
 	double delx = xtmp - xx[j].x;
 	double dely = ytmp - xx[j].y;
 	double delz = ztmp - xx[j].z;
@@ -141,7 +142,7 @@ void PairLJCutOpt::eval()
 	}
 
       } else {
-	factor_lj = special_lj[sbmask(j)];
+	factor_lj = special_lj[sbindex];
 	j &= NEIGHMASK;
 
 	double delx = xtmp - xx[j].x;
diff --git a/src/OPT/pair_morse_opt.h b/src/OPT/pair_morse_opt.h
index 0e92408e1d..f7e9710433 100644
--- a/src/OPT/pair_morse_opt.h
+++ b/src/OPT/pair_morse_opt.h
@@ -55,7 +55,8 @@ void PairMorseOpt::eval()
     double _pad[2];
   } fast_alpha_t;
   
-  int i,j,ii,jj,inum,jnum,itype,jtype;
+  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
+  double factor_lj;
   double evdwl = 0.0;
   
   double** __restrict__ x = atom->x;
@@ -107,9 +108,9 @@ void PairMorseOpt::eval()
    
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
-      double factor_lj;
+      sbindex = sbmask(j);
 
-      if (j <= NEIGHMASK) {
+      if (sbindex == 0) {
 	double delx = xtmp - xx[j].x;
 	double dely = ytmp - xx[j].y;
 	double delz = ztmp - xx[j].z;
@@ -140,7 +141,7 @@ void PairMorseOpt::eval()
 	}
 
       } else {
-	factor_lj = special_lj[sbmask(j)];
+	factor_lj = special_lj[sbindex];
 	j &= NEIGHMASK;
 	
 	double delx = xtmp - xx[j].x;
diff --git a/src/PERI/atom_vec_peri.cpp b/src/PERI/atom_vec_peri.cpp
index d85853d114..ee36eb6db8 100644
--- a/src/PERI/atom_vec_peri.cpp
+++ b/src/PERI/atom_vec_peri.cpp
@@ -240,7 +240,7 @@ int AtomVecPeri::pack_comm_hybrid(int n, int *list, double *buf)
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
-    buf[m++] = s0[i];
+    buf[m++] = s0[j];
   }
   return m;
 }
diff --git a/src/POEMS/Install.sh b/src/POEMS/Install.sh
index 58a33048c7..59a86ee924 100644
--- a/src/POEMS/Install.sh
+++ b/src/POEMS/Install.sh
@@ -1,5 +1,5 @@
 # Install/unInstall package files in LAMMPS
-# edit Makefile.package to include/exclude POEMS library
+# edit Makefile.package to include/exclude POEMS info
 
 if (test $1 = 1) then
 
diff --git a/src/REAX/Install.sh b/src/REAX/Install.sh
index bd56f798db..69c55f60c2 100644
--- a/src/REAX/Install.sh
+++ b/src/REAX/Install.sh
@@ -1,5 +1,5 @@
 # Install/unInstall package files in LAMMPS
-# edit Makefile.package to include/exclude REAX library
+# edit Makefile.package to include/exclude REAX info
 
 if (test $1 = 1) then
 
@@ -9,8 +9,9 @@ if (test $1 = 1) then
     sed -i -e 's|^PKG_INC =[ \t]*|&-I../../lib/reax |' ../Makefile.package
     sed -i -e 's|^PKG_PATH =[ \t]*|&-L../../lib/reax |' ../Makefile.package
     sed -i -e 's|^PKG_LIB =[ \t]*|&-lreax |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(reax_SYSPATH) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(reax_SYSINC) |' ../Makefile.package
     sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(reax_SYSLIB) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(reax_SYSPATH) |' ../Makefile.package
   fi
 
   cp pair_reax.cpp ..
diff --git a/src/USER-ATC/Install.sh b/src/USER-ATC/Install.sh
index e55e1c7d65..9bb438f6a0 100755
--- a/src/USER-ATC/Install.sh
+++ b/src/USER-ATC/Install.sh
@@ -1,5 +1,5 @@
 # Install/unInstall package files in LAMMPS
-# edit Makefile.package to include/exclude ATC library
+# edit Makefile.package to include/exclude ATC info
 
 if (test $1 = 1) then
 
@@ -9,8 +9,9 @@ if (test $1 = 1) then
     sed -i -e 's|^PKG_INC =[ \t]*|&-I../../lib/atc |' ../Makefile.package
     sed -i -e 's|^PKG_PATH =[ \t]*|&-L../../lib/atc |' ../Makefile.package
     sed -i -e 's|^PKG_LIB =[ \t]*|&-latc |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(user-atc_SYSPATH) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(user-atc_SYSINC) |' ../Makefile.package
     sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(user-atc_SYSLIB) |' ../Makefile.package
+    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(user-atc_SYSPATH) |' ../Makefile.package
   fi
 
   cp fix_atc.h ..
diff --git a/src/USER-CG-CMM/Install.sh b/src/USER-CG-CMM/Install.sh
index 18dcb44d3a..ada215a923 100644
--- a/src/USER-CG-CMM/Install.sh
+++ b/src/USER-CG-CMM/Install.sh
@@ -1,4 +1,5 @@
 # Install/unInstall package files in LAMMPS
+# do not install child files if parent does not exist
 
 if (test $1 = 1) then
 
@@ -7,6 +8,11 @@ if (test $1 = 1) then
     cp angle_cg_cmm.cpp ..
   fi
 
+  if (test -e ../pppm.cpp) then
+    cp pair_cg_cmm_coul_long.cpp ..
+    cp pair_cg_cmm_coul_long.h ..
+  fi
+
   cp cg_cmm_parms.h ..
   cp cg_cmm_parms.cpp ..
 
@@ -17,15 +23,10 @@ if (test $1 = 1) then
   cp pair_cg_cmm_coul_cut.cpp ..
   cp pair_cg_cmm_coul_cut.h ..
 
-  if (test -e ../pppm.cpp) then
-    cp pair_cg_cmm_coul_long.cpp ..
-    cp pair_cg_cmm_coul_long.h ..
-  fi
-
 elif (test $1 = 0) then
 
-  rm -f ../angle_cg_cmm.h
-  rm -f ../angle_cg_cmm.cpp
+  rm -f ../angle_cg_cmm.h      # -f: file is only copied conditionally at install time
+  rm -f ../angle_cg_cmm.cpp    # -f: may be absent, so do not report an error
 
   rm ../cg_cmm_parms.h
   rm ../cg_cmm_parms.cpp
@@ -37,7 +38,7 @@ elif (test $1 = 0) then
   rm ../pair_cg_cmm_coul_cut.cpp
   rm ../pair_cg_cmm_coul_cut.h
 
-  rm -f ../pair_cg_cmm_coul_long.cpp
-  rm -f ../pair_cg_cmm_coul_long.h
+  rm -f ../pair_cg_cmm_coul_long.cpp   # -f: only copied when ../pppm.cpp existed at install
+  rm -f ../pair_cg_cmm_coul_long.h     # -f: may be absent, so do not report an error
 
 fi
diff --git a/src/USER-EFF/atom_vec_electron.cpp b/src/USER-EFF/atom_vec_electron.cpp
index e5c47745ab..c0c97d161b 100644
--- a/src/USER-EFF/atom_vec_electron.cpp
+++ b/src/USER-EFF/atom_vec_electron.cpp
@@ -237,7 +237,7 @@ int AtomVecElectron::pack_comm_hybrid(int n, int *list, double *buf)
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
-    buf[m++] = eradius[i];
+    buf[m++] = eradius[j];
   }
   return m;
 }
diff --git a/src/accelerator.h b/src/accelerator.h
new file mode 100644
index 0000000000..9ef291adbc
--- /dev/null
+++ b/src/accelerator.h
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+// dummy interface to USER-CUDA
+// used when USER-CUDA is not installed
+
+#ifndef LMP_ACCELERATOR_H
+#define LMP_ACCELERATOR_H
+
+#include "comm.h"
+#include "modify.h"
+#include "verlet.h"
+
+namespace LAMMPS_NS {
+
+class Cuda {                            // no-op stand-in for the USER-CUDA interface
+ public:
+  int cuda_exists;                      // always 0 in this stub
+  int oncpu;                            // 1 so "!oncpu" GPU-transfer guards in callers stay off
+  int neighbor_decide_by_integrator;    // 0 in this stub; no method here reads it
+  
+  Cuda(class LAMMPS *) : cuda_exists(0), oncpu(1), neighbor_decide_by_integrator(0) {}
+  ~Cuda() {}
+  void setDevice(class LAMMPS *) {}
+  void accelerator(int, char **) {}
+  void evsetup_eatom_vatom(int, int) {}
+  void downloadAll() {}
+  void uploadAll() {}
+};
+
+class CommCuda : public Comm {          // stub: adds nothing to Comm when USER-CUDA is absent
+ public:
+ CommCuda(class LAMMPS *lmp) : Comm(lmp) {}
+  ~CommCuda() {}
+};
+
+class DomainCuda : public Domain {      // stub: adds nothing to Domain when USER-CUDA is absent
+ public:                                // NOTE(review): domain.h is not included by this header - confirm Domain is complete here
+ DomainCuda(class LAMMPS *lmp) : Domain(lmp) {}
+  ~DomainCuda() {}
+};
+
+class NeighborCuda : public Neighbor {  // stub: adds nothing to Neighbor when USER-CUDA is absent
+ public:                                // NOTE(review): neighbor.h is not included by this header - confirm Neighbor is complete here
+ NeighborCuda(class LAMMPS *lmp) : Neighbor(lmp) {}
+  ~NeighborCuda() {}
+};
+
+class ModifyCuda : public Modify {      // stub: adds nothing to Modify when USER-CUDA is absent
+ public:
+ ModifyCuda(class LAMMPS *lmp) : Modify(lmp) {}
+  ~ModifyCuda() {}
+};
+ 
+class VerletCuda : public Verlet {      // stub: adds nothing to Verlet when USER-CUDA is absent
+ public:
+ VerletCuda(class LAMMPS *lmp, int narg, char **arg) : Verlet(lmp,narg,arg) {}  // forwards all args unchanged
+  ~VerletCuda() {}
+};
+
+}
+
+#endif
diff --git a/src/atom.cpp b/src/atom.cpp
index 806475b3bc..d7c08d9cbb 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -12,6 +12,7 @@
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
+#include "math.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "string.h"
@@ -30,6 +31,7 @@
 #include "update.h"
 #include "domain.h"
 #include "group.h"
+#include "accelerator.h"
 #include "memory.h"
 #include "error.h"
 
@@ -38,6 +40,9 @@ using namespace LAMMPS_NS;
 #define DELTA 1
 #define DELTA_MEMSTR 1024
 #define EPSILON 1.0e-6
+#define CUDA_CHUNK 3000
+
+enum{NOACCEL,OPT,GPU,USERCUDA};     // same as lammps.cpp
 
 #define MIN(A,B) ((A) < (B)) ? (A) : (B)
 #define MAX(A,B) ((A) > (B)) ? (A) : (B)
@@ -241,7 +246,7 @@ void Atom::settings(Atom *old)
    called from input script, restart file, replicate
 ------------------------------------------------------------------------- */
 
-void Atom::create_avec(const char *style, int narg, char **arg)
+void Atom::create_avec(const char *style, int narg, char **arg, char *suffix)
 {
   delete [] atom_style;
   if (avec) delete avec;
@@ -256,10 +261,20 @@ void Atom::create_avec(const char *style, int narg, char **arg)
   rmass_flag = radius_flag = omega_flag = torque_flag = angmom_flag = 0;
   vfrac_flag = spin_flag = eradius_flag = ervel_flag = erforce_flag = 0;
 
-  avec = new_avec(style,narg,arg);
-  int n = strlen(style) + 1;
-  atom_style = new char[n];
-  strcpy(atom_style,style);
+  int sflag;
+  avec = new_avec(style,narg,arg,suffix,sflag);
+
+  if (sflag) {
+    char estyle[256];
+    sprintf(estyle,"%s/%s",style,suffix);
+    int n = strlen(estyle) + 1;
+    atom_style = new char[n];
+    strcpy(atom_style,estyle);
+  } else {
+    int n = strlen(style) + 1;
+    atom_style = new char[n];
+    strcpy(atom_style,style);
+  }
 
   // if molecular system, default is to have array map
 
@@ -268,11 +283,30 @@ void Atom::create_avec(const char *style, int narg, char **arg)
 }
 
 /* ----------------------------------------------------------------------
-   generate an AtomVec class
+   generate an AtomVec class, first with suffix appended
 ------------------------------------------------------------------------- */
 
-AtomVec *Atom::new_avec(const char *style, int narg, char **arg)
+AtomVec *Atom::new_avec(const char *style, int narg, char **arg,
+			char *suffix, int &sflag)
 {
+  if (suffix && lmp->offaccel == 0) {
+    sflag = 1;
+    char estyle[256];
+    sprintf(estyle,"%s/%s",style,suffix);
+
+    if (0) return NULL;
+
+#define ATOM_CLASS
+#define AtomStyle(key,Class) \
+    else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg,arg);
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+
+  }
+
+  sflag = 0;
+
   if (0) return NULL;
 
 #define ATOM_CLASS
@@ -282,6 +316,7 @@ AtomVec *Atom::new_avec(const char *style, int narg, char **arg)
 #undef ATOM_CLASS
 
   else error->all("Invalid atom style");
+
   return NULL;
 }
 
@@ -1298,6 +1333,11 @@ void Atom::sort()
 
   nextsort = (update->ntimestep/sortfreq)*sortfreq + sortfreq;
 
+  // download data from GPU if necessary
+
+  if (lmp->accelerator == USERCUDA && !lmp->cuda->oncpu) 
+    lmp->cuda->downloadAll();
+
   // re-setup sort bins if needed
 
   if (domain->box_change) setup_sort_bins();
@@ -1373,6 +1413,11 @@ void Atom::sort()
     current[empty] = permute[empty];
   }
 
+  // upload data back to GPU if necessary
+
+  if (lmp->accelerator == USERCUDA && !lmp->cuda->oncpu)
+    lmp->cuda->uploadAll();
+
   // sanity check that current = permute
 
   //int flag = 0;
@@ -1389,12 +1434,25 @@ void Atom::sort()
 
 void Atom::setup_sort_bins()
 {
-  // binsize = user setting or 1/2 of neighbor cutoff
-  // neighbor cutoff can be 0.0
+  // binsize = user setting or default
+  // default = 1/2 of neighbor cutoff for non-CUDA
+  //           CUDA_CHUNK atoms/proc for CUDA
+  // check if neighbor cutoff = 0.0
 
   double binsize;
   if (userbinsize > 0.0) binsize = userbinsize;
-  else binsize = 0.5 * neighbor->cutneighmax;
+  else if (lmp->accelerator == USERCUDA) {
+    if (domain->dimension == 3) {
+      double vol = (domain->boxhi[0]-domain->boxlo[0]) * 
+	(domain->boxhi[1]-domain->boxlo[1]) * 
+	(domain->boxhi[2]-domain->boxlo[2]);
+      binsize = pow(1.0*CUDA_CHUNK/natoms*vol,1.0/3.0);
+    } else {
+      double area = (domain->boxhi[0]-domain->boxlo[0]) * 
+	(domain->boxhi[1]-domain->boxlo[1]);
+      binsize = pow(1.0*CUDA_CHUNK/natoms*area,1.0/2.0);
+    }
+  } else binsize = 0.5 * neighbor->cutneighmax;
   if (binsize == 0.0) error->all("Atom sorting has bin size = 0.0");
 
   double bininv = 1.0/binsize;
diff --git a/src/atom.h b/src/atom.h
index c0e5dafa60..e4972b27e6 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -114,8 +114,8 @@ class Atom : protected Pointers {
   ~Atom();
 
   void settings(class Atom *);
-  void create_avec(const char *, int, char **);
-  class AtomVec *new_avec(const char *, int, char **);
+  void create_avec(const char *, int, char **, char *suffix = NULL);
+  class AtomVec *new_avec(const char *, int, char **, char *, int &);
   void init();
   void setup();
 
@@ -155,6 +155,9 @@ class Atom : protected Pointers {
 
   void *extract(char *);
 
+  int *get_map_array() {return map_array;}      // accessor: returns the map_array pointer itself
+  int get_map_size() {return 1 + map_tag_max;}  // accessor: map_tag_max plus one
+
   bigint memory_usage();
   int memcheck(const char *);
 
diff --git a/src/atom_vec.h b/src/atom_vec.h
index a437d14a6b..35cae37bb6 100644
--- a/src/atom_vec.h
+++ b/src/atom_vec.h
@@ -38,6 +38,9 @@ class AtomVec : protected Pointers {
   int size_data_bonus;                 // number of values in Bonus line
   int xcol_data;                       // column (1-N) where x is in Atom line
 
+  int cudable;                         // 1 if atom style is CUDA-enabled
+  int *maxsend;                        // CUDA-specific variable
+
   AtomVec(class LAMMPS *, int, char **);
   virtual ~AtomVec() {}
   virtual void init();
diff --git a/src/atom_vec_atomic.h b/src/atom_vec_atomic.h
index 187f4c22f8..34fb3779a7 100644
--- a/src/atom_vec_atomic.h
+++ b/src/atom_vec_atomic.h
@@ -27,22 +27,22 @@ namespace LAMMPS_NS {
 class AtomVecAtomic : public AtomVec {
  public:
   AtomVecAtomic(class LAMMPS *, int, char **);
-  ~AtomVecAtomic() {}
+  virtual ~AtomVecAtomic() {}
   void grow(int);
   void grow_reset();
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
   int pack_reverse(int, int, double *);
   void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
+  virtual int pack_border(int, int *, double *, int, int *);
+  virtual int pack_border_vel(int, int *, double *, int, int *);
+  virtual void unpack_border(int, int, double *);
+  virtual void unpack_border_vel(int, int, double *);
+  virtual int pack_exchange(int, double *);
+  virtual int unpack_exchange(double *);
   int size_restart();
   int pack_restart(int, double *);
   int unpack_restart(double *);
@@ -50,7 +50,7 @@ class AtomVecAtomic : public AtomVec {
   void data_atom(double *, int, char **);
   bigint memory_usage();
 
- private:
+ protected:
   int *tag,*type,*mask,*image;
   double **x,**v,**f;
 };
diff --git a/src/atom_vec_charge.h b/src/atom_vec_charge.h
index a19d7d6cde..eb9fb91054 100644
--- a/src/atom_vec_charge.h
+++ b/src/atom_vec_charge.h
@@ -27,23 +27,24 @@ namespace LAMMPS_NS {
 class AtomVecCharge : public AtomVec {
  public:
   AtomVecCharge(class LAMMPS *, int, char **);
+  virtual ~AtomVecCharge() {}
   void grow(int);
   void grow_reset();
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
   int pack_reverse(int, int, double *);
   void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
+  virtual int pack_border(int, int *, double *, int, int *);
+  virtual int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
+  virtual void unpack_border(int, int, double *);
+  virtual void unpack_border_vel(int, int, double *);
   int unpack_border_hybrid(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
+  virtual int pack_exchange(int, double *);
+  virtual int unpack_exchange(double *);
   int size_restart();
   int pack_restart(int, double *);
   int unpack_restart(double *);
@@ -52,7 +53,7 @@ class AtomVecCharge : public AtomVec {
   int data_atom_hybrid(int, char **);
   bigint memory_usage();
 
- private:
+ protected:
   int *tag,*type,*mask,*image;
   double **x,**v,**f;
   double *q;
diff --git a/src/atom_vec_hybrid.cpp b/src/atom_vec_hybrid.cpp
index a7f48066f0..03c0c501ea 100644
--- a/src/atom_vec_hybrid.cpp
+++ b/src/atom_vec_hybrid.cpp
@@ -34,7 +34,7 @@ using namespace LAMMPS_NS;
 AtomVecHybrid::AtomVecHybrid(LAMMPS *lmp, int narg, char **arg) :
   AtomVec(lmp, narg, arg)
 {
-  int i,k;
+  int i,k,dummy;
 
   if (narg < 1) error->all("Illegal atom_style command");
 
@@ -50,7 +50,7 @@ AtomVecHybrid::AtomVecHybrid(LAMMPS *lmp, int narg, char **arg) :
 	error->all("Atom style hybrid cannot use same atom style twice");
     if (strcmp(arg[i],"hybrid") == 0) 
       error->all("Atom style hybrid cannot have hybrid as an argument");
-    styles[i] = atom->new_avec(arg[i],0,NULL);
+    styles[i] = atom->new_avec(arg[i],0,NULL,NULL,dummy);
     keywords[i] = new char[strlen(arg[i])+1];
     strcpy(keywords[i],arg[i]);
   }
diff --git a/src/atom_vec_sphere.cpp b/src/atom_vec_sphere.cpp
index ab53b79c5d..eee68209dd 100644
--- a/src/atom_vec_sphere.cpp
+++ b/src/atom_vec_sphere.cpp
@@ -374,8 +374,8 @@ int AtomVecSphere::pack_comm_hybrid(int n, int *list, double *buf)
   m = 0;
   for (i = 0; i < n; i++) {
     j = list[i];
-    buf[m++] = radius[i];
-    buf[m++] = rmass[i];
+    buf[m++] = radius[j];
+    buf[m++] = rmass[j];
   }
   return m;
 }
diff --git a/src/comm.cpp b/src/comm.cpp
index e97d3f3474..c8c06bbf16 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -400,7 +400,7 @@ void Comm::setup()
    other per-atom attributes may also be sent via pack/unpack routines
 ------------------------------------------------------------------------- */
 
-void Comm::forward_comm()
+void Comm::forward_comm(int dummy)
 {
   int n;
   MPI_Request request;
diff --git a/src/comm.h b/src/comm.h
index 11b7e28b8a..868f6bec2e 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -35,27 +35,27 @@ class Comm : protected Pointers {
   int ***grid2proc;                 // which proc owns i,j,k loc in 3d grid
 
   Comm(class LAMMPS *);
-  ~Comm();
+  virtual ~Comm();
 
-  void init();
-  void set_procs();                 // setup 3d grid of procs
-  void setup();                     // setup 3d communication pattern
-  void forward_comm();              // forward communication of atom coords
-  void reverse_comm();              // reverse communication of forces
-  void exchange();                  // move atoms to new procs
-  void borders();                   // setup list of atoms to communicate
+  virtual void init();
+  virtual void set_procs();                 // setup 3d grid of procs
+  virtual void setup();                     // setup 3d communication pattern
+  virtual void forward_comm(int dummy = 0); // forward communication of atom coords
+  virtual void reverse_comm();              // reverse communication of forces
+  virtual void exchange();                  // move atoms to new procs
+  virtual void borders();                   // setup list of atoms to communicate
 
-  void forward_comm_pair(class Pair *);        // forward comm from a Pair
-  void reverse_comm_pair(class Pair *);        // reverse comm from a Pair
-  void forward_comm_fix(class Fix *);          // forward comm from a Fix
-  void reverse_comm_fix(class Fix *);          // reverse comm from a Fix
-  void forward_comm_compute(class Compute *);  // forward comm from a Compute
-  void reverse_comm_compute(class Compute *);  // reverse comm from a Compute
+  virtual void forward_comm_pair(class Pair *);        // forward comm from a Pair
+  virtual void reverse_comm_pair(class Pair *);        // reverse comm from a Pair
+  virtual void forward_comm_fix(class Fix *);          // forward comm from a Fix
+  virtual void reverse_comm_fix(class Fix *);          // reverse comm from a Fix
+  virtual void forward_comm_compute(class Compute *);  // forward comm from a Compute
+  virtual void reverse_comm_compute(class Compute *);  // reverse comm from a Compute
 
-  void set(int, char **);           // set communication style
-  bigint memory_usage();
+  virtual void set(int, char **);           // set communication style
+  virtual bigint memory_usage();
 
- private:
+ protected:
   int style;                        // single vs multi-type comm
   int nswap;                        // # of swaps to perform
   int need[3];                      // procs I need atoms from in each dim
@@ -87,18 +87,18 @@ class Comm : protected Pointers {
   int maxsend,maxrecv;              // current size of send/recv buffer
   int maxforward,maxreverse;        // max # of datums in forward/reverse comm
 
-  void procs2box();                 // map procs to 3d box
-  void cross(double, double, double,
+  virtual void procs2box();                 // map procs to 3d box
+  virtual void cross(double, double, double,
 	     double, double, double,
 	     double &, double &, double &);    // cross product
-  void grow_send(int,int);          // reallocate send buffer
-  void grow_recv(int);              // free/allocate recv buffer
-  void grow_list(int, int);         // reallocate one sendlist
-  void grow_swap(int);              // grow swap and multi arrays
-  void allocate_swap(int);          // allocate swap arrays
-  void allocate_multi(int);         // allocate multi arrays
-  void free_swap();                 // free swap arrays
-  void free_multi();                // free multi arrays
+  virtual void grow_send(int,int);          // reallocate send buffer
+  virtual void grow_recv(int);              // free/allocate recv buffer
+  virtual void grow_list(int, int);         // reallocate one sendlist
+  virtual void grow_swap(int);              // grow swap and multi arrays
+  virtual void allocate_swap(int);          // allocate swap arrays
+  virtual void allocate_multi(int);         // allocate multi arrays
+  virtual void free_swap();                 // free swap arrays
+  virtual void free_multi();                // free multi arrays
 };
 
 }
diff --git a/src/compute.cpp b/src/compute.cpp
index 5c5398e5df..230c15919c 100644
--- a/src/compute.cpp
+++ b/src/compute.cpp
@@ -68,6 +68,7 @@ Compute::Compute(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
   timeflag = 0;
   comm_forward = comm_reverse = 0;
+  cudable = 0;
 
   invoked_scalar = invoked_vector = invoked_array = -1;
   invoked_peratom = invoked_local = -1;
diff --git a/src/compute.h b/src/compute.h
index 9b929b345a..b0f9709df4 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -77,6 +77,8 @@ class Compute : protected Pointers {
   int comm_forward;   // size of forward communication (0 if none)
   int comm_reverse;   // size of reverse communication (0 if none)
 
+  int cudable;        // 1 if compute is CUDA-enabled
+
   Compute(class LAMMPS *, int, char **);
   virtual ~Compute();
   void modify_params(int, char **);
diff --git a/src/compute_pressure.h b/src/compute_pressure.h
index 2e362ab222..cab96ae619 100644
--- a/src/compute_pressure.h
+++ b/src/compute_pressure.h
@@ -27,13 +27,13 @@ namespace LAMMPS_NS {
 class ComputePressure : public Compute {
  public:
   ComputePressure(class LAMMPS *, int, char **);
-  ~ComputePressure();
+  virtual ~ComputePressure();
   void init();
   double compute_scalar();
   void compute_vector();
   void reset_extra_compute_fix(char *);
 
- private:
+ protected:
   double boltz,nktv2p,inv_volume;
   int nvirial,dimension;
   double **vptr;
diff --git a/src/compute_temp_partial.h b/src/compute_temp_partial.h
index 83bd6de752..e65c0b2191 100644
--- a/src/compute_temp_partial.h
+++ b/src/compute_temp_partial.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class ComputeTempPartial : public Compute {
  public:
   ComputeTempPartial(class LAMMPS *, int, char **);
-  ~ComputeTempPartial();
+  virtual ~ComputeTempPartial();
   void init();
   double compute_scalar();
   void compute_vector();
@@ -39,7 +39,7 @@ class ComputeTempPartial : public Compute {
   void restore_bias_all();
   double memory_usage();
 
- private:
+ protected:
   int xflag,yflag,zflag;
   int fix_dof;
   double tfactor;
diff --git a/src/domain.h b/src/domain.h
index d7cb88ce5c..3e8eba5d3e 100644
--- a/src/domain.h
+++ b/src/domain.h
@@ -85,14 +85,14 @@ class Domain : protected Pointers {
   class Region **regions;                  // list of defined Regions
 
   Domain(class LAMMPS *);
-  ~Domain();
-  void init();
+  virtual ~Domain();
+  virtual void init();
   void set_initial_box();
-  void set_global_box();
-  void set_lamda_box();
-  void set_local_box();
-  void reset_box();
-  void pbc();
+  virtual void set_global_box();
+  virtual void set_lamda_box();
+  virtual void set_local_box();
+  virtual void reset_box();
+  virtual void pbc();
   void remap(double *, int &);
   void remap(double *);
   void remap_near(double *, double *);
@@ -107,8 +107,8 @@ class Domain : protected Pointers {
   void set_boundary(int, char **);
   void print_box(const char *);
 
-  void lamda2x(int);
-  void x2lamda(int);
+  virtual void lamda2x(int);
+  virtual void x2lamda(int);
   void lamda2x(double *, double *);
   void x2lamda(double *, double *);
   void bbox(double *, double *, double *, double *);
diff --git a/src/fix.cpp b/src/fix.cpp
index e2c077fbc4..2ed2dc7485 100644
--- a/src/fix.cpp
+++ b/src/fix.cpp
@@ -58,6 +58,7 @@ Fix::Fix(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   time_depend = 0;
   create_attribute = 0;
   restart_pbc = 0;
+  cudable_comm = 0;
 
   scalar_flag = vector_flag = array_flag = 0;
   peratom_flag = local_flag = 0;
diff --git a/src/fix.h b/src/fix.h
index ca7008a8b2..b1ede8b67c 100644
--- a/src/fix.h
+++ b/src/fix.h
@@ -41,6 +41,7 @@ class Fix : protected Pointers {
                                  //      setting when a new atom is created
   int restart_pbc;               // 1 if fix moves atoms (except integrate)
                                  //      so write_restart must remap to PBC
+  int cudable_comm;              // 1 if fix has CUDA-enabled communication
 
   int scalar_flag;               // 0/1 if compute_scalar() function exists
   int vector_flag;               // 0/1 if compute_vector() function exists
diff --git a/src/fix_adapt.cpp b/src/fix_adapt.cpp
index 5e79423a2a..d5825b5616 100644
--- a/src/fix_adapt.cpp
+++ b/src/fix_adapt.cpp
@@ -54,7 +54,7 @@ FixAdapt::FixAdapt(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
       nadapt++;
       iarg += 6;
     } else if (strcmp(arg[iarg],"kspace") == 0) {
-      if (iarg+6 > narg) error->all("Illegal fix adapt command");
+      if (iarg+2 > narg) error->all("Illegal fix adapt command");
       nadapt++;
       iarg += 2;
     } else if (strcmp(arg[iarg],"atom") == 0) {
diff --git a/src/fix_nve.h b/src/fix_nve.h
index f894b8dc6c..2be7a671e8 100644
--- a/src/fix_nve.h
+++ b/src/fix_nve.h
@@ -27,13 +27,14 @@ namespace LAMMPS_NS {
 class FixNVE : public Fix {
  public:
   FixNVE(class LAMMPS *, int, char **);
+  virtual ~FixNVE() {}
   int setmask();
   virtual void init();
   virtual void initial_integrate(int);
   virtual void final_integrate();
-  void initial_integrate_respa(int, int, int);
-  void final_integrate_respa(int, int);
-  void reset_dt();
+  virtual void initial_integrate_respa(int, int, int);
+  virtual void final_integrate_respa(int, int);
+  virtual void reset_dt();
 
  protected:
   double dtv,dtf;
diff --git a/src/fix_rigid_nve.cpp b/src/fix_rigid_nve.cpp
index abdb258a75..3b539f2533 100644
--- a/src/fix_rigid_nve.cpp
+++ b/src/fix_rigid_nve.cpp
@@ -58,8 +58,8 @@ void FixRigidNVE::setup(int vflag)
   
   double mbody[3];
   for (int ibody = 0; ibody < nbody; ibody++) {
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   angmom[ibody],mbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				angmom[ibody],mbody);
     MathExtra::quatvec(quat[ibody],mbody,conjqm[ibody]);
     conjqm[ibody][0] *= 2.0;
     conjqm[ibody][1] *= 2.0;
@@ -99,8 +99,8 @@ void FixRigidNVE::initial_integrate(int vflag)
     torque[ibody][1] *= tflag[ibody][1];
     torque[ibody][2] *= tflag[ibody][2];
     
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   torque[ibody],tbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				torque[ibody],tbody);
     MathExtra::quatvec(quat[ibody],tbody,fquat);
     
     conjqm[ibody][0] += dtf2 * fquat[0];
@@ -123,8 +123,8 @@ void FixRigidNVE::initial_integrate(int vflag)
     MathExtra::q_to_exyz(quat[ibody],ex_space[ibody],ey_space[ibody],
 			 ez_space[ibody]);
     MathExtra::invquatvec(quat[ibody],conjqm[ibody],mbody);
-    MathExtra::matvec_cols(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   mbody,angmom[ibody]);
+    MathExtra::matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+		      mbody,angmom[ibody]);
     
     angmom[ibody][0] *= 0.5;
     angmom[ibody][1] *= 0.5;
@@ -252,8 +252,8 @@ void FixRigidNVE::final_integrate()
     torque[ibody][1] *= tflag[ibody][1];
     torque[ibody][2] *= tflag[ibody][2];
     
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   torque[ibody],tbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				torque[ibody],tbody);
     MathExtra::quatvec(quat[ibody],tbody,fquat);
     
     conjqm[ibody][0] += dtf2 * fquat[0];
@@ -262,8 +262,8 @@ void FixRigidNVE::final_integrate()
     conjqm[ibody][3] += dtf2 * fquat[3];
     
     MathExtra::invquatvec(quat[ibody],conjqm[ibody],mbody);
-    MathExtra::matvec_cols(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   mbody,angmom[ibody]);
+    MathExtra::matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+		      mbody,angmom[ibody]);
     
     angmom[ibody][0] *= 0.5;
     angmom[ibody][1] *= 0.5;
diff --git a/src/fix_rigid_nvt.cpp b/src/fix_rigid_nvt.cpp
index 816ea19a37..4ad18e1e7a 100644
--- a/src/fix_rigid_nvt.cpp
+++ b/src/fix_rigid_nvt.cpp
@@ -164,8 +164,8 @@ void FixRigidNVT::setup(int vflag)
   
   double mbody[3];
   for (int ibody = 0; ibody < nbody; ibody++) {
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   angmom[ibody],mbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				angmom[ibody],mbody);
     MathExtra::quatvec(quat[ibody],mbody,conjqm[ibody]);
     conjqm[ibody][0] *= 2.0;
     conjqm[ibody][1] *= 2.0;
@@ -225,8 +225,8 @@ void FixRigidNVT::initial_integrate(int vflag)
     
     // step 1.3 - apply torque (body coords) to quaternion momentum
 
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   torque[ibody],tbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				torque[ibody],tbody);
     MathExtra::quatvec(quat[ibody],tbody,fquat);
     
     conjqm[ibody][0] += dtf2 * fquat[0];
@@ -253,8 +253,8 @@ void FixRigidNVT::initial_integrate(int vflag)
     MathExtra::q_to_exyz(quat[ibody],ex_space[ibody],ey_space[ibody],
 			 ez_space[ibody]);
     MathExtra::invquatvec(quat[ibody],conjqm[ibody],mbody);
-    MathExtra::matvec_cols(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   mbody,angmom[ibody]);
+    MathExtra::matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+		      mbody,angmom[ibody]);
     
     angmom[ibody][0] *= 0.5;
     angmom[ibody][1] *= 0.5;
@@ -398,8 +398,8 @@ void FixRigidNVT::final_integrate()
     
     // convert torque to the body frame 
     
-    MathExtra::matvec_rows(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   torque[ibody],tbody);
+    MathExtra::transpose_matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+				torque[ibody],tbody);
     
     // compute "force" for quaternion
     
@@ -416,8 +416,8 @@ void FixRigidNVT::final_integrate()
     // then convert to the space-fixed frame
     
     MathExtra::invquatvec(quat[ibody],conjqm[ibody],mbody);
-    MathExtra::matvec_cols(ex_space[ibody],ey_space[ibody],ez_space[ibody],
-			   mbody,angmom[ibody]);
+    MathExtra::matvec(ex_space[ibody],ey_space[ibody],ez_space[ibody],
+		      mbody,angmom[ibody]);
     
     angmom[ibody][0] *= 0.5;
     angmom[ibody][1] *= 0.5;
diff --git a/src/fix_viscous.h b/src/fix_viscous.h
index ccadf4889a..de2c5156e1 100644
--- a/src/fix_viscous.h
+++ b/src/fix_viscous.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class FixViscous : public Fix {
  public:
   FixViscous(class LAMMPS *, int, char **);
-  ~FixViscous();
+  virtual ~FixViscous();
   int setmask();
   void init();
   void setup(int);
@@ -36,7 +36,7 @@ class FixViscous : public Fix {
   void post_force_respa(int, int, int);
   void min_post_force(int);
 
- private:
+ protected:
   double *gamma;
   int nlevels_respa;
 };
diff --git a/src/fix_wall_lj126.h b/src/fix_wall_lj126.h
index eba0ffa3e5..882efbafda 100644
--- a/src/fix_wall_lj126.h
+++ b/src/fix_wall_lj126.h
@@ -34,6 +34,7 @@ class FixWallLJ126 : public FixWall {
   double coeff1[6],coeff2[6],coeff3[6],coeff4[6],offset[6];
 };
 
-#endif
 }
+
+#endif
 #endif
diff --git a/src/fix_wall_reflect.h b/src/fix_wall_reflect.h
index 0bf98a33c0..6b87d4039b 100644
--- a/src/fix_wall_reflect.h
+++ b/src/fix_wall_reflect.h
@@ -27,12 +27,12 @@ namespace LAMMPS_NS {
 class FixWallReflect : public Fix {
  public:
   FixWallReflect(class LAMMPS *, int, char **);
-  ~FixWallReflect();
+  virtual ~FixWallReflect();
   int setmask();
   void init();
   void post_integrate();
 
- private:
+ protected:
   int nwall;
   int wallwhich[6],wallstyle[6];
   double coord0[6];
diff --git a/src/force.cpp b/src/force.cpp
index 8068717be5..3364455924 100644
--- a/src/force.cpp
+++ b/src/force.cpp
@@ -117,23 +117,51 @@ void Force::init()
    create a pair style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_pair(const char *style)
+void Force::create_pair(const char *style, char *suffix)
 {
   delete [] pair_style;
   if (pair) delete pair;
 
-  pair = new_pair(style);
-  int n = strlen(style) + 1;
-  pair_style = new char[n];
-  strcpy(pair_style,style);
+  int sflag;
+  pair = new_pair(style,suffix,sflag);
+
+  // record the name of the style actually created: "style/suffix" when
+  // new_pair() matched the suffixed variant (sflag set), else plain style
+  if (sflag) {
+    int n = strlen(style) + strlen(suffix) + 2;
+    pair_style = new char[n];
+    sprintf(pair_style,"%s/%s",style,suffix);
+  } else {
+    int n = strlen(style) + 1;
+    pair_style = new char[n];
+    strcpy(pair_style,style);
+  }
 }
 
 /* ----------------------------------------------------------------------
-   generate a pair class
+   generate a pair class, first with suffix appended
 ------------------------------------------------------------------------- */
 
-Pair *Force::new_pair(const char *style)
+Pair *Force::new_pair(const char *style, char *suffix, int &sflag)
 {
+  if (suffix && lmp->offaccel == 0) {
+    sflag = 1;
+    char estyle[256];
+    sprintf(estyle,"%s/%s",style,suffix);
+
+    if (0) return NULL;
+
+#define PAIR_CLASS
+#define PairStyle(key,Class) \
+    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_pair.h"
+#undef PairStyle
+#undef PAIR_CLASS
+
+  }
+
+  sflag = 0;
+
   if (strcmp(style,"none") == 0) return NULL;
 
 #define PAIR_CLASS
@@ -143,6 +171,7 @@ Pair *Force::new_pair(const char *style)
 #undef PAIR_CLASS
 
   else error->all("Invalid pair style");
+
   return NULL;
 }
 
diff --git a/src/force.h b/src/force.h
index ee6343db98..9f20b4c66c 100644
--- a/src/force.h
+++ b/src/force.h
@@ -64,8 +64,8 @@ class Force : protected Pointers {
   ~Force();
   void init();
 
-  void create_pair(const char *);
-  class Pair *new_pair(const char *);
+  void create_pair(const char *, char *suffix = NULL);
+  class Pair *new_pair(const char *, char *, int &);
   class Pair *pair_match(const char *, int);
 
   void create_bond(const char *);
diff --git a/src/group.cpp b/src/group.cpp
index b5803ccdb5..7ab12b35f3 100644
--- a/src/group.cpp
+++ b/src/group.cpp
@@ -142,6 +142,7 @@ void Group::assign(int narg, char **arg)
     
   // style = region
   // add to group if atom is in region
+  // init all regions via domain->init() to ensure region can perform match()
 
   if (strcmp(arg[1],"region") == 0) {
 
@@ -149,6 +150,7 @@ void Group::assign(int narg, char **arg)
     
     int iregion = domain->find_region(arg[2]);
     if (iregion == -1) error->all("Group region ID does not exist");
+    domain->init();
 
     for (i = 0; i < nlocal; i++)
       if (domain->regions[iregion]->match(x[i][0],x[i][1],x[i][2]))
diff --git a/src/input.cpp b/src/input.cpp
index 0861c96cc2..079879574f 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -42,6 +42,7 @@
 #include "neighbor.h"
 #include "special.h"
 #include "variable.h"
+#include "accelerator.h"
 #include "error.h"
 #include "memory.h"
 
@@ -417,6 +418,7 @@ int Input::execute_command()
   else if (!strcmp(command,"shell")) shell();
   else if (!strcmp(command,"variable")) variable_command();
 
+  else if (!strcmp(command,"accelerator")) accelerator();
   else if (!strcmp(command,"angle_coeff")) angle_coeff();
   else if (!strcmp(command,"angle_style")) angle_style();
   else if (!strcmp(command,"atom_modify")) atom_modify();
@@ -801,6 +803,32 @@ void Input::variable_command()
    one function for each LAMMPS-specific input script command
 ------------------------------------------------------------------------- */
 
+void Input::accelerator()
+{
+  if (domain->box_exist)
+    error->all("Accelerator command after simulation box is defined");
+  if (narg < 1) error->all("Illegal accelerator command");
+
+  // on/off only toggle lmp->offaccel; test them before the -a match below,
+  // since the stored suffix is a package name and never equals "on"/"off"
+  if (strcmp(arg[0],"off") == 0) {
+    if (narg != 1) error->all("Illegal accelerator command");
+    lmp->offaccel = 1;
+    return;
+  }
+  if (strcmp(arg[0],"on") == 0) {
+    if (narg != 1) error->all("Illegal accelerator command");
+    lmp->offaccel = 0;
+    return;
+  }
+  // any other keyword must equal the suffix given with the -a switch
+  if (!lmp->asuffix || (strcmp(lmp->asuffix,arg[0]) != 0))
+    error->all("Accelerator command requires matching command-line -a switch");
+  if (strcmp(arg[0],"cuda") == 0) lmp->cuda->accelerator(narg-1,&arg[1]);
+  else error->all("Illegal accelerator command");
+}
+
+/* ---------------------------------------------------------------------- */
+
 void Input::angle_coeff()
 {
   if (domain->box_exist == 0)
@@ -837,7 +865,7 @@ void Input::atom_style()
   if (narg < 1) error->all("Illegal atom_style command");
   if (domain->box_exist) 
     error->all("Atom_style command after simulation box is defined");
-  atom->create_avec(arg[0],narg-1,&arg[1]);
+  atom->create_avec(arg[0],narg-1,&arg[1],lmp->asuffix);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -884,7 +912,7 @@ void Input::communicate()
 
 void Input::compute()
 {
-  modify->add_compute(narg,arg);
+  modify->add_compute(narg,arg,lmp->asuffix);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -962,7 +990,7 @@ void Input::dump_modify()
 
 void Input::fix()
 {
-  modify->add_fix(narg,arg);
+  modify->add_fix(narg,arg,lmp->asuffix);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1132,7 +1160,7 @@ void Input::pair_style()
     force->pair->settings(narg-1,&arg[1]);
     return;
   }
-  force->create_pair(arg[0]);
+  force->create_pair(arg[0],lmp->asuffix);
   if (force->pair) force->pair->settings(narg-1,&arg[1]);
 }
 
@@ -1191,7 +1219,7 @@ void Input::run_style()
 {
   if (domain->box_exist == 0)
     error->all("Run_style command before simulation box is defined");
-  update->create_integrate(narg,arg);
+  update->create_integrate(narg,arg,lmp->asuffix);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/input.h b/src/input.h
index cbd27881ef..2aac7e9646 100644
--- a/src/input.h
+++ b/src/input.h
@@ -61,7 +61,8 @@ class Input : protected Pointers {
   void shell();
   void variable_command();
 
-  void angle_coeff();          // LAMMPS commands
+  void accelerator();          // LAMMPS commands
+  void angle_coeff();
   void angle_style();
   void atom_modify();
   void atom_style();
diff --git a/src/lammps.cpp b/src/lammps.cpp
index 15b32bac2f..f0b79ec45b 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -27,10 +27,13 @@
 #include "modify.h"
 #include "group.h"
 #include "output.h"
+#include "accelerator.h"
 #include "timer.h"
 
 using namespace LAMMPS_NS;
 
+enum{NOACCEL,OPT,GPU,USERCUDA};
+
 /* ----------------------------------------------------------------------
    start up LAMMPS
    allocate fundamental classes (memory, error, universe, input)
@@ -54,14 +57,18 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
   int inflag = 0;
   int screenflag = 0;
   int logflag = 0;
+  accelerator = NOACCEL;
+  asuffix = NULL;
+  offaccel = 0;
+  cuda = NULL;
+
   int iarg = 1;
 
   while (iarg < narg) {
     if (strcmp(arg[iarg],"-partition") == 0 || 
 	strcmp(arg[iarg],"-p") == 0) {
       universe->existflag = 1;
-      if (iarg+2 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
       iarg++;
       while (iarg < narg && arg[iarg][0] != '-') {
 	universe->add_world(arg[iarg]);
@@ -69,32 +76,37 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
       }
     } else if (strcmp(arg[iarg],"-in") == 0 || 
 	       strcmp(arg[iarg],"-i") == 0) {
-      if (iarg+2 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
       inflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-screen") == 0 || 
 	       strcmp(arg[iarg],"-s") == 0) {
-      if (iarg+2 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
       screenflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-log") == 0 || 
 	       strcmp(arg[iarg],"-l") == 0) {
-      if (iarg+2 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
       logflag = iarg + 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-var") == 0 || 
 	       strcmp(arg[iarg],"-v") == 0) {
-      if (iarg+3 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+3 > narg) error->universe_all("Invalid command-line argument");
       iarg += 2;
       while (iarg < narg && arg[iarg][0] != '-') iarg++;
     } else if (strcmp(arg[iarg],"-echo") == 0 || 
 	       strcmp(arg[iarg],"-e") == 0) {
-      if (iarg+2 > narg) 
-	error->universe_all("Invalid command-line argument");
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"-accel") == 0 || 
+	       strcmp(arg[iarg],"-a") == 0) {
+      if (iarg+2 > narg) error->universe_all("Invalid command-line argument");
+      if (strcmp(arg[iarg+1],"opt") == 0) accelerator = OPT;
+      else if (strcmp(arg[iarg+1],"gpu") == 0) accelerator = GPU;
+      else if (strcmp(arg[iarg+1],"cuda") == 0) accelerator = USERCUDA;
+      else error->universe_all("Invalid command-line argument");
+      asuffix = new char[8];
+      strcpy(asuffix,arg[iarg+1]);
       iarg += 2;
     } else error->universe_all("Invalid command-line argument");
   }
@@ -265,6 +277,16 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
   if (mpisize != sizeof(bigint))
       error->all("MPI_LMP_BIGINT and bigint in lmptype.h are not compatible");
 
+  // check consistency of -a switch with installed packages
+  // for OPT and GPU, no problem if not installed
+  // for USER-CUDA, throw error if not installed
+
+  if (accelerator == USERCUDA) {
+    cuda = new Cuda(this);
+    if (!cuda->cuda_exists)
+      error->all("Command-line switch requires USER-CUDA package be installed");
+  }
+
   // allocate input class now that MPI is fully setup
 
   input = new Input(this,narg,arg);
@@ -285,6 +307,7 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
 LAMMPS::~LAMMPS()
 {
   destroy();
+  if (accelerator == USERCUDA) delete cuda;
 
   if (universe->nworlds == 1) {
     if (logfile) fclose(logfile);
@@ -296,6 +319,8 @@ LAMMPS::~LAMMPS()
 
   if (world != universe->uworld) MPI_Comm_free(&world);
 
+  delete [] asuffix;
+
   delete input;
   delete universe;
   delete error;
@@ -305,17 +330,28 @@ LAMMPS::~LAMMPS()
 /* ----------------------------------------------------------------------
    allocate single instance of top-level classes
    fundamental classes are allocated in constructor
+   some classes have accelerator variants
 ------------------------------------------------------------------------- */
 
 void LAMMPS::create()
 {
   atom = new Atom(this);
-  neighbor = new Neighbor(this);
-  comm = new Comm(this);
-  domain = new Domain(this);
+
+  if (accelerator == USERCUDA) neighbor = new NeighborCuda(this);
+  else neighbor = new Neighbor(this);
+
+  if (accelerator == USERCUDA) comm = new CommCuda(this);
+  else comm = new Comm(this);
+
+  if (accelerator == USERCUDA) domain = new DomainCuda(this);
+  else domain = new Domain(this);
+
   group = new Group(this);
   force = new Force(this);    // must be after group, to create temperature
-  modify = new Modify(this);
+
+  if (accelerator == USERCUDA) modify = new ModifyCuda(this);
+  else modify = new Modify(this);
+
   output = new Output(this);  // must be after group, so "all" exists
                               // must be after modify so can create Computes
   update = new Update(this);  // must be after output, force, neighbor
@@ -328,6 +364,8 @@ void LAMMPS::create()
 
 void LAMMPS::init()
 {
+  if (accelerator == USERCUDA) cuda->accelerator(0,NULL);
+ 
   update->init();
   force->init();         // pair must come after update due to minimizer
   domain->init();
diff --git a/src/lammps.h b/src/lammps.h
index 1c3f81600e..deaa10ebd9 100644
--- a/src/lammps.h
+++ b/src/lammps.h
@@ -42,6 +42,11 @@ class LAMMPS {
   FILE *screen;                  // screen output
   FILE *logfile;                 // logfile
 
+  int accelerator;               // accelerator flag
+  char *asuffix;                 // accelerator suffix
+  int offaccel;                  // 1 if accelerator flag currently disabled
+  class Cuda *cuda;              // CUDA accelerator class
+
   LAMMPS(int, char **, MPI_Comm);
   ~LAMMPS();
   void create();
diff --git a/src/math_extra.h b/src/math_extra.h
index 1e05c5d728..a5fdc81ceb 100755
--- a/src/math_extra.h
+++ b/src/math_extra.h
@@ -63,8 +63,8 @@ namespace MathExtra {
   inline void transpose_matvec(const double *ex, const double *ey, 
 			       const double *ez, const double *v,
 			       double *ans);
-  inline void transpose_times_diag3(const double mat[3][3], const double*vec,
-                                    double ans[3][3]);
+  inline void transpose_diag3(const double mat[3][3], const double*vec,
+			      double ans[3][3]);
   inline void vecmat(const double *v, const double m[3][3], double *ans);
   inline void scalar_times3(const double f, double m[3][3]); 
 
@@ -91,10 +91,6 @@ namespace MathExtra {
   inline void invquatvec(double *a, double *b, double *c);
   inline void axisangle_to_quat(const double *v, const double angle,
                                 double *quat);
-  inline void matvec_rows(double *x, double *y, double *z, 
-			  double *b, double *c);
-  inline void matvec_cols(double *x, double *y, double *z,
-			  double *b, double *c);
 
   void angmom_to_omega(double *m, double *ex, double *ey, double *ez,
 		       double *idiag, double *w);
@@ -418,8 +414,8 @@ void MathExtra::transpose_matvec(const double *ex, const double *ey,
    transposed matrix times diagonal matrix
 ------------------------------------------------------------------------- */
 
-void MathExtra::transpose_times_diag3(const double m[3][3],
-                                      const double *d, double ans[3][3])
+void MathExtra::transpose_diag3(const double m[3][3], const double *d, 
+				double ans[3][3])
 {
   ans[0][0] = m[0][0]*d[0];
   ans[0][1] = m[1][0]*d[1];
@@ -562,30 +558,6 @@ void MathExtra::axisangle_to_quat(const double *v, const double angle,
   quat[3] = v[2]*sina;
 }
 
-/* ----------------------------------------------------------------------
-  matvec_rows: c = Ab, where rows of A are x, y, z
-------------------------------------------------------------------------- */
-
-void MathExtra::matvec_rows(double *x, double *y, double *z, 
-			    double *b, double *c)
-{
-  c[0] = x[0]*b[0] + x[1]*b[1] + x[2]*b[2];
-  c[1] = y[0]*b[0] + y[1]*b[1] + y[2]*b[2];
-  c[2] = z[0]*b[0] + z[1]*b[1] + z[2]*b[2];
-}
-
-/* ----------------------------------------------------------------------
-  matvec_cols: c = Ab, where columns of A are x, y, z
-------------------------------------------------------------------------- */
-
-void MathExtra::matvec_cols(double *x, double *y, double *z,
-			    double *b, double *c)
-{
-  c[0] = x[0]*b[0] + y[0]*b[1] + z[0]*b[2];
-  c[1] = x[1]*b[0] + y[1]*b[1] + z[1]*b[2];
-  c[2] = x[2]*b[0] + y[2]*b[1] + z[2]*b[2];
-}
-
 /* ----------------------------------------------------------------------
    Apply principal rotation generator about x to rotation matrix m
 ------------------------------------------------------------------------- */
diff --git a/src/modify.cpp b/src/modify.cpp
index 271e463229..0576cd2c83 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -274,7 +274,7 @@ void Modify::setup_pre_force(int vflag)
     for (int i = 0; i < n_pre_force; i++)
       fix[list_pre_force[i]]->setup_pre_force(vflag);
   else if (update->whichflag == 2)
-    for (int i = 0; i < n_pre_force; i++)
+    for (int i = 0; i < n_min_pre_force; i++)
       fix[list_min_pre_force[i]]->min_setup_pre_force(vflag);
 }
 
@@ -591,7 +591,7 @@ int Modify::min_reset_ref()
    add a new fix or replace one with same ID
 ------------------------------------------------------------------------- */
 
-void Modify::add_fix(int narg, char **arg)
+void Modify::add_fix(int narg, char **arg, char *suffix)
 {
   if (domain->box_exist == 0) 
     error->all("Fix command before simulation box is defined");
@@ -636,17 +636,39 @@ void Modify::add_fix(int narg, char **arg)
     }
   }
 
-  // create the Fix
+  // create the Fix, first with suffix appended
 
-  if (0) return;         // dummy line to enable else-if macro expansion
+  int success = 0;
+
+  if (suffix && lmp->offaccel == 0) {
+    char estyle[256];
+    sprintf(estyle,"%s/%s",arg[2],suffix);
+    success = 1;
+
+    if (0) return;
 
 #define FIX_CLASS
 #define FixStyle(key,Class) \
-  else if (strcmp(arg[2],#key) == 0) fix[ifix] = new Class(lmp,narg,arg);
+    else if (strcmp(estyle,#key) == 0) fix[ifix] = new Class(lmp,narg,arg);
 #include "style_fix.h"
+#undef FixStyle
 #undef FIX_CLASS
 
-  else error->all("Invalid fix style");
+    else success = 0;
+  }
+
+  if (!success) {
+    if (0) return;
+
+#define FIX_CLASS
+#define FixStyle(key,Class) \
+    else if (strcmp(arg[2],#key) == 0) fix[ifix] = new Class(lmp,narg,arg);
+#include "style_fix.h"
+#undef FixStyle
+#undef FIX_CLASS
+
+    else error->all("Invalid fix style");
+  }
 
   // set fix mask values and increment nfix (if new)
 
@@ -740,7 +762,7 @@ int Modify::find_fix(const char *id)
    add a new compute
 ------------------------------------------------------------------------- */
 
-void Modify::add_compute(int narg, char **arg)
+void Modify::add_compute(int narg, char **arg, char *suffix)
 {
   if (narg < 3) error->all("Illegal compute command");
 
@@ -758,18 +780,41 @@ void Modify::add_compute(int narg, char **arg)
       memory->srealloc(compute,maxcompute*sizeof(Compute *),"modify:compute");
   }
 
-  // create the Compute
+  // create the Compute, first with suffix appended
 
-  if (0) return;         // dummy line to enable else-if macro expansion
+  int success = 0;
+
+  if (suffix && lmp->offaccel == 0) {
+    char estyle[256];
+    sprintf(estyle,"%s/%s",arg[2],suffix);
+    success = 1;
+
+    if (0) return;
 
 #define COMPUTE_CLASS
 #define ComputeStyle(key,Class) \
-  else if (strcmp(arg[2],#key) == 0) \
-    compute[ncompute] = new Class(lmp,narg,arg);
+    else if (strcmp(estyle,#key) == 0) \
+      compute[ncompute] = new Class(lmp,narg,arg);
 #include "style_compute.h"
+#undef ComputeStyle
 #undef COMPUTE_CLASS
 
-  else error->all("Invalid compute style");
+    else success = 0;
+  }
+
+  if (!success) {
+    if (0) return;
+
+#define COMPUTE_CLASS
+#define ComputeStyle(key,Class) \
+    else if (strcmp(arg[2],#key) == 0) \
+      compute[ncompute] = new Class(lmp,narg,arg);
+#include "style_compute.h"
+#undef ComputeStyle
+#undef COMPUTE_CLASS
+
+    else error->all("Invalid compute style");
+  }
 
   ncompute++;
 }
diff --git a/src/modify.h b/src/modify.h
index 81d9401e69..0175cc06a0 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -40,22 +40,22 @@ class Modify : protected Pointers {
   class Compute **compute;
 
   Modify(class LAMMPS *);
-  ~Modify();
-  void init();
-  void setup(int);
-  void setup_pre_exchange();
-  void setup_pre_force(int);
-  void initial_integrate(int);
-  void post_integrate();
+  virtual ~Modify();
+  virtual void init();
+  virtual void setup(int);
+  virtual void setup_pre_exchange();
+  virtual void setup_pre_force(int);
+  virtual void initial_integrate(int);
+  virtual void post_integrate();
   void pre_decide();
-  void pre_exchange();
-  void pre_neighbor();
-  void pre_force(int);
-  void post_force(int);
-  void final_integrate();
-  void end_of_step();
-  double thermo_energy();
-  void post_run();
+  virtual void pre_exchange();
+  virtual void pre_neighbor();
+  virtual void pre_force(int);
+  virtual void post_force(int);
+  virtual void final_integrate();
+  virtual void end_of_step();
+  virtual double thermo_energy();
+  virtual void post_run();
 
   void setup_pre_force_respa(int, int);
   void initial_integrate_respa(int, int, int);
@@ -79,12 +79,12 @@ class Modify : protected Pointers {
   double max_alpha(double *);
   int min_dof();
 
-  void add_fix(int, char **);
+  void add_fix(int, char **, char *suffix = NULL);
   void modify_fix(int, char **);
   void delete_fix(const char *);
   int find_fix(const char *);
 
-  void add_compute(int, char **);
+  void add_compute(int, char **, char *suffix = NULL);
   void modify_compute(int, char **);
   void delete_compute(char *);
   int find_compute(char *);
@@ -98,7 +98,7 @@ class Modify : protected Pointers {
 
   bigint memory_usage();
 
- private:
+ protected:
 
   // lists of fixes to apply at different stages of timestep
 
diff --git a/src/neigh_list.h b/src/neigh_list.h
index dfb13b8d9a..63675503cf 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -73,6 +73,8 @@ class NeighList : protected Pointers {
   int **stencil_multi;             // list of bin offsets in each stencil
   double **distsq_multi;           // sq distances to bins in each stencil
 
+  class CudaNeighList *cuda_list;  // CUDA neighbor list
+
   NeighList(class LAMMPS *, int);
   ~NeighList();
   void grow(int);                       // grow maxlocal
@@ -80,6 +82,7 @@ class NeighList : protected Pointers {
   int **add_pages();                    // add pages to neigh list
   void copy_skip_info(int *, int **);   // copy skip info from a neigh request
   void print_attributes();              // debug routine
+  int get_maxlocal() {return maxatoms;}
   bigint memory_usage();
 
  private:
diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp
index 99f7be82c7..82c9f0b267 100644
--- a/src/neigh_request.cpp
+++ b/src/neigh_request.cpp
@@ -50,6 +50,7 @@ NeighRequest::NeighRequest(LAMMPS *lmp) : Pointers(lmp)
   special = 1;
   dnum = 0;
   ghost = 0;
+  cudable = 0;
 
   // default is no copy or skip
 
@@ -99,6 +100,7 @@ int NeighRequest::identical(NeighRequest *other)
   if (special != other->special) same = 0;
   if (dnum != other->dnum) same = 0;
   if (ghost != other->ghost) same = 0;
+  if (cudable != other->cudable) same = 0;
 
   if (copy != other->copy) same = 0;
   if (same_skip(other) == 0) same = 0;
@@ -126,6 +128,7 @@ int NeighRequest::same_kind(NeighRequest *other)
   if (half_from_full != other->half_from_full) same = 0;
   if (newton != other->newton) same = 0;
   if (ghost != other->ghost) same = 0;
+  if (cudable != other->cudable) same = 0;
 
   return same;
 }
@@ -174,4 +177,5 @@ void NeighRequest::copy_request(NeighRequest *other)
   newton = other->newton;
   dnum = other->dnum;
   ghost = other->ghost;
+  cudable = other->cudable;
 }
diff --git a/src/neigh_request.h b/src/neigh_request.h
index d653307985..afff5966c0 100644
--- a/src/neigh_request.h
+++ b/src/neigh_request.h
@@ -72,6 +72,10 @@ class NeighRequest : protected Pointers {
 
   int ghost;
 
+  // 1 if neighbor list build will be done on GPU
+
+  int cudable;
+
   // set by neighbor and pair_hybrid after all requests are made
   // these settings do not change kind value
 
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index fc4f1125e5..e8ecccc9a0 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -480,11 +480,11 @@ void Neighbor::init()
     // fix/compute requests:
     //   kind of request = half or full, occasional or not doesn't matter
     //   if request = half and non-skip pair half/respaouter exists,
-    //     become copy of that list
+    //     become copy of that list if cudable flag matches
     //   if request = full and non-skip pair full exists,
-    //     become copy of that list
+    //     become copy of that list if cudable flag matches
     //   if request = half and non-skip pair full exists,
-    //     become half_from_full of that list
+    //     become half_from_full of that list if cudable flag matches
     //   if no matches, do nothing, fix/compute list will be built directly
     //   ok if parent is copy list
 
@@ -534,6 +534,8 @@ void Neighbor::init()
 	  if (requests[i]->half && requests[j]->pair &&
 	      requests[j]->skip == 0 && requests[j]->respaouter) break;
 	}
+	if (j < nlist && requests[j]->cudable != requests[i]->cudable)
+	  j = nlist;
 	if (j < nlist) {
 	  requests[i]->copy = 1;
 	  lists[i]->listcopy = lists[j];
@@ -542,6 +544,8 @@ void Neighbor::init()
 	    if (requests[i]->half && requests[j]->pair &&
 		requests[j]->skip == 0 && requests[j]->full) break;
 	  }
+	  if (j < nlist && requests[j]->cudable != requests[i]->cudable)
+	    j = nlist;
 	  if (j < nlist) {
 	    requests[i]->half = 0;
 	    requests[i]->half_from_full = 1;
@@ -553,11 +557,13 @@ void Neighbor::init()
 
     // set ptrs to pair_build and stencil_create functions for each list
     // ptrs set to NULL if not set explicitly
+    // also set cudable to 0 if any neigh list request is not cudable
 
     for (i = 0; i < nlist; i++) {
       choose_build(i,requests[i]);
       if (style != NSQ) choose_stencil(i,requests[i]);
       else stencil_create[i] = NULL;
+      if (!requests[i]->cudable) cudable = 0;
     }
 
     // set each list's build/grow/stencil/ghost flags based on neigh request
diff --git a/src/neighbor.h b/src/neighbor.h
index 97e0f354c4..c2fa635d63 100644
--- a/src/neighbor.h
+++ b/src/neighbor.h
@@ -19,6 +19,8 @@
 namespace LAMMPS_NS {
 
 class Neighbor : protected Pointers {
+  friend class Cuda;
+
  public:
   int style;                       // 0,1,2 = nsq, bin, multi
   int every;                       // build every this many steps
@@ -29,6 +31,7 @@ class Neighbor : protected Pointers {
   int oneatom;                     // max # of neighbors for one atom
   int includegroup;                // only build pairwise lists for this group
   int build_once;                  // 1 if only build lists once per run
+  int cudable;                     // GPU <-> CPU communication flag for CUDA
 
   double skin;                     // skin distance
   double cutneighmin;              // min neighbor cutoff for all type pairs
@@ -60,20 +63,20 @@ class Neighbor : protected Pointers {
   int **improperlist;
 
   Neighbor(class LAMMPS *);
-  ~Neighbor();
-  void init();
-  int request(void *);         // another class requests a neighbor list
-  void print_lists_of_lists(); // debug print out
-  int decide();                // decide whether to build or not
-  int check_distance();        // check max distance moved since last build
-  void setup_bins();           // setup bins based on box and cutoff
-  void build();                // create all neighbor lists (pair,bond)
-  void build_one(int);         // create a single neighbor list
-  void set(int, char **);      // set neighbor style and skin distance
+  virtual ~Neighbor();
+  virtual void init();
+  int request(void *);              // another class requests a neighbor list
+  void print_lists_of_lists();      // debug print out
+  int decide();                     // decide whether to build or not
+  virtual int check_distance();     // check max distance moved since last build
+  void setup_bins();                // setup bins based on box and cutoff
+  virtual void build();             // create all neighbor lists (pair,bond)
+  void build_one(int);              // create a single neighbor list
+  void set(int, char **);           // set neighbor style and skin distance
   void modify_params(int, char**);  // modify parameters that control builds
   bigint memory_usage();
   
- private:
+ protected:
   int me,nprocs;
 
   int maxatom;                     // size of atom-based NeighList arrays
@@ -162,7 +165,7 @@ class Neighbor : protected Pointers {
   int coord2bin(double *, int &, int &, int&); // ditto
 
   int exclusion(int, int, int, int, int *, int *);  // test for pair exclusion
-  void choose_build(int, class NeighRequest *);
+  virtual void choose_build(int, class NeighRequest *);
   void choose_stencil(int, class NeighRequest *);
 
   // pairwise build functions
diff --git a/src/output.cpp b/src/output.cpp
index 87e85286be..95808a9b5e 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -32,9 +32,12 @@
 #include "write_restart.h"
 #include "memory.h"
 #include "error.h"
+#include "accelerator.h"
 
 using namespace LAMMPS_NS;
 
+enum{NOACCEL,OPT,GPU,USERCUDA};     // same as lammps.cpp
+
 #define DELTA 1
 
 #define MYMIN(a,b) ((a) < (b) ? (a) : (b))
@@ -52,18 +55,18 @@ Output::Output(LAMMPS *lmp) : Pointers(lmp)
   newarg[0] = (char *) "thermo_temp";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "temp";
-  modify->add_compute(3,newarg);
+  modify->add_compute(3,newarg,lmp->asuffix);
 
   newarg[0] = (char *) "thermo_press";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pressure";
   newarg[3] = (char *) "thermo_temp";
-  modify->add_compute(4,newarg);
+  modify->add_compute(4,newarg,lmp->asuffix);
 
   newarg[0] = (char *) "thermo_pe";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pe";
-  modify->add_compute(3,newarg);
+  modify->add_compute(3,newarg,lmp->asuffix);
 
   delete [] newarg;
 
@@ -243,8 +246,13 @@ void Output::write(bigint ntimestep)
 {
   // next_dump does not force output on last step of run
   // wrap dumps that invoke computes with clear/add
+  // download data from GPU if necessary
 
   if (next_dump_any == ntimestep) {
+
+    if (lmp->accelerator == USERCUDA && !lmp->cuda->oncpu)
+      lmp->cuda->downloadAll();    
+    
     for (int idump = 0; idump < ndump; idump++) {
       if (next_dump[idump] == ntimestep && last_dump[idump] != ntimestep) {
         if (dump[idump]->clearstep) modify->clearstep_compute();
@@ -267,8 +275,13 @@ void Output::write(bigint ntimestep)
 
   // next_restart does not force output on last step of run
   // for toggle = 0, replace "*" with current timestep in restart filename
+  // download data from GPU if necessary
 
   if (next_restart == ntimestep && last_restart != ntimestep) {
+
+    if (lmp->accelerator == USERCUDA && !lmp->cuda->oncpu) 
+      lmp->cuda->downloadAll();    
+    
     if (restart_toggle == 0) {
       char *file = new char[strlen(restart1) + 16];
       char *ptr = strchr(restart1,'*');
diff --git a/src/pair.cpp b/src/pair.cpp
index b8331086f7..dde19c1d05 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -30,6 +30,7 @@
 #include "comm.h"
 #include "force.h"
 #include "update.h"
+#include "accelerator.h"
 #include "memory.h"
 #include "error.h"
 
@@ -40,6 +41,7 @@ using namespace LAMMPS_NS;
 
 enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
 enum{R,RSQ,BMP};
+enum{NOACCEL,OPT,GPU,USERCUDA};     // same as lammps.cpp
 
 /* ---------------------------------------------------------------------- */
 
@@ -325,6 +327,9 @@ void Pair::ev_setup(int eflag, int vflag)
     if (vflag_atom == 0) vflag_either = 0;
     if (vflag_either == 0 && eflag_either == 0) evflag = 0;
   } else vflag_fdotr = 0;
+
+  if (lmp->accelerator == USERCUDA)
+    lmp->cuda->evsetup_eatom_vatom(eflag_atom,vflag_atom);
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/pair_buck.h b/src/pair_buck.h
index 70d571c8b8..c9e13d6cfa 100644
--- a/src/pair_buck.h
+++ b/src/pair_buck.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairBuck : public Pair {
  public:
   PairBuck(class LAMMPS *);
-  ~PairBuck();
+  virtual ~PairBuck();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -39,7 +39,7 @@ class PairBuck : public Pair {
   double single(int, int, int, int, double, double, double, double &);
   void *extract(char *, int &);
 
- private:
+ protected:
   double cut_global;
   double **cut;
   double **a,**rho,**c;
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index a3d0dafd32..5fd45a8f60 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -227,6 +227,8 @@ void PairHybrid::settings(int narg, char **arg)
   // exception is 1st arg of reax/c style, which is non-numeric
   // need a better way to skip these exceptions
 
+  int dummy;
+
   nstyles = 0;
   i = 0;
   while (i < narg) {
@@ -237,7 +239,7 @@ void PairHybrid::settings(int narg, char **arg)
       error->all("Pair style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0) 
       error->all("Pair style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_pair(arg[i]);
+    styles[nstyles] = force->new_pair(arg[i],NULL,dummy);
     keywords[nstyles] = new char[strlen(arg[i])+1];
     strcpy(keywords[nstyles],arg[i]);
     istyle = i;
@@ -574,14 +576,14 @@ void PairHybrid::read_restart(FILE *fp)
   // each sub-style is created via new_pair()
   // each reads its settings, but no coeff info
 
-  int n;
+  int n,dummy;
   for (int m = 0; m < nstyles; m++) {
     if (me == 0) fread(&n,sizeof(int),1,fp);
     MPI_Bcast(&n,1,MPI_INT,0,world);
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_pair(keywords[m]);
+    styles[m] = force->new_pair(keywords[m],NULL,dummy);
     styles[m]->read_restart_settings(fp);
   }
 }
diff --git a/src/pair_lj_cut_coul_debye.h b/src/pair_lj_cut_coul_debye.h
index f112e5df7e..d008884801 100644
--- a/src/pair_lj_cut_coul_debye.h
+++ b/src/pair_lj_cut_coul_debye.h
@@ -27,13 +27,14 @@ namespace LAMMPS_NS {
 class PairLJCutCoulDebye : public PairLJCutCoulCut {
  public:
   PairLJCutCoulDebye(class LAMMPS *);
+  virtual ~PairLJCutCoulDebye() {}
   void compute(int, int);
   void settings(int, char **);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
   double single(int, int, int, int, double, double, double, double &);
 
- private:
+ protected:
   double kappa;
 };
 
diff --git a/src/pair_lj_smooth.h b/src/pair_lj_smooth.h
index 1e0faf9c83..6b2fc150bb 100644
--- a/src/pair_lj_smooth.h
+++ b/src/pair_lj_smooth.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairLJSmooth : public Pair {
  public:
   PairLJSmooth(class LAMMPS *);
-  ~PairLJSmooth();
+  virtual ~PairLJSmooth();
   void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
@@ -38,7 +38,7 @@ class PairLJSmooth : public Pair {
   void read_restart_settings(FILE *);
   double single(int, int, int, int, double, double, double, double &);
 
- private:
+ protected:
   double cut_inner_global,cut_global;
   double **cut,**cut_inner,**cut_inner_sq;
   double **epsilon,**sigma;
diff --git a/src/thermo.cpp b/src/thermo.cpp
index 3e642a925e..afb32ac9a2 100644
--- a/src/thermo.cpp
+++ b/src/thermo.cpp
@@ -219,12 +219,16 @@ void Thermo::init()
   }
 
   // find current ptr for each Compute ID
+  // cudable = 0 if any compute used by Thermo is non-CUDA
+
+  cudable = 1;
 
   int icompute;
   for (i = 0; i < ncompute; i++) {
     icompute = modify->find_compute(id_compute[i]);
     if (icompute < 0) error->all("Could not find thermo compute ID");
     computes[i] = modify->compute[icompute];
+    cudable = cudable && computes[i]->cudable;
   }
 
   // find current ptr for each Fix ID
diff --git a/src/thermo.h b/src/thermo.h
index 9b9133c6a6..49e2f79e57 100644
--- a/src/thermo.h
+++ b/src/thermo.h
@@ -26,6 +26,7 @@ class Thermo : protected Pointers {
   char *style;
   int normflag;          // 0 if do not normalize by atoms, 1 if normalize
   int modified;          // 1 if thermo_modify has been used, else 0
+  int cudable;           // 1 if all computes used are cudable
 
   Thermo(class LAMMPS *, int, char **);
   ~Thermo();
@@ -73,6 +74,7 @@ class Thermo : protected Pointers {
   int *field2index;      // which compute,fix,variable calcs this field
   int *argindex1;        // indices into compute,fix scalar,vector
   int *argindex2;
+
                          // data for keyword-specific Compute objects
                          // index = where they are in computes list
                          // id = ID of Compute objects
diff --git a/src/update.cpp b/src/update.cpp
index c6b135eec6..c8efe4b796 100644
--- a/src/update.cpp
+++ b/src/update.cpp
@@ -25,11 +25,14 @@
 #include "region.h"
 #include "compute.h"
 #include "output.h"
+#include "accelerator.h"
 #include "memory.h"
 #include "error.h"
 
 using namespace LAMMPS_NS;
 
+enum{NOACCEL,OPT,GPU,USERCUDA};     // same as lammps.cpp
+
 /* ---------------------------------------------------------------------- */
 
 Update::Update(LAMMPS *lmp) : Pointers(lmp)
@@ -53,17 +56,16 @@ Update::Update(LAMMPS *lmp) : Pointers(lmp)
   unit_style = NULL;
   set_units("lj");
 
+  integrate_style = NULL;
+  integrate = NULL;
+  minimize_style = NULL;
+  minimize = NULL;
+
   str = (char *) "verlet";
-  n = strlen(str) + 1;
-  integrate_style = new char[n];
-  strcpy(integrate_style,str);
-  integrate = new Verlet(lmp,0,NULL);
+  create_integrate(1,&str,lmp->asuffix);
 
   str = (char *) "cg";
-  n = strlen(str) + 1;
-  minimize_style = new char[n];
-  strcpy(minimize_style,str);
-  minimize = new MinCG(lmp);
+  create_minimize(1,&str);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -191,26 +193,70 @@ void Update::set_units(const char *style)
 
 /* ---------------------------------------------------------------------- */
 
-void Update::create_integrate(int narg, char **arg)
+void Update::create_integrate(int narg, char **arg, char *suffix)
 {
   if (narg < 1) error->all("Illegal run_style command");
 
   delete [] integrate_style;
   delete integrate;
 
-  if (0) return;      // dummy line to enable else-if macro expansion
+  int sflag;          // output flag, filled in by new_integrate()
+  new_integrate(arg[0],narg-1,&arg[1],suffix,sflag);
+  // store the style name, with "/suffix" appended when sflag is non-zero
+  if (sflag) {
+    char estyle[256]; // NOTE(review): fixed size, the sprintf below is unbounded
+    sprintf(estyle,"%s/%s",arg[0],suffix);
+    int n = strlen(estyle) + 1;
+    integrate_style = new char[n];
+    strcpy(integrate_style,estyle);
+  } else {
+    int n = strlen(arg[0]) + 1;
+    integrate_style = new char[n];
+    strcpy(integrate_style,arg[0]);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create the Integrate style, first trying style/suffix, then plain style
+   sflag is set to 1 if the suffixed style was created, else to 0
+------------------------------------------------------------------------- */
+
+void Update::new_integrate(char *style, int narg, char **arg,
+			   char *suffix, int &sflag)
+{
+  int success = 0;
+
+  if (suffix && lmp->offaccel == 0) {
+    sflag = 1;
+    char estyle[256];
+    sprintf(estyle,"%s/%s",style,suffix);
+    success = 1;
+    if (0) return;      // dummy line to enable else-if macro expansion
 
 #define INTEGRATE_CLASS
 #define IntegrateStyle(key,Class) \
-  else if (strcmp(arg[0],#key) == 0) integrate = new Class(lmp,narg-1,&arg[1]);
+    else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
 #include "style_integrate.h"
+#undef IntegrateStyle
 #undef INTEGRATE_CLASS
 
-  else error->all("Illegal run_style command");
+    else success = 0;
+  }
 
-  int n = strlen(arg[0]) + 1;
-  integrate_style = new char[n];
-  strcpy(integrate_style,arg[0]);
+  // plain-style fallback; clear sflag only here so a suffix match is kept
+  if (!success) {
+    sflag = 0;
+    if (0) return;
+
+#define INTEGRATE_CLASS
+#define IntegrateStyle(key,Class) \
+    else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg);
+#include "style_integrate.h"
+#undef IntegrateStyle
+#undef INTEGRATE_CLASS
+
+    else error->all("Illegal integrate style");
+  }
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/update.h b/src/update.h
index bde3512457..692f21548c 100644
--- a/src/update.h
+++ b/src/update.h
@@ -48,10 +48,14 @@ class Update : protected Pointers {
   ~Update();
   void init();
   void set_units(const char *);
-  void create_integrate(int, char **);
+  void create_integrate(int, char **, char *);
   void create_minimize(int, char **);
   void reset_timestep(int, char **);
   bigint memory_usage();
+
+ private:
+  void new_integrate(char *, int, char **, char *, int &);
+
 };
 
 }
diff --git a/src/verlet.h b/src/verlet.h
index 211b5653f6..8b8c90b7a9 100644
--- a/src/verlet.h
+++ b/src/verlet.h
@@ -27,14 +27,14 @@ namespace LAMMPS_NS {
 class Verlet : public Integrate {
  public:
   Verlet(class LAMMPS *, int, char **);
-  ~Verlet() {}
+  virtual ~Verlet() {}
   void init();
   void setup();
   void setup_minimal(int);
   void run(int);
   void cleanup();
 
- private:
+ protected:
   int triclinic;                    // 0 if domain is orthog, 1 if triclinic
   int torqueflag;                   // zero out arrays every step
   int erforceflag;
diff --git a/src/version.h b/src/version.h
index 67eb15527e..2e56b98b95 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "4 May 2011"
+#define LAMMPS_VERSION "23 May 2011"
diff --git a/tools/reax/README.txt b/tools/reax/README.txt
index 8fbc595ac2..2a7ff29f26 100644
--- a/tools/reax/README.txt
+++ b/tools/reax/README.txt
@@ -15,3 +15,8 @@ bondConnectCheck.f90: reads the output of fix reax/bonds.
    Compile it using FORTRAN compiler
    To test, run it with bonds.reax
    Contact: Paul Liangliang Huang <lhuang4@ncsu.edu>
+
+reaxc_bond.pl: reads the bonding information in the
+                .trj file produced by pair_style reax/c and
+                outputs molecule counts for each frame.  
+
diff --git a/tools/reax/reaxc_bond.pl b/tools/reax/reaxc_bond.pl
new file mode 100755
index 0000000000..9b0fa50672
--- /dev/null
+++ b/tools/reax/reaxc_bond.pl
@@ -0,0 +1,152 @@
+#!/usr/bin/perl
+use Getopt::Long;
+Getopt::Long::Configure ('bundling');
+
+#################################################################
+#                                                               #
+#  This script is designed to take the bond information from a  #
+#  Lammps/reaxc *.trj file and output a molecular fraction file #
+#  for each frame.                                              #
+#                                                               #
+#  written by Mike Russo, PSU                                   #
+#  modified by Aidan Thompson, 5/12/2011                        #
+#                                                               #
+#  The required .trj file is generated by running LAMMPS        #
+#  with pair_style reax/c and the following settings in         #
+#  the corresponding reax/c control file                        #
+#     write_freq              25   ! write trajectory after so many steps
+#     traj_compress           0    ! 0: no compression  1: uses zlib to compress trajectory output
+#     traj_title              TATB ! (no white spaces)
+#     atom_info               0    ! 0: no atom info, 1: print basic atom info in the trajectory file
+#     atom_forces             0    ! 0: basic atom format, 1: print force on each atom in the trajectory file
+#     atom_velocities         0    ! 0: basic atom format, 1: print the velocity of each atom in the trajectory file
+#     bond_info               1    ! 0: do not print bonds, 1: print bonds in the trajectory file
+#     angle_info              0    ! 0: do not print angles, 1: print angles in the trajectory file 
+#                                                               #
+#################################################################
+
+#################################################################
+#  Setting up some default variables, and options for the user  #
+#  to input.                                                    #
+#################################################################
+$in_file = "bonds.trj";
+@test = qw(C H O N);
+GetOptions ('f|file=s' => \$in_file, 'a|atoms=s' => \@atoms, 'h|help' => \$help);
+if($help) {
+    print "Options for this program:\n-f --file for input file default= bonds.trj\n-a --atoms atom types (in correct order and input separately) default= @test\n";
+    exit;
+}
+open(INPUT, '<', $in_file) or die "Cannot open $in_file: $!";
+open(OUTPUT, '>', 'frac.dat') or die "Cannot open output file: $!";
+
+if(@atoms) {
+    @test = @atoms;
+}
+
+print "Input for this run:\n  Input file = $in_file\n Atom types = @test\n";
+#################################################################
+
+#################################################################
+#  Main loop of the script: walks each frame's bond list.       #
+#################################################################
+$section = 0;     # number of chars_to_skip_section markers seen so far
+$at_count = -1;   # (data lines counted while $section == 2) - 1
+while(<INPUT>) {
+    if(/chars_to_skip_section/) {
+        $section++;
+        &bonds if($section > 2);  # a bond section just ended: report it
+        next;
+    }
+    next if($section <= 0);  #skipping the header section
+    next if(/\s*[A-Za-z]/); #skip text lines
+
+    # Split explicitly: since Perl 5.12 a bare "split;" no longer fills @_.
+    @field = split;
+    next if(@field < 2);     # blank or malformed line: nothing to record
+
+    # Section 1: field 1 is an atom number (stored 0-based), field 2 its type.
+    if ($section == 1) {
+        $at_type[$field[0] - 1] = $field[1];
+        next;
+    }
+
+    # Later sections: fields 1 and 2 are the two atoms of a bond (made 0-based).
+    ($a1, $a2) = ($field[0] - 1, $field[1] - 1);
+# Add i-j and j-i entries
+    push @{$list[$a1]}, $a2;
+    push @{$list[$a2]}, $a1;
+    $at_count++ if($section == 2);
+}
+$section++;
+&bonds;
+close(INPUT);
+#################################################################
+
+#################################################################
+#  Subroutine bonds: Uses the bond information to generate a    #
+#  count for each species, put them into molecules, and then    #
+#  count the number of each molecule type.                      #
+#################################################################
+sub bonds {
+  @flag = ();   # per-frame marks: atom already placed in a molecule
+  $k = 0;       # number of molecules found so far in this frame
+  for(0..$#list) {
+      next if($flag[$_]);   # atom already belongs to an earlier molecule
+      push @{$full_list[$k]}, $_;
+      # Walk by index: the member list grows while it is being scanned.
+      for($p = 0; $p <= $#{$full_list[$k]}; $p++) {
+          $atom = $full_list[$k][$p];
+          for($o = 0; $o <= $#{$list[$atom]}; $o++) {
+              $nbr = $list[$atom][$o];
+              unless(grep { $_ == $nbr } @{$full_list[$k]}) {
+                  push @{$full_list[$k]}, $nbr;
+                  $flag[$nbr] = 1;
+              }
+          }
+      }
+      $k++;
+  }
+
+### Output section ###
+  $frame = $section - 2;
+  open(OUTPUT2, '>', "temp_$frame.dat") or die "Cannot open temp file: $!";
+  print OUTPUT2 "Frame # $frame\n";
+  for($m = 0; $m < $k; $m++) {     #cycle through each molecule
+      foreach $atom ((@{$full_list[$m]})) { #for each atom in this molecule
+          ${"$test[$at_type[$atom]]"} += 1;        #Create variable named C,H,O, etc and set it to count
+      }
+      print OUTPUT2 "Mol $m = ";
+      foreach $atom (@test) {
+          print OUTPUT2 "$atom${$atom}" if(${$atom} > 0);
+      }
+      print OUTPUT2 "\n";
+      for($r = 0; $r <= $#test; $r++) {
+          ${"$test[$r]"} = 0;
+      }
+  }
+
+  close (OUTPUT2); #close the temp file as output
+  open(INPUT3, '<', "temp_$frame.dat") or die "Cannot open temp file: $!"; # open it as input
+  while(<INPUT3>) {
+      next if(/Frame/);
+      my @word = split;   # explicit: bare "split;" stopped filling @_ in Perl 5.12
+      push @mol_list, $word[3] unless(grep { $_ eq $word[3] } @mol_list);
+      ${"$word[3]"}++;
+  }
+  close(INPUT3);
+  print OUTPUT "Frame # $frame\n";
+  foreach $mol (@mol_list) {
+      printf OUTPUT "%4d  %s\n", ${"$mol"}, $mol;
+      ${"$mol"} = 0;
+  }
+  @mol_list = ();
+###
+
+### Cleanup between frames ###
+  # Clear every slot (not just 0..$at_count), keeping the length of @list.
+  @full_list = ();
+  @{$list[$_]} = () for(0..$#list);
+###
+}
+#################################################################
+

angle_coeff	angle_style	atom_modify	atom_style	bond_coeff	bond_style
boundary	change_box	clear	communicate	compute	compute_modify
create_atoms	create_box	delete_atoms	delete_bonds	dielectric	dihedral_coeff
dihedral_style	dimension	displace_atoms	displace_box	dump	dump_modify
echo	fix	fix_modify	group	if	improper_coeff
improper_style	include	jump	kspace_modify	kspace_style	label
lattice	log	mass	minimize	min_modify	min_style
neb	neigh_modify	neighbor	newton	next	pair_coeff
pair_modify	pair_style	pair_write	prd	print	processors
read_data	read_restart	region	replicate	reset_timestep	restart
run	run_style	set	shell	special_bonds	tad
temper	thermo	thermo_modify	thermo_style	timestep	uncompute
undump	unfix	units	variable	velocity	write_restart +
accelerator	angle_coeff	angle_style	atom_modify	atom_style	bond_coeff
bond_style	boundary	change_box	clear	communicate	compute
compute_modify	create_atoms	create_box	delete_atoms	delete_bonds	dielectric
dihedral_coeff	dihedral_style	dimension	displace_atoms	displace_box	dump
dump_modify	echo	fix	fix_modify	group	if
improper_coeff	improper_style	include	jump	kspace_modify	kspace_style
label	lattice	log	mass	minimize	min_modify
min_style	neb	neigh_modify	neighbor	newton	next
pair_coeff	pair_modify	pair_style	pair_write	prd	print
processors	read_data	read_restart	region	replicate	reset_timestep
restart	run	run_style	set	shell	special_bonds
tad	temper	thermo	thermo_modify	thermo_style	timestep
uncompute	undump	unfix	units	variable	velocity
write_restart
adapt	addforce	aveforce	ave/atom	ave/correlate	ave/histo	ave/spatial	ave/time
bond/break	bond/create	bond/swap	box/relax	deform	deposit	drag	dt/reset
efield	enforce2d	evaporate	external	freeze	gravity	heat	indent
langevin	lineforce	momentum	move	msst	neb	nph	nph/asphere
nph/sphere	npt	npt/asphere	npt/sphere	nve	nve/asphere	nve/limit	nve/noforce
nve/sphere	nvt	nvt/asphere	nvt/sllod	nvt/sphere	orient/fcc	planeforce	poems
pour	press/berendsen	print	qeq/comb	reax/bonds	recenter	rigid	rigid/nve
rigid/nvt	setforce	shake	spring	spring/rg	spring/self	srd	store/force
store/state	temp/berendsen	temp/rescale	thermal/conductivity	tmd	ttm	viscosity	viscous
wall/colloid	wall/gran	wall/harmonic	wall/lj126	wall/lj93	wall/reflect	wall/region	wall/srd +
efield	enforce2d	evaporate	external	freeze	gpu	gravity	heat
indent	langevin	lineforce	momentum	move	msst	neb	nph
nph/asphere	nph/sphere	npt	npt/asphere	npt/sphere	nve	nve/asphere	nve/limit
nve/noforce	nve/sphere	nvt	nvt/asphere	nvt/sllod	nvt/sphere	orient/fcc	planeforce
poems	pour	press/berendsen	print	qeq/comb	reax/bonds	recenter	rigid
rigid/nve	rigid/nvt	setforce	shake	spring	spring/rg	spring/self	srd
store/force	store/state	temp/berendsen	temp/rescale	thermal/conductivity	tmd	ttm	viscosity
viscous	wall/colloid	wall/gran	wall/harmonic	wall/lj126	wall/lj93	wall/reflect	wall/region
wall/srd