diff --git a/doc/Section_commands.html b/doc/Section_commands.html index ca64e36c5d..74ed768f3c 100644 --- a/doc/Section_commands.html +++ b/doc/Section_commands.html @@ -311,7 +311,7 @@ included when LAMMPS was built. Not all packages are included in a default LAMMPS build. These dependencies are listed as Restrictions in the command's documentation.

-
+
@@ -335,7 +335,7 @@ in the command's documentation.

See the fix command for one-line descriptions of each style or click on the style itself for a full description:

-
angle_coeff, angle_style, atom_modify, atom_style, bond_coeff, bond_style
boundary, change_box, clear, communicate, compute, compute_modify
create_atoms, create_box, delete_atoms, delete_bonds, dielectric, dihedral_coeff
+
@@ -351,7 +351,7 @@ of each style or click on the style itself for a full description:

These are fix styles contributed by users, which can be used if LAMMPS is built with the appropriate package.

-
adapt, addforce, aveforce, ave/atom, ave/correlate, ave/histo, ave/spatial, ave/time
bond/break, bond/create, bond/swap, box/relax, deform, deposit, drag, dt/reset
efield, enforce2d, evaporate, external, freeze, gravity, heat, indent
+
atc, imd, langevin/eff, nph/eff, npt/eff, nve/eff
nvt/eff, nvt/sllod/eff, qeq/reax, smd, temp/rescale/eff
@@ -363,7 +363,7 @@ of each style or click on the style itself for a full description:

See the compute command for one-line descriptions of each style or click on the style itself for a full description:

-
+
@@ -377,7 +377,7 @@ each style or click on the style itself for a full description:

These are compute styles contributed by users, which can be used if LAMMPS is built with the appropriate package.

-
angle/local, atom/molecule, bond/local, centro/atom, cna/atom, com
com/molecule, coord/atom, damage/atom, dihedral/local, displace/atom, erotate/asphere
erotate/sphere, event/displace, group/group, gyration, gyration/molecule, heat/flux
+
ackland/atom, ke/eff, ke/atom/eff, temp/eff, temp/deform/eff, temp/region/eff
@@ -388,7 +388,7 @@ each style or click on the style itself for a full description:

See the pair_style command for an overview of pair potentials. Click on the style itself for a full description:

-
+
@@ -400,20 +400,22 @@ potentials. Click on the style itself for a full description: - - - - - - + + + + + +
none, hybrid, hybrid/overlay, airebo
born, born/coul/long, buck, buck/coul/cut
buck/coul/long, colloid, comb, coul/cut
hbond/dreiding/morse, lj/charmm/coul/charmm, lj/charmm/coul/charmm/implicit, lj/charmm/coul/long
lj/charmm/coul/long/opt, lj/class2, lj/class2/coul/cut, lj/class2/coul/long
lj/cut, lj/cut/gpu, lj/cut/opt, lj/cut/coul/cut
lj/cut/coul/debye, lj/cut/coul/long, lj/cut/coul/long/tip4p, lj/expand
lj/gromacs, lj/gromacs/coul/gromacs, lj/smooth, lj96/cut
lubricate, meam, morse, morse/opt
peri/lps, peri/pmb, reax, resquared
soft, sw, table, tersoff
tersoff/zbl, yukawa, yukawa/colloid +
lj/cut/coul/cut/gpu, lj/cut/coul/debye, lj/cut/coul/long, lj/cut/coul/long/gpu
lj/cut/coul/long/tip4p, lj/expand, lj/gromacs, lj/gromacs/coul/gromacs
lj/smooth, lj96/cut, lj96/cut/gpu, lubricate
meam, morse, morse/opt, peri/lps
peri/pmb, reax, resquared, soft
sw, table, tersoff, tersoff/zbl
yukawa, yukawa/colloid

These are pair styles contributed by users, which can be used if LAMMPS is built with the appropriate package.

-
- -
buck/coul, cg/cmm, cg/cmm/coul/cut, cg/cmm/coul/long
eam/cd, eff/cut, lj/coul, reax/c +
@@ -423,7 +425,7 @@ potentials. Click on the style itself for a full description:

See the bond_style command for an overview of bond potentials. Click on the style itself for a full description:

-
+
none, hybrid, class2, fene
fene/expand, harmonic, morse, nonlinear
quartic, table @@ -436,7 +438,7 @@ potentials. Click on the style itself for a full description:

See the angle_style command for an overview of angle potentials. Click on the style itself for a full description:

-
+
none, hybrid, charmm, class2
cosine, cosine/delta, cosine/periodic, cosine/squared
harmonic, table @@ -445,7 +447,7 @@ angle potentials. Click on the style itself for a full description:

These are angle styles contributed by users, which can be used if LAMMPS is built with the appropriate package.

-
+
cg/cmm
@@ -457,7 +459,7 @@ angle potentials. Click on the style itself for a full description: of dihedral potentials. Click on the style itself for a full description:

- @@ -470,7 +472,7 @@ description: of improper potentials. Click on the style itself for a full description:

- @@ -482,14 +484,14 @@ description:

See the kspace_style command for an overview of Kspace solvers. Click on the style itself for a full description:

-

These are Kspace solvers contributed by users, which can be used if LAMMPS is built with the appropriate package.

- diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt index 2c0de9615f..4a8d83d524 100644 --- a/doc/Section_commands.txt +++ b/doc/Section_commands.txt @@ -605,14 +605,17 @@ potentials. Click on the style itself for a full description: "lj/cut/gpu"_pair_lj.html, "lj/cut/opt"_pair_lj.html, "lj/cut/coul/cut"_pair_lj.html, +"lj/cut/coul/cut/gpu"_pair_lj.html, "lj/cut/coul/debye"_pair_lj.html, "lj/cut/coul/long"_pair_lj.html, +"lj/cut/coul/long/gpu"_pair_lj.html, "lj/cut/coul/long/tip4p"_pair_lj.html, "lj/expand"_pair_lj_expand.html, "lj/gromacs"_pair_gromacs.html, "lj/gromacs/coul/gromacs"_pair_gromacs.html, "lj/smooth"_pair_lj_smooth.html, "lj96/cut"_pair_lj96_cut.html, +"lj96/cut/gpu"_pair_lj96_cut.html, "lubricate"_pair_lubricate.html, "meam"_pair_meam.html, "morse"_pair_morse.html, @@ -634,8 +637,10 @@ These are pair styles contributed by users, which can be used if "buck/coul"_pair_buck_coul.html, "cg/cmm"_pair_cmm.html, +"cg/cmm/gpu"_pair_cmm.html, "cg/cmm/coul/cut"_pair_cmm.html, "cg/cmm/coul/long"_pair_cmm.html, +"cg/cmm/coul/long/gpu"_pair_cmm.html, "eam/cd"_pair_eam.html, "eff/cut"_pair_eff.html, "lj/coul"_pair_lj_coul.html, diff --git a/doc/Section_start.html b/doc/Section_start.html index 41a45d5170..4be3c89537 100644 --- a/doc/Section_start.html +++ b/doc/Section_start.html @@ -403,9 +403,9 @@ LAMMPS is built. the files in these packages require other packages to also be included. If this is not the case, then those subsidiary files in "gpu" and "opt" will not be installed either. To install all the -files in package "gpu", the "asphere" package must also be installed. -To install all the files in package "opt", the "kspace" and "manybody" -packages must also be installed. +files in package "gpu", the "asphere" and "kspace" packages must also be +installed. To install all the files in package "opt", the "kspace" and +"manybody" packages must also be installed.

You may wish to exclude certain packages if you will never run certain kinds of simulations. This will keep you from having to build @@ -909,53 +909,141 @@ certain NVIDIA CUDA software on your system:

  • Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
  • Go to http://www.nvidia.com/object/cuda_get.html -
  • Install a driver and toolkit appopriate for your system (SDK is not necessary) -
  • Run make in lammps/lib/gpu, editing a Makefile if necessary +
  • Install a driver and toolkit appropriate for your system (SDK is not necessary) +
  • Follow the instructions in README in lammps/lib/gpu to build the library.
  • Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties
-

GPU hardware +

GPU configuration

When using GPUs, you are restricted to one physical GPU per LAMMPS -process. This can be multiple GPUs on a single node or across -multiple nodes. For each GPU pair style, the first two arguments (GPU -mode followed by GPU ID) control how GPUs are selected. If you are -running on a single node, the mode is "one/node" and the parameter is -the ID of the first GPU to select: +process. Multiple processes can share a single GPU and in many cases it +will be more efficient to run with multiple processes per GPU. Any GPU +accelerated style requires that fix gpu be used in the +input script to select and initialize the GPUs. The format for the fix +is:

-
pair_style lj/cut/gpu one/node 0 2.5 
+
fix name all gpu mode first last split 
 
-

The ID is the GPU ID reported by the driver for CUDA enabled graphics -cards. For multiple GPU cards on a node, an MPI process should be run -for each graphics card. In this case, each process will grab the GPU -with ID equal to the process rank plus the GPU parameter. +

where name is the name for the fix. The gpu fix must be the first +fix specified for a given run; otherwise, the program will exit +with an error. The gpu fix will not have any effect on runs +that do not use GPU acceleration; there should be no problem +with specifying the fix first in any input script.
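For example, a minimal ordering sketch (the fix IDs and the follow-on nve integrator are illustrative placeholders) declares the gpu fix before any other fix:

fix 0 all gpu force/neigh 0 0 1.0
fix 1 all nve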

-

For multiple nodes with one GPU per node, the mode is "one/gpu" and -the parameter is the ID of the GPU used on every node: +

mode can be either "force" or "force/neigh". In the former, +neighbor list calculation is performed on the CPU using the +standard LAMMPS routines. In the latter, the neighbor list +calculation is performed on the GPU. The GPU neighbor list +can be used for better performance; however, it +should not be used with a triclinic box.
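As a sketch of the two modes (fix ID and group are placeholders), only the mode argument differs:

fix 0 all gpu force 0 0 1.0        # neighbor lists built with the standard CPU routines
fix 0 all gpu force/neigh 0 0 1.0  # neighbor lists built on the GPU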

-
pair_style lj/cut/gpu one/gpu 1 2.5 
+

There are cases when it might be more efficient to select the CPU for neighbor +list builds. If a non-GPU enabled style requires a neighbor list, it will also +be built using CPU routines. Redundant CPU and GPU neighbor list calculations +will typically be less efficient. For hybrid pair +styles, GPU calculated neighbor lists might be less efficient because +no particles will be skipped in a given neighbor list. +

+

first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) +of the first GPU that will be used on each node. last is the +ID of the last GPU that will be used on each node. If you have +only one GPU per node, first and last will typically both be +0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) +is not currently supported. +

+

split is the fraction of particles whose forces, torques, +energies, and/or virials will be calculated on the GPU. This +can be used to perform CPU and GPU force calculations +simultaneously. If split is negative, the software will +attempt to calculate the optimal fraction automatically +every 25 timesteps based on CPU and GPU timings. Because the GPU speedups +are dependent on the number of particles, automatic calculation of the +split can be less efficient, but typically results in loop times +within 20% of an optimal fixed split. +
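For example (the values are illustrative, matching the forms shown in the fix gpu examples), a fixed split versus an automatic split on a single GPU per node:

fix 0 all gpu force/neigh 0 0 0.75   # ~75% of the particles are handled on the GPU
fix 0 all gpu force/neigh 0 0 -1.0   # split chosen automatically from CPU/GPU timings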

+

If you have two GPUs per node, 8 CPU cores per node, and +would like to run on 4 nodes with dynamic balancing of +force calculation across CPU and GPU cores, the fix +might be +

+
fix 0 all gpu force/neigh 0 1 -1 
 
-

In this case, MPI should be run with exactly one process per node. +

with LAMMPS run on 32 processes. In this case, all +CPU cores and GPU devices on the nodes would be utilized. +Each GPU device would be shared by 4 CPU cores. The +CPU cores would perform force calculations for some +fraction of the particles at the same time the GPUs +performed force calculation for the other particles.
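A possible launch line for this case is sketched below; it assumes Open MPI syntax, an executable named lmp_linux, and an input file named in.script, and the flag that sets processes per node differs between MPI implementations:

mpirun -np 32 -npernode 8 lmp_linux -in in.script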

-

For multiple nodes with multiple GPUs, the mode is "multi/gpu" and the -parameter is the number of GPUs per node: +

Because of the large number of cores on each GPU +device, it might be more efficient to run on fewer +processes per GPU when the number of particles per process +is small (100's of particles); this can be necessary +to keep the GPU cores busy.

-
pair_style lj/cut/gpu multi/gpu 3 2.5 
-
-

In this case, LAMMPS will attempt to grab 3 GPUs per node and this -requires that the number of processes per node be 3. The first GPU -selected must have ID zero for this mode (in the example, GPUs 0, 1, -and 2 will be selected on every node). An additional constraint is -that the MPI processes must be filled by slot on each node such that -the process ranks on each node are always sequential. This is a option -for the MPI launcher (mpirun/mpiexec) and will be the default on many -clusters. +

GPU input script +

+

To use GPU acceleration in LAMMPS, +fix_gpu +should be used to initialize and configure the +GPUs. Additionally, GPU-enabled styles must be +selected in the input script. Currently, +this is limited to a few pair styles. +Some GPU-enabled styles have additional restrictions +listed in their documentation. +
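A minimal input-script fragment illustrating both requirements is sketched below; the fix ID, cutoff, and coefficients are placeholders, and the rest of the system setup is assumed to exist elsewhere in the script:

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      lj/cut/gpu 2.5
pair_coeff      * * 1.0 1.0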

+

GPU asynchronous pair computation +

+

The GPU accelerated pair styles can be used to perform +pair style force calculation on the GPU while other +calculations are +performed on the CPU. One method to do this is to specify +a split in the gpu fix as described above. In this case, +force calculation for the pair style will also be performed +on the CPU. +

+

When the CPU work in a GPU pair style has finished, +the next force computation will begin, possibly before the +GPU has finished. If split is 1.0 in the gpu fix, the next +force computation will begin almost immediately. This can +be used to run a hybrid GPU pair style at +the same time as a hybrid CPU pair style. In this case, the +GPU pair style should be first in the hybrid command in order to +perform simultaneous calculations. This also +allows bond, angle, +dihedral, improper, +and long-range force +computations to be run simultaneously with the GPU pair style. +Once all CPU force computations have completed, the gpu fix +will block until the GPU has finished all work before continuing +the run. +
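As a hedged illustration of that ordering (the two atom types and coefficients are invented), the GPU sub-style is listed first in the hybrid command so its force work can overlap the CPU sub-style:

pair_style      hybrid lj/cut/gpu 2.5 lj/cut 2.5
pair_coeff      1 1 lj/cut/gpu 1.0 1.0
pair_coeff      2 2 lj/cut 1.0 1.0
pair_coeff      1 2 lj/cut 1.0 1.0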

+

GPU timing +

+

GPU accelerated pair styles can perform computations asynchronously +with CPU computations. The "Pair" time reported by LAMMPS +will be the maximum of the time required to complete the CPU +pair style computations and the time required to complete the GPU +pair style computations. Any time spent for GPU-enabled pair styles +for computations that run simultaneously with bond, +angle, dihedral, +improper, and long-range calculations +will not be included in the "Pair" time. +

+

When mode for the gpu fix is force/neigh, +the time for neighbor list calculations on the GPU will be added +into the "Pair" time, not the "Neigh" time. A breakdown of the +times required for various tasks on the GPU (data copy, neighbor +calculations, force computations, etc.) is output only +with the LAMMPS screen output at the end of each run. These timings represent +total time spent on the GPU for each routine, regardless of asynchronous +CPU calculations.

GPU single vs double precision

See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single vs double precision. The latter -requires that your GPU card supports double precision. The lj/cut/gpu -pair style does not support double precision. +the LAMMPS gpu library for single, mixed, and double precision. The latter +requires that your GPU card supports double precision.


diff --git a/doc/Section_start.txt b/doc/Section_start.txt index 3fc685cd25..ae747ded66 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -396,9 +396,9 @@ The two exceptions to this are the "gpu" and "opt" packages. Some of the files in these packages require other packages to also be included. If this is not the case, then those subsidiary files in "gpu" and "opt" will not be installed either. To install all the -files in package "gpu", the "asphere" package must also be installed. -To install all the files in package "opt", the "kspace" and "manybody" -packages must also be installed. +files in package "gpu", the "asphere" and "kspace" packages must also be +installed. To install all the files in package "opt", the "kspace" and +"manybody" packages must also be installed. You may wish to exclude certain packages if you will never run certain kinds of simulations. This will keep you from having to build @@ -899,53 +899,141 @@ certain NVIDIA CUDA software on your system: Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go to http://www.nvidia.com/object/cuda_get.html -Install a driver and toolkit appopriate for your system (SDK is not necessary) -Run make in lammps/lib/gpu, editing a Makefile if necessary +Install a driver and toolkit appropriate for your system (SDK is not necessary) +Follow the instructions in README in lammps/lib/gpu to build the library. Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul -GPU hardware :h4 +GPU configuration :h4 When using GPUs, you are restricted to one physical GPU per LAMMPS -process. This can be multiple GPUs on a single node or across -multiple nodes. For each GPU pair style, the first two arguments (GPU -mode followed by GPU ID) control how GPUs are selected. If you are -running on a single node, the mode is "one/node" and the parameter is -the ID of the first GPU to select: +process. Multiple processes can share a single GPU and in many cases it +will be more efficient to run with multiple processes per GPU. Any GPU +accelerated style requires that "fix gpu"_fix_gpu.html be used in the +input script to select and initialize the GPUs. The format for the fix +is: -pair_style lj/cut/gpu one/node 0 2.5 :pre +fix {name} all gpu {mode} {first} {last} {split} :pre -The ID is the GPU ID reported by the driver for CUDA enabled graphics -cards. For multiple GPU cards on a node, an MPI process should be run -for each graphics card. In this case, each process will grab the GPU -with ID equal to the process rank plus the GPU parameter. +where {name} is the name for the fix. The gpu fix must be the first +fix specified for a given run, otherwise the program will exit +with an error. The gpu fix will not have any effect on runs +that do not use GPU acceleration; there should be no problem +with specifying the fix first in any input script. -For multiple nodes with one GPU per node, the mode is "one/gpu" and -the parameter is the ID of the GPU used on every node: +{mode} can be either "force" or "force/neigh". In the former, +neighbor list calculation is performed on the CPU using the +standard LAMMPS routines. In the latter, the neighbor list +calculation is performed on the GPU. The GPU neighbor list +can be used for better performance, however, it +should not be used with a triclinic box. -pair_style lj/cut/gpu one/gpu 1 2.5 :pre +There are cases when it might be more efficient to select the CPU for neighbor +list builds. 
If a non-GPU enabled style requires a neighbor list, it will also +be built using CPU routines. Redundant CPU and GPU neighbor list calculations +will typically be less efficient. For "hybrid"_pair_hybrid.html pair +styles, GPU calculated neighbor lists might be less efficient because +no particles will be skipped in a given neighbor list. -In this case, MPI should be run with exactly one process per node. +{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) +of the first GPU that will be used on each node. {last} is the +ID of the last GPU that will be used on each node. If you have +only one GPU per node, {first} and {last} will typically both be +0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) +is not currently supported. -For multiple nodes with multiple GPUs, the mode is "multi/gpu" and the -parameter is the number of GPUs per node: +{split} is the fraction of particles whose forces, torques, +energies, and/or virials will be calculated on the GPU. This +can be used to perform CPU and GPU force calculations +simultaneously. If {split} is negative, the software will +attempt to calculate the optimal fraction automatically +every 25 timesteps based on CPU and GPU timings. Because the GPU speedups +are dependent on the number of particles, automatic calculation of the +split can be less efficient, but typically results in loop times +within 20% of an optimal fixed split. -pair_style lj/cut/gpu multi/gpu 3 2.5 :pre +If you have two GPUs per node, 8 CPU cores per node, and +would like to run on 4 nodes with dynamic balancing of +force calculation across CPU and GPU cores, the fix +might be -In this case, LAMMPS will attempt to grab 3 GPUs per node and this -requires that the number of processes per node be 3. The first GPU -selected must have ID zero for this mode (in the example, GPUs 0, 1, -and 2 will be selected on every node). An additional constraint is -that the MPI processes must be filled by slot on each node such that -the process ranks on each node are always sequential. This is a option -for the MPI launcher (mpirun/mpiexec) and will be the default on many -clusters. +fix 0 all gpu force/neigh 0 1 -1 :pre + +with LAMMPS run on 32 processes. In this case, all +CPU cores and GPU devices on the nodes would be utilized. +Each GPU device would be shared by 4 CPU cores. The +CPU cores would perform force calculations for some +fraction of the particles at the same time the GPUs +performed force calculation for the other particles. + +Because of the large number of cores on each GPU +device, it might be more efficient to run on fewer +processes per GPU when the number of particles per process +is small (100's of particles); this can be necessary +to keep the GPU cores busy. + +GPU input script :h4 + +In order to use GPU acceleration in LAMMPS, +"fix_gpu"_fix_gpu.html +should be used in order to initialize and configure the +GPUs for use. Additionally, GPU enabled styles must be +selected in the input script. Currently, +this is limited to a few "pair styles"_pair_style.html. +Some GPU-enabled styles have additional restrictions +listed in their documentation. + +GPU asynchronous pair computation :h4 + +The GPU accelerated pair styles can be used to perform +pair style force calculation on the GPU while other +calculations are +performed on the CPU. One method to do this is to specify +a {split} in the gpu fix as described above. In this case, +force calculation for the pair style will also be performed +on the CPU. 
+ +When the CPU work in a GPU pair style has finished, +the next force computation will begin, possibly before the +GPU has finished. If {split} is 1.0 in the gpu fix, the next +force computation will begin almost immediately. This can +be used to run a "hybrid"_pair_hybrid.html GPU pair style at +the same time as a hybrid CPU pair style. In this case, the +GPU pair style should be first in the hybrid command in order to +perform simultaneous calculations. This also +allows "bond"_bond_style.html, "angle"_angle_style.html, +"dihedral"_dihedral_style.html, "improper"_improper_style.html, +and "long-range"_kspace_style.html force +computations to be run simultaneously with the GPU pair style. +Once all CPU force computations have completed, the gpu fix +will block until the GPU has finished all work before continuing +the run. + +GPU timing :h4 + +GPU accelerated pair styles can perform computations asynchronously +with CPU computations. The "Pair" time reported by LAMMPS +will be the maximum of the time required to complete the CPU +pair style computations and the time required to complete the GPU +pair style computations. Any time spent for GPU-enabled pair styles +for computations that run simultaneously with "bond"_bond_style.html, +"angle"_angle_style.html, "dihedral"_dihedral_style.html, +"improper"_improper_style.html, and "long-range"_kspace_style.html calculations +will not be included in the "Pair" time. + +When {mode} for the gpu fix is force/neigh, +the time for neighbor list calculations on the GPU will be added +into the "Pair" time, not the "Neigh" time. A breakdown of the +times required for various tasks on the GPU (data copy, neighbor +calculations, force computations, etc.) are output only +with the LAMMPS screen output at the end of each run. These timings represent +total time spent on the GPU for each routine, regardless of asynchronous +CPU calculations. GPU single vs double precision :h4 See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single vs double precision. The latter -requires that your GPU card supports double precision. The lj/cut/gpu -pair style does not support double precision. +the LAMMPS gpu library for single, mixed, and double precision. The latter +requires that your GPU card supports double precision. :line diff --git a/doc/fix_gpu.html b/doc/fix_gpu.html new file mode 100644 index 0000000000..72839bc0d1 --- /dev/null +++ b/doc/fix_gpu.html @@ -0,0 +1,107 @@ + +
LAMMPS WWW Site - LAMMPS Documentation - LAMMPS Commands +
+ + + + + + +
+ +

fix gpu command +

+

Syntax: +

+
fix ID group-ID gpu mode first last split 
+
+
  • ID, group-ID are documented in fix command + +
  • gpu = style name of this fix command + +
  • mode = force or force/neigh + +
  • first = ID of first GPU to be used on each node + +
  • last = ID of last GPU to be used on each node + +
  • split = fraction of particles assigned to the GPU + + +
+

Examples: +

+
fix 0 all gpu force 0 0 1.0
+fix 0 all gpu force 0 0 0.75
+fix 0 all gpu force/neigh 0 0 1.0
+fix 0 all gpu force/neigh 0 1 -1.0 
+
+

Description: +

+

Select and initialize GPUs to be used for acceleration and configure +GPU acceleration in LAMMPS. This fix is required in order to use +any style with GPU acceleration. The fix must be the first fix +specified for a run or an error will be generated. The fix will not have an +effect on any LAMMPS computations that do not use GPU acceleration, so there +should not be any problems with specifying this fix first in input scripts. +

+

mode specifies where neighbor list calculations will be performed. +If mode is force, neighbor list calculation is performed on the +CPU. If mode is force/neigh, neighbor list calculation is +performed on the GPU. GPU neighbor +list calculation currently cannot be used with a triclinic box. +GPU neighbor lists are not compatible with styles that are not GPU-enabled. +When a non-GPU enabled style requires a neighbor list, it will also be +built using CPU routines. In these cases, it will typically be more efficient +to only use CPU neighbor list builds. For hybrid pair +styles, GPU calculated neighbor lists might be less efficient because +no particles will be skipped in a given neighbor list. +

+

first and last specify the GPUs that will be used for simulation. +On each node, the GPU IDs in the inclusive range from first to last will +be used. +

+

split can be used for load balancing force calculation work between +CPU and GPU cores in GPU-enabled pair styles. If 0<split<1.0, +a fixed fraction of particles is offloaded to the GPU while force calculation +for the other particles occurs simultaneously on the CPU. If split<0, +the optimal fraction (based on CPU and GPU timings) is calculated +every 25 timesteps. If split=1.0, all force calculations for +GPU-accelerated pair styles are performed +on the GPU. In this case, hybrid, +bond, angle, +dihedral, improper, +and long-range calculations can be performed on the CPU +while the GPU is performing force calculations for the GPU-enabled pair +style. +

+

In order to use GPU acceleration, a GPU enabled style must be +selected in the input script in addition to this fix. Currently, +this is limited to a few pair styles. +

+

More details about these settings and various possible hardware +configurations are in this section of the +manual. +

+

Restart, fix_modify, output, run start/stop, minimize info: +

+

No information about this fix is written to binary restart +files. None of the fix_modify options +are relevant to this fix. +

+

No parameter of this fix can be used with the start/stop keywords of +the run command. +

+

Restrictions: +

+

The fix must be the first fix specified for a given run. The force/neigh +mode should not be used with a triclinic box or GPU-enabled pair styles +that need special_bonds settings. +

+

Currently, group-ID must be all. +

+

Related commands: none +

+

Default: none +

+ diff --git a/doc/fix_gpu.txt b/doc/fix_gpu.txt new file mode 100644 index 0000000000..88fa6f5414 --- /dev/null +++ b/doc/fix_gpu.txt @@ -0,0 +1,97 @@ +"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c + +:link(lws,http://lammps.sandia.gov) +:link(ld,Manual.html) +:link(lc,Section_commands.html#comm) + +:line + +fix gpu command :h3 + +[Syntax:] + +fix ID group-ID gpu mode first last split :pre + +ID, group-ID are documented in "fix"_fix.html command :ulb,l +gpu = style name of this fix command :l +mode = force or force/neigh :l +first = ID of first GPU to be used on each node :l +last = ID of last GPU to be used on each node :l +split = fraction of particles assigned to the GPU :l +:ule + +[Examples:] + +fix 0 all gpu force 0 0 1.0 +fix 0 all gpu force 0 0 0.75 +fix 0 all gpu force/neigh 0 0 1.0 +fix 0 all gpu force/neigh 0 1 -1.0 :pre + +[Description:] + +Select and initialize GPUs to be used for acceleration and configure +GPU acceleration in LAMMPS. This fix is required in order to use +any style with GPU acceleration. The fix must be the first fix +specified for a run or an error will be generated. The fix will not have an +effect on any LAMMPS computations that do not use GPU acceleration, so there +should not be any problems with specifying this fix first in input scripts. + +{mode} specifies where neighbor list calculations will be performed. +If {mode} is force, neighbor list calculation is performed on the +CPU. If {mode} is force/neigh, neighbor list calculation is +performed on the GPU. GPU neighbor +list calculation currently cannot be used with a triclinic box. +GPU neighbor lists are not compatible with styles that are not GPU-enabled. +When a non-GPU enabled style requires a neighbor list, it will also be +built using CPU routines. In these cases, it will typically be more efficient +to only use CPU neighbor list builds. For "hybrid"_pair_hybrid.html pair +styles, GPU calculated neighbor lists might be less efficient because +no particles will be skipped in a given neighbor list. + +{first} and {last} specify the GPUs that will be used for simulation. +On each node, the GPU IDs in the inclusive range from {first} to {last} will +be used. + +{split} can be used for load balancing force calculation work between +CPU and GPU cores in GPU-enabled pair styles. If 0<{split}<1.0, +a fixed fraction of particles is offloaded to the GPU while force calculation +for the other particles occurs simulataneously on the CPU. If {split}<0, +the optimal fraction (based on CPU and GPU timings) is calculated +every 25 timesteps. If {split}=1.0, all force calculations for +GPU accelerated pair styles are performed +on the GPU. In this case, "hybrid"_pair_hybrid.html, +"bond"_bond_style.html, "angle"_angle_style.html, +"dihedral"_dihedral_style.html, "improper"_improper_style.html, +and "long-range"_kspace_style.html calculations can be performed on the CPU +while the GPU is performing force calculations for the GPU-enabled pair +style. + +In order to use GPU acceleration, a GPU enabled style must be +selected in the input script in addition to this fix. Currently, +this is limited to a few "pair styles"_pair_style.html. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +[Restart, fix_modify, output, run start/stop, minimize info:] + +No information about this fix is written to "binary restart +files"_restart.html. 
None of the "fix_modify"_fix_modify.html options +are relevant to this fix. + +No parameter of this fix can be used with the {start/stop} keywords of +the "run"_run.html command. + +[Restrictions:] + +The fix must be the first fix specified for a given run. The force/neigh +{mode} should not be used with a triclinic box or GPU-enabled pair styles +that need "special_bonds"_special_bonds.html settings. + +Currently, group-ID must be all. + +[Related commands:] none + +[Default:] none + diff --git a/doc/pair_cmm.html b/doc/pair_cmm.html index f6033ded25..5f43b6ef63 100644 --- a/doc/pair_cmm.html +++ b/doc/pair_cmm.html @@ -11,19 +11,25 @@

pair_style cg/cmm command

+

pair_style cg/cmm/gpu command +

pair_style cg/cmm/coul/cut command

pair_style cg/cmm/coul/long command

+

pair_style cg/cmm/coul/long/gpu command +

Syntax:

pair_style style args 
 
-
  • style = cg/cmm or cg/cmm/coul/cut or cg/cmm/coul/long +
    • style = cg/cmm or cg/cmm/gpu or cg/cmm/coul/cut or cg/cmm/coul/long or cg/cmm/coul/long/gpu
    • args = list of arguments for a particular style
      cg/cmm args = cutoff
         cutoff = global cutoff for Lennard Jones interactions (distance units)
    +  cg/cmm/gpu args = cutoff
    +    cutoff = global cutoff for Lennard Jones interactions (distance units)
       cg/cmm/coul/cut args = cutoff (cutoff2) (kappa)
         cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
         cutoff2 = global cutoff for Coulombic (optional) (distance units)
    @@ -32,6 +38,10 @@
         cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
         cutoff2 = global cutoff for Coulombic (optional) (distance units) 
     
    +
      cg/cmm/coul/long/gpu args = cutoff (cutoff2)
    +    cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
    +    cutoff2 = global cutoff for Coulombic (optional) (distance units) 
    +

    Examples:

    pair_style cg/cmm 2.5
    @@ -55,6 +65,9 @@ given by
     

    as required for the CMM Coarse-grained MD parametrization discussed in (Shinoda) and (DeVane). Rc is the cutoff.

    +

    Style cg/cmm/gpu is a GPU-enabled version of style cg/cmm. +See more details below. +

    Style cg/cmm/coul/cut adds a Coulombic pairwise interaction given by

    @@ -83,6 +96,9 @@ option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space.

    +

    Style cg/cmm/coul/long/gpu is a GPU-enabled version of style cg/cmm/coul/long. +See more details below. +

    The following coefficients must be defined for each pair of atoms types via the pair_coeff command as in the examples above, or in the data file or restart files read by the @@ -113,6 +129,27 @@ pair_style command.


    +

    The cg/cmm/gpu and cg/cmm/coul/long/gpu styles +are identical to the cg/cmm and cg/cmm/coul/long +styles, except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

    +

More details about these settings and various possible hardware +configurations are in this section of the +manual. +

    +

    Additional requirements in your input script to run with GPU-enabled styles +are as follows: +

    +

    The newton pair setting must be off and +fix gpu must be used. The fix controls the +essential GPU selection and initialization steps. +
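A short fragment combining these requirements with the new style is sketched below (the fix ID is a placeholder, the cutoff follows the example above, and the pair_coeff settings for the CG system are assumed to be given elsewhere):

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      cg/cmm/gpu 2.5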

    +
    +

    Mixing, shift, table, tail correction, restart, and rRESPA info:

    For atom type pairs I,J and I != J, the epsilon and sigma coefficients diff --git a/doc/pair_cmm.txt b/doc/pair_cmm.txt index 7ebd3141fc..ec5884c9a5 100644 --- a/doc/pair_cmm.txt +++ b/doc/pair_cmm.txt @@ -7,17 +7,21 @@ :line pair_style cg/cmm command :h3 +pair_style cg/cmm/gpu command :h3 pair_style cg/cmm/coul/cut command :h3 pair_style cg/cmm/coul/long command :h3 +pair_style cg/cmm/coul/long/gpu command :h3 [Syntax:] pair_style style args :pre -style = {cg/cmm} or {cg/cmm/coul/cut} or {cg/cmm/coul/long} +style = {cg/cmm} or {cg/cmm/gpu} or {cg/cmm/coul/cut} or {cg/cmm/coul/long} or {cg/cmm/coul/long/gpu} args = list of arguments for a particular style :ul {cg/cmm} args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) + {cg/cmm/gpu} args = cutoff + cutoff = global cutoff for Lennard Jones interactions (distance units) {cg/cmm/coul/cut} args = cutoff (cutoff2) (kappa) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) @@ -25,6 +29,9 @@ args = list of arguments for a particular style :ul {cg/cmm/coul/long} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) :pre + {cg/cmm/coul/long/gpu} args = cutoff (cutoff2) + cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) + cutoff2 = global cutoff for Coulombic (optional) (distance units) :pre [Examples:] @@ -49,6 +56,9 @@ given by as required for the CMM Coarse-grained MD parametrization discussed in "(Shinoda)"_#Shinoda and "(DeVane)"_#DeVane. Rc is the cutoff. +Style {cg/cmm/gpu} is a GPU-enabled version of style {cg/cmm}. +See more details below. + Style {cg/cmm/coul/cut} adds a Coulombic pairwise interaction given by :c,image(Eqs/pair_coulomb.jpg) @@ -77,6 +87,9 @@ option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space. +Style {cg/cmm/coul/long/gpu} is a GPU-enabled version of style {cg/cmm/coul/long}. +See more details below. + The following coefficients must be defined for each pair of atoms types via the "pair_coeff"_pair_coeff.html command as in the examples above, or in the data file or restart files read by the @@ -107,6 +120,27 @@ pair_style command. :line +The {cg/cmm/gpu} and {cg/cmm/coul/long/gpu} styles +are identical to the {cg/cmm} and {cg/cmm/coul/long} +styles, except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled styles +are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls the +essential GPU selection and initialization steps. 
+ +:line + [Mixing, shift, table, tail correction, restart, and rRESPA info]: For atom type pairs I,J and I != J, the epsilon and sigma coefficients diff --git a/doc/pair_gayberne.html b/doc/pair_gayberne.html index a9cbd492d1..a7533a42ca 100644 --- a/doc/pair_gayberne.html +++ b/doc/pair_gayberne.html @@ -17,11 +17,9 @@

    pair_style gayberne gamma upsilon mu cutoff 
     
    -
    pair_style gayberne/gpu gpuflag gpunum gamma upsilon mu cutoff 
    +
    pair_style gayberne/gpu gamma upsilon mu cutoff 
     
    • style = gayberne or gayberne/gpu -
    • gpumode = one/node or one/gpu or multi/gpu, only used with gayberne/gpu -
    • gpuID = ID or number of GPUs, only used with gayberne/gpu
    • gamma = shift for potential minimum (typically 1)
    • upsilon = exponent for eta orientation-dependent energy function
    • mu = exponent for chi orientation-dependent energy function @@ -30,7 +28,7 @@

      Examples:

      pair_style gayberne 1.0 1.0 1.0 10.0
      -pair_style gayberne/gpu one/node 0 1.0 1.0 1.0 10.0
      +pair_style gayberne/gpu 1.0 1.0 1.0 10.0
       pair_coeff * * 1.0 1.7 1.7 3.4 3.4 1.0 1.0 1.0 
       

      Description: @@ -50,10 +48,8 @@ both particles are spherical, the formula reduces to the usual Lennard-Jones interaction (see details below for when Gay-Berne treats a particle as "spherical").

      -

      Style gayberne/gpu is a GPU-enabled version of style gayberne that -should give identical answers. Depending on system size and the GPU -processor you have on your system, it may be 100x faster (for the -pairwise portion of the run time). See more details below. +

      Style gayberne/gpu is a GPU-enabled version of style gayberne. +See more details below.

      For large uniform molecules it has been shown that the energy parameters are approximately representable in terms of local contact @@ -141,27 +137,11 @@ to specify its interaction with other spherical particles.

      The gayberne/gpu style is identical to the gayberne style, except that each processor off-loads its pairwise calculations to a GPU chip. Depending on the hardware available on your system this can provide a -significant speed-up, espcially for the relatively expensive +significant speed-up, especially for the relatively expensive computations inherent in Gay-Berne interactions. See the Running on GPUs section of the manual for more details about hardware and software requirements for using GPUs.

      -

      The gpumode and gpuID settings in the pair_style command refer to -how the GPUs on your system are configured. -

      -

      Set gpumode to one/node if you have a single compute "node" on -your system, which may have multiple cores and/or GPUs. GpuID -should be set to the ID of the (first) GPU you wish to use with LAMMPS -(another GPU might be driving your display). -

      -

      Set gpumode to one/gpu if you have multiple compute "nodes" on -your system, with one GPU per node. GpuID should be set to the ID -of the GPU. -

      -

      Set gpumode to multi/gpu if you have multiple compute "nodes" on -your system, each with multiple GPUs. GpuID should be set to the -number of GPUs per node. -

      More details about these settings and various possible hardware configuration are in this section of the manual. @@ -169,7 +149,9 @@ manual.

      Additional requirements in your input script to run with style gayberne/gpu are as follows:

      -

      The newton pair setting must be off. +

      The newton pair setting must be off and +fix gpu must be used. The fix controls the +essential GPU selection and initialization steps.
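Putting the pieces together for this style (the fix ID is a placeholder; the pair_style and pair_coeff values follow the examples above):

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      gayberne/gpu 1.0 1.0 1.0 10.0
pair_coeff      * * 1.0 1.7 1.7 3.4 3.4 1.0 1.0 1.0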


      diff --git a/doc/pair_gayberne.txt b/doc/pair_gayberne.txt index 71618b616b..d2eedd4dc2 100755 --- a/doc/pair_gayberne.txt +++ b/doc/pair_gayberne.txt @@ -12,11 +12,9 @@ pair_style gayberne/gpu command :h3 [Syntax:] pair_style gayberne gamma upsilon mu cutoff :pre -pair_style gayberne/gpu gpuflag gpunum gamma upsilon mu cutoff :pre +pair_style gayberne/gpu gamma upsilon mu cutoff :pre style = {gayberne} or {gayberne/gpu} -gpumode = {one/node} or {one/gpu} or {multi/gpu}, only used with gayberne/gpu -gpuID = ID or number of GPUs, only used with gayberne/gpu gamma = shift for potential minimum (typically 1) upsilon = exponent for eta orientation-dependent energy function mu = exponent for chi orientation-dependent energy function @@ -25,7 +23,7 @@ cutoff = global cutoff for interactions (distance units) :ul [Examples:] pair_style gayberne 1.0 1.0 1.0 10.0 -pair_style gayberne/gpu one/node 0 1.0 1.0 1.0 10.0 +pair_style gayberne/gpu 1.0 1.0 1.0 10.0 pair_coeff * * 1.0 1.7 1.7 3.4 3.4 1.0 1.0 1.0 :pre [Description:] @@ -45,10 +43,8 @@ both particles are spherical, the formula reduces to the usual Lennard-Jones interaction (see details below for when Gay-Berne treats a particle as "spherical"). -Style {gayberne/gpu} is a GPU-enabled version of style {gayberne} that -should give identical answers. Depending on system size and the GPU -processor you have on your system, it may be 100x faster (for the -pairwise portion of the run time). See more details below. +Style {gayberne/gpu} is a GPU-enabled version of style {gayberne}. +See more details below. For large uniform molecules it has been shown that the energy parameters are approximately representable in terms of local contact @@ -136,27 +132,11 @@ to specify its interaction with other spherical particles. The {gayberne/gpu} style is identical to the {gayberne} style, except that each processor off-loads its pairwise calculations to a GPU chip. Depending on the hardware available on your system this can provide a -significant speed-up, espcially for the relatively expensive +significant speed-up, especially for the relatively expensive computations inherent in Gay-Berne interactions. See the "Running on GPUs"_Section_start.html#2_8 section of the manual for more details about hardware and software requirements for using GPUs. -The {gpumode} and {gpuID} settings in the pair_style command refer to -how the GPUs on your system are configured. - -Set {gpumode} to {one/node} if you have a single compute "node" on -your system, which may have multiple cores and/or GPUs. {GpuID} -should be set to the ID of the (first) GPU you wish to use with LAMMPS -(another GPU might be driving your display). - -Set {gpumode} to {one/gpu} if you have multiple compute "nodes" on -your system, with one GPU per node. {GpuID} should be set to the ID -of the GPU. - -Set {gpumode} to {multi/gpu} if you have multiple compute "nodes" on -your system, each with multiple GPUs. {GpuID} should be set to the -number of GPUs per node. - More details about these settings and various possible hardware configuration are in "this section"_Section_start.html#2_8 of the manual. @@ -164,7 +144,9 @@ manual. Additional requirements in your input script to run with style {gayberne/gpu} are as follows: -The "newton pair"_newton.html setting must be {off}. +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls the +essential GPU selection and initialization steps. 
:line diff --git a/doc/pair_lj.html b/doc/pair_lj.html index b09d383003..c70f96baff 100644 --- a/doc/pair_lj.html +++ b/doc/pair_lj.html @@ -17,30 +17,35 @@

      pair_style lj/cut/coul/cut command

      +

      pair_style lj/cut/coul/cut/gpu command +

      pair_style lj/cut/coul/debye command

      pair_style lj/cut/coul/long command

      +

      pair_style lj/cut/coul/long/gpu command +

      pair_style lj/cut/coul/long/tip4p command

      Syntax:

      pair_style style args 
       
      -
      • style = lj/cut or lj/cut/gpu or lj/cut/opt or lj/cut/coul/cut or lj/cut/coul/debye or lj/cut/coul/long or lj/cut/coul/long/tip4p +
        • style = lj/cut or lj/cut/gpu or lj/cut/opt or lj/cut/coul/cut or lj/cut/coul/debye or lj/cut/coul/long or lj/cut/coul/long/tip4p
        • args = list of arguments for a particular style
          lj/cut args = cutoff
             cutoff = global cutoff for Lennard Jones interactions (distance units)
        -  lj/cut/gpu args = gpumode gpuID cutoff
        -    gpumode = one/node or one/gpu or multi/gpu
        -    gpuID = ID or number of GPUs
        +  lj/cut/gpu args = cutoff
             cutoff = global cutoff for Lennard Jones interactions (distance units)
           lj/cut/opt args = cutoff
             cutoff = global cutoff for Lennard Jones interactions (distance units)
           lj/cut/coul/cut args = cutoff (cutoff2)
             cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
             cutoff2 = global cutoff for Coulombic (optional) (distance units)
        +  lj/cut/coul/cut/gpu args = cutoff (cutoff2)
        +    cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
        +    cutoff2 = global cutoff for Coulombic (optional) (distance units)
           lj/cut/coul/debye args = kappa cutoff (cutoff2)
             kappa = Debye length (inverse distance units)
             cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
        @@ -48,6 +53,9 @@
           lj/cut/coul/long args = cutoff (cutoff2)
             cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
             cutoff2 = global cutoff for Coulombic (optional) (distance units)
        +  lj/cut/coul/long/gpu args = cutoff (cutoff2)
        +    cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units)
        +    cutoff2 = global cutoff for Coulombic (optional) (distance units)
           lj/cut/coul/long/tip4p args = otype htype btype atype qdist cutoff (cutoff2)
             otype,htype = atom types for TIP4P O and H
             btype,atype = bond and angle types for TIP4P waters
        @@ -58,12 +66,13 @@
         

        Examples:

        pair_style lj/cut 2.5
        -pair_style lj/cut/gpu one/node 0 2.5
        +pair_style lj/cut/gpu 2.5
         pair_style lj/cut/opt 2.5
         pair_coeff * * 1 1
         pair_coeff 1 1 1 1.1 2.8 
         
        pair_style lj/cut/coul/cut 10.0
        +pair_style lj/cut/coul/cut/gpu 10.0
         pair_style lj/cut/coul/cut 10.0 8.0
         pair_coeff * * 100.0 3.0
         pair_coeff 1 1 100.0 3.5 9.0
        @@ -76,6 +85,7 @@ pair_coeff 1 1 1.0 1.5 2.5
         pair_coeff 1 1 1.0 1.5 2.5 5.0 
         
        pair_style lj/cut/coul/long 10.0
        +pair_style lj/cut/coul/long/gpu 10.0
         pair_style lj/cut/coul/long 10.0 8.0
         pair_coeff * * 100.0 3.0
         pair_coeff 1 1 100.0 3.5 9.0 
        @@ -94,10 +104,8 @@ given by
         

    Rc is the cutoff.

    -

    Style lj/cut/gpu is a GPU-enabled version of style lj/cut that -should give identical answers. Depending on system size and the GPU -processor you have on your system, it may be 4x faster (for the -pairwise portion of the run time). See more details below. +

    Style lj/cut/gpu is a GPU-enabled version of style lj/cut. +See more details below.

    Style lj/cut/opt is an optimized version of style lj/cut that should give identical answers. Depending on system size and the @@ -115,6 +123,9 @@ specified in the pair_style command, it is used for both the LJ and Coulombic terms. If two cutoffs are specified, they are used as cutoffs for the LJ and Coulombic terms respectively.

    +

    Style lj/cut/coul/cut/gpu is a GPU-enabled version of style lj/cut/coul/cut. +See more details below. +

    Style lj/cut/coul/debye adds an additional exp() damping factor to the Coulombic term, given by

    @@ -131,6 +142,9 @@ option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space.

    +

    Style lj/cut/coul/long/gpu is a GPU-enabled version of style lj/cut/coul/long. +See more details below. +

    Style lj/cut/coul/long/tip4p implements the TIP4P water model of (Jorgensen), which introduces a massless site located a short distance away from the oxygen atom along the bisector of the HOH @@ -177,9 +191,10 @@ Coulombic cutoff specified in the pair_style command.


    -

    The lj/cut/gpu style is identical to the lj/cut style, except that -each processor off-loads its pairwise calculations to a GPU chip. -Depending on the hardware available on your system this can provide a +

    The lj/cut/gpu, lj/cut/coul/cut/gpu, and lj/cut/coul/long/gpu styles +are identical to the lj/cut, lj/cut/coul/cut, and lj/cut/coul/long +styles, except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a speed-up. See the Running on GPUs section of the manual for more details about hardware and software requirements for using GPUs. @@ -204,10 +219,12 @@ number of GPUs per node. configuration are in this section of the manual.

    -

    Additional requirements in your input script to run with style -lj/cut/gpu are as follows: +

    Additional requirements in your input script to run with GPU-enabled styles +are as follows:

    -

    The newton pair setting must be off. +

    The newton pair setting must be off and +fix gpu must be used. The fix controls +the essential GPU selection and initialization steps.
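A sketch combining these requirements for the long-range GPU variant (the fix ID and kspace accuracy are placeholders; the pair values follow the examples above):

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      lj/cut/coul/long/gpu 10.0
pair_coeff      * * 100.0 3.0
kspace_style    pppm 1.0e-4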


    @@ -248,7 +265,8 @@ See the run_style command for details.

    Restrictions:

    The lj/cut/coul/long and lj/cut/coul/long/tip4p styles are part of -the "kspace" package. The lj/cut/gpu style is part of the "gpu" +the "kspace" package. The lj/cut/gpu, lj/cut/coul/cut/gpu, and +lj/cut/coul/long/gpu styles are part of the "gpu" package. The lj/cut/opt style is part of the "opt" package. They are only enabled if LAMMPS was built with those packages. See the Making LAMMPS section for more info. Note diff --git a/doc/pair_lj.txt b/doc/pair_lj.txt index 3099113334..42f0c9d6cc 100644 --- a/doc/pair_lj.txt +++ b/doc/pair_lj.txt @@ -10,28 +10,31 @@ pair_style lj/cut command :h3 pair_style lj/cut/gpu command :h3 pair_style lj/cut/opt command :h3 pair_style lj/cut/coul/cut command :h3 +pair_style lj/cut/coul/cut/gpu command :h3 pair_style lj/cut/coul/debye command :h3 pair_style lj/cut/coul/long command :h3 +pair_style lj/cut/coul/long/gpu command :h3 pair_style lj/cut/coul/long/tip4p command :h3 [Syntax:] pair_style style args :pre -style = {lj/cut} or {lj/cut/gpu} or {lj/cut/opt} or {lj/cut/coul/cut} or {lj/cut/coul/debye} \ - or {lj/cut/coul/long} or {lj/cut/coul/long/tip4p} +style = {lj/cut} or {lj/cut/gpu} or {lj/cut/opt} or {lj/cut/coul/cut} \ + or {lj/cut/coul/debye} or {lj/cut/coul/long} or {lj/cut/coul/long/tip4p} args = list of arguments for a particular style :ul {lj/cut} args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) - {lj/cut/gpu} args = gpumode gpuID cutoff - gpumode = {one/node} or {one/gpu} or {multi/gpu} - gpuID = ID or number of GPUs + {lj/cut/gpu} args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) {lj/cut/opt} args = cutoff cutoff = global cutoff for Lennard Jones interactions (distance units) {lj/cut/coul/cut} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) + {lj/cut/coul/cut/gpu} args = cutoff (cutoff2) + cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) + cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/coul/debye} args = kappa cutoff (cutoff2) kappa = Debye length (inverse distance units) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) @@ -39,6 +42,9 @@ args = list of arguments for a particular style :ul {lj/cut/coul/long} args = cutoff (cutoff2) cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) cutoff2 = global cutoff for Coulombic (optional) (distance units) + {lj/cut/coul/long/gpu} args = cutoff (cutoff2) + cutoff = global cutoff for LJ (and Coulombic if only 1 arg) (distance units) + cutoff2 = global cutoff for Coulombic (optional) (distance units) {lj/cut/coul/long/tip4p} args = otype htype btype atype qdist cutoff (cutoff2) otype,htype = atom types for TIP4P O and H btype,atype = bond and angle types for TIP4P waters @@ -49,12 +55,13 @@ args = list of arguments for a particular style :ul [Examples:] pair_style lj/cut 2.5 -pair_style lj/cut/gpu one/node 0 2.5 +pair_style lj/cut/gpu 2.5 pair_style lj/cut/opt 2.5 pair_coeff * * 1 1 pair_coeff 1 1 1 1.1 2.8 :pre pair_style lj/cut/coul/cut 10.0 +pair_style lj/cut/coul/cut/gpu 10.0 pair_style lj/cut/coul/cut 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 @@ -67,6 +74,7 @@ pair_coeff 1 1 1.0 1.5 2.5 pair_coeff 1 1 1.0 1.5 2.5 5.0 :pre pair_style lj/cut/coul/long 10.0 +pair_style lj/cut/coul/long/gpu 10.0 pair_style lj/cut/coul/long 10.0 8.0 pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 
:pre @@ -85,10 +93,8 @@ given by Rc is the cutoff. -Style {lj/cut/gpu} is a GPU-enabled version of style {lj/cut} that -should give identical answers. Depending on system size and the GPU -processor you have on your system, it may be 4x faster (for the -pairwise portion of the run time). See more details below. +Style {lj/cut/gpu} is a GPU-enabled version of style {lj/cut}. +See more details below. Style {lj/cut/opt} is an optimized version of style {lj/cut} that should give identical answers. Depending on system size and the @@ -106,6 +112,9 @@ specified in the pair_style command, it is used for both the LJ and Coulombic terms. If two cutoffs are specified, they are used as cutoffs for the LJ and Coulombic terms respectively. +Style {lj/cut/coul/cut/gpu} is a GPU-enabled version of style {lj/cut/coul/cut}. +See more details below. + Style {lj/cut/coul/debye} adds an additional exp() damping factor to the Coulombic term, given by @@ -122,6 +131,9 @@ option. The Coulombic cutoff specified for this style means that pairwise interactions within this distance are computed directly; interactions outside that distance are computed in reciprocal space. +Style {lj/cut/coul/long/gpu} is a GPU-enabled version of style {lj/cut/coul/long}. +See more details below. + Style {lj/cut/coul/long/tip4p} implements the TIP4P water model of "(Jorgensen)"_#Jorgensen, which introduces a massless site located a short distance away from the oxygen atom along the bisector of the HOH @@ -168,9 +180,10 @@ Coulombic cutoff specified in the pair_style command. :line -The {lj/cut/gpu} style is identical to the {lj/cut} style, except that -each processor off-loads its pairwise calculations to a GPU chip. -Depending on the hardware available on your system this can provide a +The {lj/cut/gpu}, {lj/cut/coul/cut/gpu}, and {lj/cut/coul/long/gpu} styles +are identical to the {lj/cut}, {lj/cut/coul/cut}, and {lj/cut/coul/long} +styles, except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of the manual for more details about hardware and software requirements for using GPUs. @@ -195,10 +208,12 @@ More details about these settings and various possible hardware configuration are in "this section"_Section_start.html#2_8 of the manual. -Additional requirements in your input script to run with style -{lj/cut/gpu} are as follows: +Additional requirements in your input script to run with GPU-enabled styles +are as follows: -The "newton pair"_newton.html setting must be {off}. +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. :line @@ -239,7 +254,8 @@ See the "run_style"_run_style.html command for details. [Restrictions:] The {lj/cut/coul/long} and {lj/cut/coul/long/tip4p} styles are part of -the "kspace" package. The {lj/cut/gpu} style is part of the "gpu" +the "kspace" package. The {lj/cut/gpu}, {lj/cut/coul/cut/gpu}, and +{lj/cut/coul/long/gpu} styles are part of the "gpu" package. The {lj/cut/opt} style is part of the "opt" package. They are only enabled if LAMMPS was built with those packages. See the "Making LAMMPS"_Section_start.html#2_3 section for more info. Note diff --git a/doc/pair_lj96_cut.html b/doc/pair_lj96_cut.html index 7b0b9f184a..997ba0983c 100644 --- a/doc/pair_lj96_cut.html +++ b/doc/pair_lj96_cut.html @@ -11,15 +11,19 @@

    pair_style lj96/cut command

    +

    pair_style lj96/cut/gpu command +

    Syntax:

    -
    pair_style lj96/cut cutoff 
    +
    pair_style style cutoff 
     
    -
    • cutoff = global cutoff for lj96/cut interactions (distance units) +
      • style = lj96/cut or lj96/cut/gpu +
      • cutoff = global cutoff for lj96/cut interactions (distance units)

      Examples:

      pair_style lj96/cut 2.5
      +pair_style lj96/cut/gpu 2.5
       pair_coeff * * 1.0 1.0 4.0
       pair_coeff 1 1 1.0 1.0 
       
      @@ -32,6 +36,9 @@ of the standard 12/6 potential, given by

      Rc is the cutoff.

      +

      Style lj96/cut/gpu is a GPU-enabled version of style lj96/cut. +See more details below. +

      The following coefficients must be defined for each pair of atoms types via the pair_coeff command as in the examples above, or in the data file or restart files read by the @@ -47,6 +54,26 @@ cutoff specified in the pair_style command is used.


      +

      The lj96/cut/gpu style is identical to the lj96/cut style, except that +each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

      +

More details about these settings and various possible hardware +configurations are in this section of the +manual. +

      +

      Additional requirements in your input script to run with the lj96/cut/gpu +style are as follows: +

      +

The newton pair setting must be off and +fix gpu must be used. The fix controls the +essential GPU selection and initialization steps. +
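For completeness, a short sketch under the same assumptions (the fix ID is a placeholder; the pair values follow the examples above):

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      lj96/cut/gpu 2.5
pair_coeff      * * 1.0 1.0 4.0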

      +
      +

      Mixing, shift, table, tail correction, restart, rRESPA info:

      For atom type pairs I,J and I != J, the epsilon and sigma coefficients @@ -76,7 +103,11 @@ details.


      -

      Restrictions: none +

      Restrictions: +

      +

The lj96/cut/gpu style is part of the "gpu" package. It +is only enabled if LAMMPS is built with this package. See the +Making LAMMPS section for more info.

      Related commands:

      diff --git a/doc/pair_lj96_cut.txt b/doc/pair_lj96_cut.txt index 1f82e5dbd7..892fc6fa8a 100644 --- a/doc/pair_lj96_cut.txt +++ b/doc/pair_lj96_cut.txt @@ -7,16 +7,19 @@ :line pair_style lj96/cut command :h3 +pair_style lj96/cut/gpu command :h3 [Syntax:] -pair_style lj96/cut cutoff :pre +pair_style style cutoff :pre +style = {lj96/cut} or {lj96/cut/gpu} cutoff = global cutoff for lj96/cut interactions (distance units) :ul [Examples:] pair_style lj96/cut 2.5 +pair_style lj96/cut/gpu 2.5 pair_coeff * * 1.0 1.0 4.0 pair_coeff 1 1 1.0 1.0 :pre @@ -29,6 +32,9 @@ of the standard 12/6 potential, given by Rc is the cutoff. +Style {lj96/cut/gpu} is a GPU-enabled version of style {lj96/cut}. +See more details below. + The following coefficients must be defined for each pair of atoms types via the "pair_coeff"_pair_coeff.html command as in the examples above, or in the data file or restart files read by the @@ -44,6 +50,26 @@ cutoff specified in the pair_style command is used. :line +The {lj96/cut/gpu} style is identical to the {lj96/cut} style, except that +each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with the {lj96/cut/gpu} +style are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls the +essential GPU selection and initialization steps + +:line + [Mixing, shift, table, tail correction, restart, rRESPA info]: For atom type pairs I,J and I != J, the epsilon and sigma coefficients @@ -73,7 +99,11 @@ details. :line -[Restrictions:] none +[Restrictions:] + +The {lj96/cut/gpu} style is part of the "gpu" package. It +is only enabled if LAMMPS is built with this packages. See the +"Making LAMMPS"_Section_start.html#2_3 section for more info. [Related commands:]