Note that the GPU package always sets the newton pair setting to off.
This is not the case for the USER-CUDA package.

Optional keyword/value pairs can also be specified. Each has a
default value as listed below.

The gpuID keyword allows selection of which GPUs on each node will
be used for a simulation. GPU IDs range from 0 to N-1, where N is the
physical number of GPUs per node. An ID is specified for each of the
Ngpu GPUs being used. For example, if you have three GPUs on a
machine, one of which is used for the X-Server (the GPU with ID 1)
while the others (with IDs 0 and 2) are used for computations, you
would specify:

package cuda 2 gpuID 0 2

The purpose of the gpuID keyword is to allow two (or more)
simulations to be run on one workstation. In that case one could set
the first simulation to use GPU 0 and the second to use GPU 1. This
is not necessary, however, if the GPUs are in what is called compute
exclusive mode. Using that setting, every process will get its own
GPU automatically. This compute exclusive mode can be set as root
using the nvidia-smi tool, which is part of the CUDA installation.
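
For example, on a typical Linux workstation with recent NVIDIA
drivers, exclusive mode could be enabled for GPU 0 with something like
the following sketch (exact flag names may vary by driver version):

nvidia-smi -i 0 -c EXCLUSIVE_PROCESS   # GPU 0; run as root, flags may differ by driver version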

Also note that if the gpuID keyword is not used, the USER-CUDA
package sorts existing GPUs on each node according to their number of
multiprocessors. This way, compute GPUs will be prioritized over
X-Server GPUs.

If the timing keyword is specified, detailed timing information for
various subroutines will be output.

If the test keyword is specified, information for the atom with the
specified atom-ID will be output at several points during each
timestep. This is mainly useful for debugging purposes. Note that the
simulation will slow down dramatically if this option is used.
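
For example, with two GPUs per node, debug output for a hypothetical
atom with atom-ID 100 and timing output (assuming the timing keyword
takes no value, as implied above) could be requested with:

package cuda 2 test 100 timing   # atom-ID 100 is illustrative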

The thread keyword can be used to specify how GPU threads are
assigned work during pair style force evaluation. If the value =
tpa, one thread per atom is used. If the value = bpa, one block
per atom is used. If the value = auto, a short test is performed at
the beginning of each run to determine whether tpa or bpa mode is
faster. The result of this test is output. Since auto is the
default value, it is usually not necessary to use this keyword.
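
For example, to force block-per-atom mode on a node with two GPUs,
rather than relying on the automatic test, one could specify:

package cuda 2 thread bpa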

The gpu style invokes settings associated with the use of the GPU
package.

The Ngpu argument sets the number of GPUs per node. There must be
at least as many MPI tasks per node as GPUs, as set by the mpirun or
mpiexec command. If there are more MPI tasks (per node)
than GPUs, multiple MPI tasks will share each GPU.

Optional keyword/value pairs can also be specified. Each has a
default value as listed below.

The neigh keyword specifies where neighbor lists for pair style
computation will be built. If neigh is yes, which is the default,
neighbor list building is performed on the GPU. If neigh is no,
neighbor list building is performed on the CPU. GPU neighbor list
building currently cannot be used with a triclinic box. GPU neighbor
list building also cannot currently be used with hybrid pair styles.
GPU neighbor lists are not compatible with commands that are not
GPU-enabled. When a non-GPU enabled command requires a neighbor list,
it will also be built on the CPU. In these cases, it will typically
be more efficient to only use CPU neighbor list builds.
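
For example, a run using one GPU per node with a triclinic box or a
hybrid pair style could request CPU neighbor list builds as follows:

package gpu 1 neigh no   # build neighbor lists on the CPU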

The split keyword can be used for load balancing force calculations
between CPU and GPU cores in GPU-enabled pair styles. If 0 < split <
1.0, a fixed fraction of particles is offloaded to the GPU while force
calculation for the other particles occurs simultaneously on the
CPU. If split < 0.0, the optimal fraction (based on CPU and GPU
timings) is calculated every 25 timesteps. If split = 1.0, all
force calculations for GPU accelerated pair styles are performed on
the GPU. In this case, other hybrid pair interactions, bond, angle,
dihedral, improper, and long-range calculations can be performed on
the CPU while the GPU is performing force calculations for the
GPU-enabled pair style. If all CPU force computations complete
before the GPU completes, LAMMPS will block until the GPU has
finished before continuing the timestep.

As an example, if you have two GPUs per node and 8 CPU cores per node,
and would like to run on 4 nodes (32 cores) with dynamic balancing of
force calculation across CPU and GPU cores, you could specify

mpirun -np 32 lmp_machine -sf gpu -in in.script   # launch command
package gpu 2 split -1                            # input script command

In this case, all CPU cores and GPU devices on the nodes would be
utilized. Each GPU device would be shared by 4 CPU cores. The CPU
cores would perform force calculations for some fraction of the
particles at the same time the GPUs performed force calculation for
the other particles.

The gpuID keyword allows selection of which GPUs on each node will
be used for a simulation. The first and last values specify the
GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
of physical GPUs. If you only wish to use a subset, set Ngpu to a
smaller number and first/last to a sub-range of the available GPUs.
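
For example, on a node with three physical GPUs where GPU 0 drives the
X-Server, the remaining two GPUs could be selected as follows:

package gpu 2 gpuID 1 2   # use GPUs 1 and 2, skip GPU 0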

The tpa keyword sets the number of GPU threads per atom used to
perform force calculations. With the default value of 1, the number
of threads will be chosen based on the pair style; however, the value
can be set explicitly with this keyword to fine-tune performance. For
large cutoffs or with a small number of particles per GPU, increasing
the value can improve performance. The number of threads per atom must
be a power of 2 and currently cannot be greater than 32.
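
For example, a single-GPU run with a large cutoff might try 8 threads
per atom (an illustrative value):

package gpu 1 tpa 8   # must be a power of 2, no greater than 32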

The binsize keyword sets the size of bins used to bin atoms in
neighbor list builds. Setting this value is normally not needed; the
optimal value is close to the default, which is set equal to the
cutoff distance for the short range interactions plus the neighbor
skin. Note that this is 2x larger than the default bin size for
neighbor list builds on the CPU. This is because GPUs can perform
efficiently with much larger cutoffs than CPUs. This can be used to
reduce the time required for long-range calculations or in some cases
to eliminate them with pair style models such as coul/wolf or
coul/dsf. For very large cutoffs, it can be more efficient to use
smaller values for binsize in parallel simulations. For example, with
a cutoff of 20*sigma in LJ units and a neighbor skin distance of
sigma, a binsize = 5.25*sigma can be more efficient than the
default.
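
As a sketch of the example just described (LJ units with sigma = 1,
so distances are in units of sigma), the bin size could be set
explicitly as follows:

package gpu 1 binsize 5.25   # illustrative value in LJ distance units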

The device keyword can be used to tune parameters optimized for a
specific accelerator when using OpenCL. For CUDA, the device
keyword is ignored. Currently, the device type is limited to NVIDIA
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices
may be added later. The default device type can be specified when
building LAMMPS with the GPU library, via settings in the
lib/gpu/Makefile that is used.
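
For example, assuming the device types listed above are selected by
lowercase names such as kepler, an OpenCL build could be tuned as
follows:

package gpu 1 device kepler   # assumes lowercase device-type names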

The intel style invokes settings associated with the use of the
USER-INTEL package. All of its settings except the prec keyword
are ignored if LAMMPS was not built with Xeon Phi coprocessor support
when building with the USER-INTEL package. All of its settings,
including the prec keyword, are applicable if LAMMPS was built with
coprocessor support.

The Nphi argument sets the number of coprocessors per node.

Optional keyword/value pairs can also be specified. Each has a
default value as listed below.

The prec keyword argument determines the precision mode to use for
computing pair style forces, either on the CPU or on the coprocessor,
when using a USER-INTEL supported pair style. It can take a value
of single, mixed (the default), or double. Single means single
precision is used for the entire force calculation. Mixed means
forces between a pair of atoms are computed in single precision, but
accumulated and stored in double precision, including storage of
forces, torques, energies, and virial quantities. Double means
double precision is used for the entire force calculation.
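
For example, to request full double precision with one coprocessor per
node, one could specify:

package intel 1 prec double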

The balance keyword sets the fraction of pair style work offloaded
to the coprocessor, for values between 0.0 and 1.0 inclusive. While
this fraction of work is running on the coprocessor, other
calculations will run on the host, including neighbor and pair
calculations that are not offloaded, angle, bond, dihedral, kspace,
and some MPI communications. If the balance value is set to -1, the
fraction of work is dynamically adjusted automatically throughout the
run. This typically gives performance within 5 to 10 percent of the
optimal fixed fraction.
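
For example, to offload a fixed 60 percent of the pair style work to
one coprocessor per node (an illustrative fraction):

package intel 1 balance 0.6   # 0.6 is an illustrative fraction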

The ghost keyword determines whether or not ghost atoms, i.e. atoms
at the boundaries of processor sub-domains, are offloaded for neighbor
and force calculations. When the value = "no", ghost atoms are not
offloaded. This option can reduce the amount of data transfer with
the coprocessor and can also overlap MPI communication of forces with
computation on the coprocessor when the newton pair
setting is "on". When the value = "yes", ghost atoms are offloaded.
In some cases this can provide better performance, especially if the
balance fraction is high.
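
For example, to keep ghost atoms on the host, so that MPI
communication of forces can overlap with coprocessor computation when
newton pair is on:

package intel 1 ghost no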

The tpc keyword sets the maximum # of threads Ntpc that will
run on each physical core of the coprocessor. The default value is
set to 4, which is the number of hardware threads per core supported
by the current generation Xeon Phi chips.

The tptask keyword sets the maximum # of threads Ntptask that will
be used on the coprocessor for each MPI task. This, along with the
tpc keyword setting, are the only methods for changing the number of
threads used on the coprocessor. The default value is set to 240 =
60*4, which is the maximum # of threads supported by an entire current
generation Xeon Phi chip.
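
For example, to limit the coprocessor to 2 threads per core and 120
threads per MPI task (both illustrative values):

package intel 1 tpc 2 tptask 120   # illustrative thread limits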

The kokkos style invokes settings associated with the use of the
KOKKOS package.

The neigh keyword determines what kinds of neighbor lists are built.

The omp style invokes settings associated with the use of the
USER-OMP package.

The Nthreads argument sets the number of OpenMP threads allocated for
each MPI task. For example, if your system has nodes with dual
quad-core processors, it has a total of 8 cores per node. You could
use two MPI tasks per node (e.g. using the -ppn option of the mpirun
command), and set Nthreads = 4. This would use all 8 cores on each
node. Note that the product of MPI tasks * threads/task should not
exceed the physical number of cores (on a node), otherwise performance
will suffer.
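
As a sketch of that example (assuming a hypothetical 4-node run and an
MPI launcher that accepts the -ppn option), the launch and input
script commands could look like:

mpirun -np 8 -ppn 2 lmp_machine -sf omp -in in.script   # launch command: 2 MPI tasks per node
package omp 4                                           # input script command: 4 threads per task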

Setting Nthreads = 0 instructs LAMMPS to use whatever value is the
default for the given OpenMP environment. This is usually determined
via the OMP_NUM_THREADS environment variable or the compiler
runtime. Note that in most cases the default for OpenMP capable
compilers is to use one thread for each available CPU core when
OMP_NUM_THREADS is not explicitly set, which can lead to poor
performance.

Here are examples of how to set the environment variable when
launching LAMMPS:

env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script

or you can set it permanently in your shell's start-up script.
All three of these examples use a total of 4 CPU cores.

Note that different MPI implementations have different ways of passing
the OMP_NUM_THREADS environment variable to all MPI processes. The
2nd example line above is for MPICH; the 3rd example line with -x is
for OpenMPI. Check your MPI documentation for additional details.

What combination of threads and MPI tasks gives the best performance
is difficult to predict and can depend on many components of your
input. Not all features of LAMMPS support OpenMP threading via the
USER-OMP package, and the parallel efficiency can be very different,
too.

Optional keyword/value pairs can also be specified. Each has a
default value as listed below.

The neigh keyword specifies whether neighbor list building will be
multi-threaded in addition to force calculations. If neigh is set
to no, then neighbor list calculation is performed only by MPI tasks
with no OpenMP threading. If neigh is yes (the default), a
multi-threaded neighbor list build is used. Using neigh = yes is
almost always faster and should produce identical neighbor lists at
the expense of using more memory. Specifically, neighbor list pages
are allocated for all threads at the same time and each thread works
within its own pages.
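
For example, to use 4 threads per MPI task for force calculations but
keep neighbor list builds un-threaded:

package omp 4 neigh no   # 4 threads for forces, serial neighbor builds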

The gpu style of this command can only be invoked if LAMMPS was built
with the GPU package. See the Making
LAMMPS section for more info.

The intel style of this command can only be invoked if LAMMPS was
built with the USER-INTEL package. See the Making
LAMMPS section for more info.

The kk style of this command can only be invoked if LAMMPS was built
with the KOKKOS package. See the Making
LAMMPS section for more info.

The omp style of this command can only be invoked if LAMMPS was built
with the USER-OMP package. See the Making
LAMMPS section for more info.

Related commands:

suffix, "-pk" command-line setting

Default:

To use the USER-CUDA package, the package cuda command must be invoked
explicitly in your input script or via the "-pk cuda" command-line
switch. This will set the # of GPUs/node. The option defaults are
gpuID = 0 to Ngpu-1, timing = not enabled, test = not enabled, and
thread = auto.
For the GPU package, the default is Ngpu = 1 and the option defaults
+are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize =
+pair cutoff + neighbor skin, device = not used. These settings are
+made automatically if the "-sf gpu" command-line
+switch is used. If it is not used, you
+must invoke the package gpu command in your input script or via the
+"-pk gpu" command-line switch.

For the USER-INTEL package, the default is Nphi = 1 and the option
defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240. The
default ghost option is determined by the pair style being used. The
value used is output to the screen in the offload report at the end
of each run. These settings are made automatically if the "-sf intel"
command-line switch is used. If it is not used, you must invoke the
package intel command in your input script or via the "-pk intel"
command-line switch.

The default settings for the KOKKOS package are "package kokkos neigh
full comm/exchange host comm/forward host". This is the case whether
the "-sf kk" command-line switch is used or not.

For the USER-OMP package, the default is Nthreads = 0 and the option
defaults are neigh = yes. These settings are made automatically if
the "-sf omp" command-line switch is used. If it is not used, you
must invoke the package omp command in your input script or via the
"-pk omp" command-line switch.