diff --git a/doc/Manual.html b/doc/Manual.html
index 359798c3cd..857a90c623 100644
--- a/doc/Manual.html
+++ b/doc/Manual.html
@@ -152,23 +152,23 @@ it gives quick access to documentation for all LAMMPS commands.
 
 <UL>  5.1 <A HREF = "Section_accelerate.html#acc_1">Measuring performance</A> 
 <BR>
-  5.2 <A HREF = "Section_accelerate.html#acc_2">General strategies</A> 
+  5.2 <A HREF = "Section_accelerate.html#acc_2">Algorithms and code options to boost performance</A> 
 <BR>
-  5.3 <A HREF = "Section_accelerate.html#acc_3">Packages with optimized styles</A> 
+  5.3 <A HREF = "Section_accelerate.html#acc_3">Accelerator packages with optimized styles</A> 
 <BR>
-  5.4 <A HREF = "Section_accelerate.html#acc_4">OPT package</A> 
+<UL>    5.3.1 <A HREF = "accelerate_cuda.html">USER-CUDA package</A> 
 <BR>
-  5.5 <A HREF = "Section_accelerate.html#acc_5">USER-OMP package</A> 
+    5.3.2 <A HREF = "accelerate_gpu.html">GPU package</A> 
 <BR>
-  5.6 <A HREF = "Section_accelerate.html#acc_6">GPU package</A> 
+    5.3.3 <A HREF = "accelerate_intel.html">USER-INTEL package</A> 
 <BR>
-  5.7 <A HREF = "Section_accelerate.html#acc_7">USER-CUDA package</A> 
+    5.3.4 <A HREF = "accelerate_kokkos.html">KOKKOS package</A> 
 <BR>
-  5.8 <A HREF = "Section_accelerate.html#acc_8">KOKKOS package</A> 
+    5.3.5 <A HREF = "accelerate_omp.html">USER-OMP package</A> 
 <BR>
-  5.9 <A HREF = "Section_accelerate.html#acc_9">USER-INTEL package</A> 
-<BR>
-  5.10 <A HREF = "acc_10">Comparison of GPU and USER-CUDA packages</A> 
+    5.3.6 <A HREF = "accelerate_opt.html">OPT package</A> 
+<BR></UL>
+  5.4 <A HREF = "Section_accelerate.html#acc_4">Comparison of various accelerator packages</A> 
 <BR></UL>
 <LI><A HREF = "Section_howto.html">How-to discussions</A> 
 
@@ -417,16 +417,6 @@ it gives quick access to documentation for all LAMMPS commands.
 
 
 
-
-
-
-
-
-
-
-
-
-
 
 
 
diff --git a/doc/Manual.txt b/doc/Manual.txt
index 59ed9b9387..a0aaf08dd6 100644
--- a/doc/Manual.txt
+++ b/doc/Manual.txt
@@ -120,15 +120,15 @@ it gives quick access to documentation for all LAMMPS commands.
   4.2 "User packages"_pkg_2 :ule,b
 "Accelerating LAMMPS performance"_Section_accelerate.html :l
   5.1 "Measuring performance"_acc_1 :ulb,b
-  5.2 "General strategies"_acc_2 :b
-  5.3 "Packages with optimized styles"_acc_3 :b
-  5.4 "OPT package"_acc_4 :b
-  5.5 "USER-OMP package"_acc_5 :b
-  5.6 "GPU package"_acc_6 :b
-  5.7 "USER-CUDA package"_acc_7 :b
-  5.8 "KOKKOS package"_acc_8 :b
-  5.9 "USER-INTEL package"_acc_9 :b
-  5.10 "Comparison of GPU and USER-CUDA packages"_acc_10 :ule,b
+  5.2 "Algorithms and code options to boost performance"_acc_2 :b
+  5.3 "Accelerator packages with optimized styles"_acc_3 :b
+    5.3.1 "USER-CUDA package"_accelerate_cuda.html :ulb,b
+    5.3.2 "GPU package"_accelerate_gpu.html :b
+    5.3.3 "USER-INTEL package"_accelerate_intel.html :b
+    5.3.4 "KOKKOS package"_accelerate_kokkos.html :b
+    5.3.5 "USER-OMP package"_accelerate_omp.html :b
+    5.3.6 "OPT package"_accelerate_opt.html :ule,b
+  5.4 "Comparison of various accelerator packages"_acc_4 :ule,b
 "How-to discussions"_Section_howto.html :l
   6.1 "Restarting a simulation"_howto_1 :ulb,b
   6.2 "2d simulations"_howto_2 :b
@@ -216,11 +216,6 @@ it gives quick access to documentation for all LAMMPS commands.
 :link(acc_2,Section_accelerate.html#acc_2)
 :link(acc_3,Section_accelerate.html#acc_3)
 :link(acc_4,Section_accelerate.html#acc_4)
-:link(acc_5,Section_accelerate.html#acc_5)
-:link(acc_6,Section_accelerate.html#acc_6)
-:link(acc_7,Section_accelerate.html#acc_7)
-:link(acc_8,Section_accelerate.html#acc_8)
-:link(acc_9,Section_accelerate.html#acc_9)
 
 :link(howto_1,Section_howto.html#howto_1)
 :link(howto_2,Section_howto.html#howto_2)
diff --git a/doc/Section_accelerate.html b/doc/Section_accelerate.html
index 27b80f3d63..7547b571af 100644
--- a/doc/Section_accelerate.html
+++ b/doc/Section_accelerate.html
@@ -17,22 +17,38 @@ Section</A>
 performance for different classes of problems running on different
 kinds of machines.
 </P>
-5.1 <A HREF = "#acc_1">Measuring performance</A><BR>
-5.2 <A HREF = "#acc_2">General strategies</A><BR>
-5.3 <A HREF = "#acc_3">Packages with optimized styles</A><BR>
-5.4 <A HREF = "#acc_4">OPT package</A><BR>
-5.5 <A HREF = "#acc_5">USER-OMP package</A><BR>
-5.6 <A HREF = "#acc_6">GPU package</A><BR>
-5.7 <A HREF = "#acc_7">USER-CUDA package</A><BR>
-5.8 <A HREF = "#acc_8">KOKKOS package</A><BR>
-5.9 <A HREF = "#acc_9">USER-INTEL package</A><BR>
-5.10 <A HREF = "#acc_10">Comparison of USER-CUDA, GPU, and KOKKOS packages</A> <BR>
+<P>There are two thrusts to the discussion that follows.  The
+first is using code options that implement alternate algorithms
+that can speed-up a simulation.  The second is to use one
+of the several accelerator packages provided with LAMMPS that
+contain code optimized for certain kinds of hardware, including
+multi-core CPUs, GPUs, and Intel Xeon Phi coprocessors.
+</P>
+<UL><LI>5.1 <A HREF = "#acc_1">Measuring performance</A> 
 
+<LI>5.2 <A HREF = "#acc_2">Algorithms and code options to boost performance</A> 
+
+<LI>5.3 <A HREF = "#acc_3">Accelerator packages with optimized styles</A> 
+
+<UL><LI>    5.3.1 <A HREF = "accelerate_cuda.html">USER-CUDA package</A> 
+
+<LI>    5.3.2 <A HREF = "accelerate_gpu.html">GPU package</A> 
+
+<LI>    5.3.3 <A HREF = "accelerate_intel.html">USER-INTEL package</A> 
+
+<LI>    5.3.4 <A HREF = "accelerate_kokkos.html">KOKKOS package</A> 
+
+<LI>    5.3.5 <A HREF = "accelerate_omp.html">USER-OMP package</A> 
+
+<LI>    5.3.6 <A HREF = "accelerate_opt.html">OPT package</A> 
+</UL>
+<LI>5.4 <A HREF = "#acc_4">Comparison of various accelerator packages</A> 
+</UL>
 <P>The <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the LAMMPS
 web site gives performance results for the various accelerator
-packages discussed in this section for several of the standard LAMMPS
-benchmarks, as a function of problem size and number of compute nodes,
-on different hardware platforms.
+packages discussed in Section 5.3, for several of the standard LAMMPS
+benchmark problems, as a function of problem size and number of
+compute nodes, on different hardware platforms.
 </P>
 <HR>
 
@@ -104,11 +120,9 @@ various options.
 <LI>Staggered PPPM
 <LI>single vs double PPPM
 <LI>partial charge PPPM
-<LI>verlet/split
-<LI>processor mapping via processors numa command
-<LI>load-balancing: balance and fix balance
-<LI>processor command for layout
-<LI>OMP when lots of cores 
+<LI>verlet/split run style
+<LI>processor command for proc layout and numa layout
+<LI>load-balancing: balance and fix balance 
 </UL>
 <P>2-FFT PPPM, also called <I>analytic differentiation</I> or <I>ad</I> PPPM, uses
 2 FFTs instead of the 4 FFTs used by the default <I>ik differentiation</I>
@@ -146,28 +160,30 @@ such as when using a barostat.
 <A HREF = "fix.html">fixes</A>, <A HREF = "compute.html">computes</A>, and other commands have
 been added to LAMMPS, which will typically run faster than the
 standard non-accelerated versions.  Some require appropriate hardware
-on your system, e.g. GPUs or Intel Xeon Phi chips.
+to be present on your system, e.g. GPUs or Intel Xeon Phi
+coprocessors.
 </P>
-<P>All of these commands are in packages provided with LAMMPS, as
-explained <A HREF = "Section_packages.html">here</A>.  Currently, there are 6 such
-accelerator packages in LAMMPS, either as standard or user packages:
+<P>All of these commands are in packages provided with LAMMPS.  An
+overview of packages is given in <A HREF = "Section_packages.html">Section
+packages</A>.  Currently, there are 6 accelerator
+packages in LAMMPS, either as standard or user packages:
 </P>
 <DIV ALIGN=center><TABLE  BORDER=1 >
-<TR><TD ><A HREF = "#acc_7">USER-CUDA</A> </TD><TD > for NVIDIA GPUs</TD></TR>
-<TR><TD ><A HREF = "acc_6">GPU</A> </TD><TD > for NVIDIA GPUs as well as OpenCL support</TD></TR>
-<TR><TD ><A HREF = "acc_9">USER-INTEL</A> </TD><TD > for Intel CPUs and Intel Xeon Phi</TD></TR>
-<TR><TD ><A HREF = "acc_8">KOKKOS</A> </TD><TD > for GPUs, Intel Xeon Phi, and OpenMP threading</TD></TR>
-<TR><TD ><A HREF = "acc_5">USER-OMP</A> </TD><TD > for OpenMP threading</TD></TR>
-<TR><TD ><A HREF = "acc_4">OPT</A> </TD><TD > generic CPU optimizations 
+<TR><TD ><A HREF = "accelerate_cuda.html">USER-CUDA</A> </TD><TD > for NVIDIA GPUs</TD></TR>
+<TR><TD ><A HREF = "accelerate_gpu.html">GPU</A> </TD><TD > for NVIDIA GPUs as well as OpenCL support</TD></TR>
+<TR><TD ><A HREF = "accelerate_intel.html">USER-INTEL</A> </TD><TD > for Intel CPUs and Intel Xeon Phi</TD></TR>
+<TR><TD ><A HREF = "accelerate_kokkos.html">KOKKOS</A> </TD><TD > for GPUs, Intel Xeon Phi, and OpenMP threading</TD></TR>
+<TR><TD ><A HREF = "accelerate_omp.html">USER-OMP</A> </TD><TD > for OpenMP threading</TD></TR>
+<TR><TD ><A HREF = "accelerate_opt.html">OPT</A> </TD><TD > generic CPU optimizations 
 </TD></TR></TABLE></DIV>
 
 <P>Any accelerated style has the same name as the corresponding standard
 style, except that a suffix is appended.  Otherwise, the syntax for
-the command that specifies the style is identical, their functionality
-is the same, and the numerical results it produces should also be the
+the command that uses the style is identical, its functionality is
+the same, and the numerical results it produces should also be the
 same, except for precision and round-off effects.
 </P>
-<P>For example, all of these styles are variants of the basic
+<P>For example, all of these styles are accelerated variants of the
 Lennard-Jones <A HREF = "pair_lj.html">pair_style lj/cut</A>:
 </P>
 <UL><LI><A HREF = "pair_lj.html">pair_style lj/cut/cuda</A>
@@ -177,25 +193,40 @@ Lennard-Jones <A HREF = "pair_lj.html">pair_style lj/cut</A>:
 <LI><A HREF = "pair_lj.html">pair_style lj/cut/omp</A>
 <LI><A HREF = "pair_lj.html">pair_style lj/cut/opt</A> 
 </UL>
-<P>Assuming LAMMPS was built with the appropriate package, a simulation
-using accelerated styles from the package can be run without modifying
-your input script, by specifying <A HREF = "Section_start.html#start_7">command-line
-switches</A>.  The details of how to do this
-vary from package to package and are explained below.  There is also a
-<A HREF = "suffix.html">suffix</A> command and a <A HREF = "package.html">package</A> command that
-accomplish the same thing and can be used within an input script if
-preferred.  The <A HREF = "suffix.html">suffix</A> command allows more precise
-control of whether an accelerated or unaccelerated version of a style
-is used at various points within an input script.
+<P>To see what accelerated styles are currently available, see
+<A HREF = "Section_commands.html#cmd_5">Section_commands 5</A> of the manual.  The
+doc pages for individual commands (e.g. <A HREF = "pair_lj.html">pair lj/cut</A> or
+<A HREF = "fix_nve.html">fix nve</A>) also list any accelerated variants available
+for that style.
 </P>
-<P>To see what styles are currently available in each of the accelerated
-packages, see <A HREF = "Section_commands.html#cmd_5">Section_commands 5</A> of the
-manual.  The doc page for individual commands (e.g. <A HREF = "pair_lj.html">pair
-lj/cut</A> or <A HREF = "fix_nve.html">fix nve</A>) also lists any
-accelerated variants available for that style.
+<P>To use an accelerator package in LAMMPS, and one or more of the styles
+it provides, follow these general steps.  Details vary from package to
+package and are explained in the individual accelerator sub-section
+doc pages, listed above:
+</P>
+<DIV ALIGN=center><TABLE  BORDER=1 >
+<TR><TD >build the accelerator library </TD><TD >  only for USER-CUDA and GPU packages </TD></TR>
+<TR><TD >install the accelerator package </TD><TD >  make yes-opt, make yes-user-intel, etc </TD></TR>
+<TR><TD >add compile/link flags to Makefile.machine </TD><TD >  in src/MAKE, <br>  only for USER-INTEL, KOKKOS, USER-OMP packages </TD></TR>
+<TR><TD >re-build LAMMPS </TD><TD >  make machine </TD></TR>
+<TR><TD >run a LAMMPS simulation </TD><TD >  lmp_machine &lt; in.script </TD></TR>
+<TR><TD >enable the accelerator package </TD><TD >  via "-c on" and "-k on" <A HREF = "Section_start.html#start_7">command-line switches</A>, <br>  only for USER-CUDA and KOKKOS packages </TD></TR>
+<TR><TD >set any needed options for the package </TD><TD >  via "-pk" <A HREF = "Section_start.html#start_7">command-line switch</A> or  <A HREF = "package.html">package</A> command, <br>  only if defaults need to be changed </TD></TR>
+<TR><TD >use accelerated styles in your input script </TD><TD >  via "-sf" <A HREF = "Section_start.html#start_7">command-line switch</A> or  <A HREF = "suffix.html">suffix</A> command 
+</TD></TR></TABLE></DIV>
+
+<P>The first 4 steps typically only need to be done once, to create an
+executable that uses one or more accelerator packages.  We are working
+to create a "make" tool that will perform all 4 of these steps in a
+single command.
+</P>
+<P>The last 4 steps can all be done from the command-line when LAMMPS is
+launched, without changing your input script.  Or you can add
+<A HREF = "package.html">package</A> and <A HREF = "suffix.html">suffix</A> commands to your input
+script.
 </P>
 <P>The examples directory has several sub-directories with scripts and
-README files for using the accelerator packages:
+README files for how to use the following accelerator packages:
 </P>
 <UL><LI>examples/cuda for USER-CUDA package
 <LI>examples/gpu for GPU package
@@ -205,13 +236,18 @@ README files for using the accelerator packages:
 <P>Likewise, the bench directory has FERMI and KEPLER sub-directories
 with scripts and README files for using all the accelerator packages.
 </P>
+<P>As mentioned above, the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark
+page</A> of the LAMMPS web site gives
+performance results for the various accelerator packages for several
+of the standard LAMMPS benchmark problems, as a function of problem
+size and number of compute nodes, on different hardware platforms.
+</P>
 <P>Here is a brief summary of what the various packages provide.  Details
-are in individual sections below.
+are in the individual package sub-sections listed above.
 </P>
 <UL><LI>Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU
-packages, and can be run on NVIDIA GPUs associated with your CPUs.
-The speed-up on a GPU depends on a variety of factors, as discussed
-below. 
+packages, and can be run on NVIDIA GPUs.  The speed-up on a GPU
+depends on a variety of factors, as discussed below. 
 
 <LI>Styles with an "intel" suffix are part of the USER-INTEL
 package. These styles support vectorized single and mixed precision
@@ -236,1409 +272,29 @@ overload the available bandwidth for communication.
 speed-up the pairwise calculations of your simulation by 5-25% on a
 CPU. 
 </UL>
-<P>The following sections explain:
+<P>The individual accelerator package sub-sections explain:
 </P>
 <UL><LI>what hardware and software the accelerated package requires
 <LI>how to build LAMMPS with the accelerated package
-<LI>how to run with the accelerated package via either command-line switches or modifying the input script
+<LI>how to run with the accelerated package either via command-line switches or by modifying the input script
 <LI>speed-ups to expect
 <LI>guidelines for best performance
 <LI>restrictions 
 </UL>
-<P>The final section compares and contrasts the USER-CUDA, GPU, and
-KOKKOS packages, since they all enable use of NVIDIA GPUs.
-</P>
 <HR>
 
-<H4><A NAME = "acc_4"></A>5.4 OPT package 
+<H4><A NAME = "acc_4"></A>5.4 Comparison of various accelerator packages 
 </H4>
-<P>The OPT package was developed by James Fischer (High Performance
-Technologies), David Richie, and Vincent Natoli (Stone Ridge
-Technologies).  It contains a handful of pair styles whose compute()
-methods were rewritten in C++ templated form to reduce the overhead
-due to if tests and other conditional code.
+<P>NOTE: this section still needs to be re-worked with additional KOKKOS
+and USER-INTEL information.
 </P>
-<P>Here is a quick overview of how to use the OPT package:
+<P>This section compares and contrasts the various accelerator
+options, since there are multiple ways to perform OpenMP threading,
+run on GPUs, and run on Intel Xeon Phi coprocessors.
 </P>
-<UL><LI>include the OPT package and build LAMMPS
-<LI>use OPT pair styles in your input script 
-</UL>
-<P>The last step can be done using the "-sf opt" <A HREF = "Section_start.html#start_7">command-line
-switch</A>.  Or the effect of the "-sf" switch
-can be duplicated by adding a <A HREF = "suffix.html">suffix opt</A> command to your
-input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>None.
-</P>
-<P><B>Building LAMMPS with the OPT package:</B>
-</P>
-<P>Include the package and build LAMMPS:
-</P>
-<PRE>cd lammps/src
-make yes-opt
-make machine 
-</PRE>
-<P>No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-</P>
-<P><B>Run with the OPT package from the command line:</B>
-</P>
-<P>Use the "-sf opt" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "opt" to styles that support it.
-</P>
-<PRE>lmp_machine -sf opt -in in.script
-mpirun -np 4 lmp_machine -sf opt -in in.script 
-</PRE>
-<P><B>Or run with the OPT package by editing an input script:</B>
-</P>
-<P>Use the <A HREF = "suffix.html">suffix opt</A> command, or you can explicitly add an
-"opt" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/opt 2.5 
-</PRE>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>You should see a reduction in the "Pair time" value printed at the end
-of a run.  On most machines for reasonable problem sizes, it will be a
-5 to 20% savings.
-</P>
-<P><B>Guidelines for best performance:</B>
-</P>
-<P>None.  Just try out an OPT pair style to see how it performs.
-</P>
-<P><B>Restrictions:</B>
-</P>
-<P>None.
-</P>
-<HR>
-
-<H4><A NAME = "acc_5"></A>5.5 USER-OMP package 
-</H4>
-<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple
-University.  It provides multi-threaded versions of most pair styles,
-nearly all bonded styles (bond, angle, dihedral, improper), several
-Kspace styles, and a few fix styles.  The package currently
-uses the OpenMP interface for multi-threading.
-</P>
-<P>Here is a quick overview of how to use the USER-OMP package:
-</P>
-<UL><LI>use the -fopenmp flag for compiling and linking in your Makefile.machine
-<LI>include the USER-OMP package and build LAMMPS
-<LI>use the mpirun command to set the number of MPI tasks/node
-<LI>specify how many threads per MPI task to use
-<LI>use USER-OMP styles in your input script 
-</UL>
-<P>The latter two steps can be done using the "-pk omp" and "-sf omp"
-<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the <A HREF = "package.html">package omp</A> or <A HREF = "suffix.html">suffix omp</A> commands
-respectively to your input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>Your compiler must support the OpenMP interface.  You should have one
-or more multi-core CPUs so that multiple threads can be launched by an
-MPI task running on a CPU.
-</P>
-<P><B>Building LAMMPS with the USER-OMP package:</B>
-</P>
-<P>Include the package and build LAMMPS:
-</P>
-<PRE>cd lammps/src
-make yes-user-omp
-make machine 
-</PRE>
-<P>Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both
-the CCFLAGS and LINKFLAGS variables.  For GNU and Intel compilers,
-this flag is "-fopenmp".  Without this flag the USER-OMP styles will
-still be compiled and work, but will not support multi-threading.
-</P>
-<P><B>Run with the USER-OMP package from the command line:</B>
-</P>
-<P>The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-</P>
-<P>You need to choose how many threads per MPI task will be used by the
-USER-OMP package.  Note that the product of MPI tasks * threads/task
-should not exceed the physical number of cores (on a node), otherwise
-performance will suffer.
-</P>
-<P>Use the "-sf omp" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "omp" to styles that support it.  Use
-the "-pk omp Nt" <A HREF = "Section_start.html#start_7">command-line switch</A>, to
-set Nt = # of OpenMP threads per MPI task to use.
-</P>
-<PRE>lmp_machine -sf omp -pk omp 16 -in in.script                       # 1 MPI task on a 16-core node
-mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script           # 4 MPI tasks each with 4 threads on a single 16-core node
-mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script   # ditto on 8 16-core nodes 
-</PRE>
-<P>Note that if the "-sf omp" switch is used, it also issues a default
-<A HREF = "package.html">package omp 0</A> command, which sets the number of threads
-per MPI task via the OMP_NUM_THREADS environment variable.
-</P>
-<P>Using the "-pk" switch explicitly allows for direct setting of the
-number of threads and additional options.  Its syntax is the same as
-the "package omp" command.  See the <A HREF = "package.html">package</A> command doc
-page for details, including the default values used for all its
-options if it is not specified, and how to set the number of threads
-via the OMP_NUM_THREADS environment variable if desired.
-</P>
-<P><B>Or run with the USER-OMP package by editing an input script:</B>
-</P>
-<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and threads/MPI task is the same.
-</P>
-<P>Use the <A HREF = "suffix.html">suffix omp</A> command, or you can explicitly add an
-"omp" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/omp 2.5 
-</PRE>
-<P>You must also use the <A HREF = "package.html">package omp</A> command to enable the
-USER-OMP package, unless the "-sf omp" or "-pk omp" <A HREF = "Section_start.html#start_7">command-line
-switches</A> were used.  It specifies how many
-threads per MPI task to use, as well as other options.  Its doc page
-explains how to set the number of threads via an environment variable
-if desired.
-</P>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>Depending on which styles are accelerated, you should look for a
-reduction in the "Pair time", "Bond time", "KSpace time", and "Loop
-time" values printed at the end of a run.  
-</P>
-<P>You may see a small performance advantage (5 to 20%) when running a
-USER-OMP style (in serial or parallel) with a single thread per MPI
-task, versus running standard LAMMPS with its standard
-(un-accelerated) styles (in serial or all-MPI parallelization with 1
-task/core).  This is because many of the USER-OMP styles contain
-similar optimizations to those used in the OPT package, as described
-above.
-</P>
-<P>With multiple threads/task, the optimal choice of MPI tasks/node and
-OpenMP threads/task can vary a lot and should always be tested via
-benchmark runs for a specific simulation running on a specific
-machine, paying attention to guidelines discussed in the next
-sub-section.
-</P>
-<P>A description of the multi-threading strategy used in the USER-OMP
-package and some performance examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented
-here</A>
-</P>
-<P><B>Guidelines for best performance:</B>
-</P>
-<P>For many problems on current generation CPUs, running the USER-OMP
-package with a single thread/task is faster than running with multiple
-threads/task.  This is because the MPI parallelization in LAMMPS is
-often more efficient than multi-threading as implemented in the
-USER-OMP package.  The parallel efficiency (in a threaded sense) also
-varies for different USER-OMP styles.
-</P>
-<P>Using multiple threads/task can be more effective under the following
-circumstances:
-</P>
-<UL><LI>Individual compute nodes have a significant number of CPU cores but
-the CPU itself has limited memory bandwidth, e.g. for Intel Xeon 53xx
-(Clovertown) and 54xx (Harpertown) quad core processors. Running one
-MPI task per CPU core will result in significant performance
-degradation, so that running with 4 or even only 2 MPI tasks per node
-is faster.  Running in hybrid MPI+OpenMP mode will reduce the
-inter-node communication bandwidth contention in the same way, but
-offers an additional speedup by utilizing the otherwise idle CPU
-cores. 
-
-<LI>The interconnect used for MPI communication does not provide
-sufficient bandwidth for a large number of MPI tasks per node.  For
-example, this applies to running over gigabit ethernet or on Cray XT4
-or XT5 series supercomputers.  As in the aforementioned case, this
-effect worsens when using an increasing number of nodes. 
-
-<LI>The system has a spatially inhomogeneous particle density which does
-not map well to the <A HREF = "processors.html">domain decomposition scheme</A> or
-<A HREF = "balance.html">load-balancing</A> options that LAMMPS provides.  This is
-because multi-threading achives parallelism over the number of
-particles, not via their distribution in space. 
-
-<LI>A machine is being used in "capability mode", i.e. near the point
-where MPI parallelism is maxed out.  For example, this can happen when
-using the <A HREF = "kspace_style.html">PPPM solver</A> for long-range
-electrostatics on large numbers of nodes.  The scaling of the KSpace
-calculation (see the <A HREF = "kspace_style.html">kspace_style</A> command) becomes
-the performance-limiting factor.  Using multi-threading allows less
-MPI tasks to be invoked and can speed-up the long-range solver, while
-increasing overall performance by parallelizing the pairwise and
-bonded calculations via OpenMP.  Likewise additional speedup can be
-sometimes be achived by increasing the length of the Coulombic cutoff
-and thus reducing the work done by the long-range solver.  Using the
-<A HREF = "run_style.html">run_style verlet/split</A> command, which is compatible
-with the USER-OMP package, is an alternative way to reduce the number
-of MPI tasks assigned to the KSpace calculation. 
-</UL>
-<P>Additional performance tips are as follows:
-</P>
-<UL><LI>The best parallel efficiency from <I>omp</I> styles is typically achieved
-when there is at least one MPI task per physical processor,
-i.e. socket or die. 
-
-<LI>It is usually most efficient to restrict threading to a single
-socket, i.e. use one or more MPI task per socket. 
-
-<LI>Several current MPI implementation by default use a processor affinity
-setting that restricts each MPI task to a single CPU core.  Using
-multi-threading in this mode will force the threads to share that core
-and thus is likely to be counterproductive.  Instead, binding MPI
-tasks to a (multi-core) socket, should solve this issue. 
-</UL>
-<P><B>Restrictions:</B>
-</P>
-<P>None.
-</P>
-<HR>
-
-<H4><A NAME = "acc_6"></A>5.6 GPU package 
-</H4>
-<P>The GPU package was developed by Mike Brown at ORNL and his
-collaborators, particularly Trung Nguyen (ORNL).  It provides GPU
-versions of many pair styles, including the 3-body Stillinger-Weber
-pair style, and for <A HREF = "kspace_style.html">kspace_style pppm</A> for
-long-range Coulombics.  It has the following general features:
-</P>
-<UL><LI>It is designed to exploit common GPU hardware configurations where one
-or more GPUs are coupled to many cores of one or more multi-core CPUs,
-e.g. within a node of a parallel machine. 
-
-<LI>Atom-based data (e.g. coordinates, forces) moves back-and-forth
-between the CPU(s) and GPU every timestep. 
-
-<LI>Neighbor lists can be built on the CPU or on the GPU 
-
-<LI>The charge assignement and force interpolation portions of PPPM can be
-run on the GPU.  The FFT portion, which requires MPI communication
-between processors, runs on the CPU. 
-
-<LI>Asynchronous force computations can be performed simultaneously on the
-CPU(s) and GPU. 
-
-<LI>It allows for GPU computations to be performed in single or double
-precision, or in mixed-mode precision, where pairwise forces are
-computed in single precision, but accumulated into double-precision
-force vectors. 
-
-<LI>LAMMPS-specific code is in the GPU package.  It makes calls to a
-generic GPU library in the lib/gpu directory.  This library provides
-NVIDIA support as well as more general OpenCL support, so that the
-same functionality can eventually be supported on a variety of GPU
-hardware. 
-</UL>
-<P>Here is a quick overview of how to use the GPU package:
-</P>
-<UL><LI>build the library in lib/gpu for your GPU hardware wity desired precision
-<LI>include the GPU package and build LAMMPS
-<LI>use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU
-<LI>specify the # of GPUs per node
-<LI>use GPU styles in your input script 
-</UL>
-<P>The latter two steps can be done using the "-pk gpu" and "-sf gpu"
-<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the <A HREF = "package.html">package gpu</A> or <A HREF = "suffix.html">suffix gpu</A> commands
-respectively to your input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>To use this package, you currently need to have an NVIDIA GPU and
-install the NVIDIA Cuda software on your system:
-</P>
-<UL><LI>Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
-<LI>Go to http://www.nvidia.com/object/cuda_get.html
-<LI>Install a driver and toolkit appropriate for your system (SDK is not necessary)
-<LI>Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties 
-</UL>
-<P><B>Building LAMMPS with the GPU package:</B>
-</P>
-<P>This requires two steps (a,b): build the GPU library, then build
-LAMMPS with the GPU package.
-</P>
-<P>(a) Build the GPU library
-</P>
-<P>The GPU library is in lammps/lib/gpu.  Select a Makefile.machine (in
-lib/gpu) appropriate for your system.  You should pay special
-attention to 3 settings in this makefile.
-</P>
-<UL><LI>CUDA_HOME = needs to be where NVIDIA Cuda software is installed on your system
-<LI>CUDA_ARCH = needs to be appropriate to your GPUs
-<LI>CUDA_PREC = precision (double, mixed, single) you desire 
-</UL>
-<P>See lib/gpu/Makefile.linux.double for examples of the ARCH settings
-for different GPU choices, e.g. Fermi vs Kepler.  It also lists the
-possible precision settings:
-</P>
-<PRE>CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
-CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
-CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double 
-</PRE>
-<P>The last setting is the mixed mode referred to above.  Note that your
-GPU must support double precision to use either the 2nd or 3rd of
-these settings.
-</P>
-<P>To build the library, type:
-</P>
-<PRE>make -f Makefile.machine 
-</PRE>
-<P>If successful, it will produce the files libgpu.a and Makefile.lammps.
-</P>
-<P>The latter file has 3 settings that need to be appropriate for the
-paths and settings for the CUDA system software on your machine.
-Makefile.lammps is a copy of the file specified by the EXTRAMAKE
-setting in Makefile.machine.  You can change EXTRAMAKE or create your
-own Makefile.lammps.machine if needed.
-</P>
-<P>Note that to change the precision of the GPU library, you need to
-re-build the entire library.  Do a "clean" first, e.g. "make -f
-Makefile.linux clean", followed by the make command above.
-</P>
-<P>(b) Build LAMMPS with the GPU package
-</P>
-<PRE>cd lammps/src
-make yes-gpu
-make machine 
-</PRE>
-<P>No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-</P>
-<P>Note that if you change the GPU library precision (discussed above)
-and rebuild the GPU library, then you also need to re-install the GPU
-package and re-build LAMMPS, so that all affected files are
-re-compiled and linked to the new GPU library.
-</P>
-<P><B>Run with the GPU package from the command line:</B>
-</P>
-<P>The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-</P>
-<P>When using the GPU package, you cannot assign more than one GPU to a
-single MPI task.  However multiple MPI tasks can share the same GPU,
-and in many cases it will be more efficient to run this way.  Likewise
-it may be more efficient to use fewer MPI tasks/node than the available
-# of CPU cores.  Assignment of multiple MPI tasks to a GPU will happen
-automatically if you create more MPI tasks/node than there are
-GPUs/node.  E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
-shared by 4 MPI tasks.
-</P>
-<P>Use the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "gpu" to styles that support it.  Use
-the "-pk gpu Ng" <A HREF = "Section_start.html#start_7">command-line switch</A> to
-set Ng = # of GPUs/node to use.
-</P>
-<PRE>lmp_machine -sf gpu -pk gpu 1 -in in.script                         # 1 MPI task uses 1 GPU
-mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script           # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
-mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script   # ditto on 4 16-core nodes 
-</PRE>
-<P>Note that if the "-sf gpu" switch is used, it also issues a default
-<A HREF = "package.html">package gpu 1</A> command, which sets the number of
-GPUs/node to use to 1.
-</P>
-<P>Using the "-pk" switch explicitly allows for direct setting of the
-number of GPUs/node to use and additional options.  Its syntax is the
-same as the "package gpu" command.  See the
-<A HREF = "package.html">package</A> command doc page for details, including the
-default values used for all its options if it is not specified.
-</P>
-<P><B>Or run with the GPU package by editing an input script:</B>
-</P>
-<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and use of multiple MPI tasks/GPU is the same.
-</P>
-<P>Use the <A HREF = "suffix.html">suffix gpu</A> command, or you can explicitly add a
-"gpu" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/gpu 2.5 
-</PRE>
-<P>You must also use the <A HREF = "package.html">package gpu</A> command to enable the
-GPU package, unless the "-sf gpu" or "-pk gpu" <A HREF = "Section_start.html#start_7">command-line
-switches</A> were used.  It specifies the
-number of GPUs/node to use, as well as other options.
-</P>
-<P>IMPORTANT NOTE: The input script must also use a newton pairwise
-setting of <I>off</I> in order to use GPU package pair styles.  This can be
-set via the <A HREF = "package.html">package gpu</A> or <A HREF = "newton.html">newton</A>
-commands.
-</P>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>The performance of a GPU versus a multi-core CPU is a function of your
-hardware, which pair style is used, the number of atoms/GPU, and the
-precision used on the GPU (double, single, mixed).
-</P>
-<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
-LAMMPS web site for performance of the GPU package on various
-hardware, including the Titan HPC platform at ORNL.
-</P>
-<P>You should also experiment with how many MPI tasks per GPU to use to
-give the best performance for your problem and machine.  This is also
-a function of the problem size and the pair style being used.
-Likewise, you should experiment with the precision setting for the GPU
-library to see if single or mixed precision will give accurate
-results, since they will typically be faster.
-</P>
-<P><B>Guidelines for best performance:</B>
-</P>
-<UL><LI>Using multiple MPI tasks per GPU will often give the best performance,
-as allowed by most multi-core CPU/GPU configurations. 
-
-<LI>If the number of particles per MPI task is small (e.g. 100s of
-particles), it can be more efficient to run with fewer MPI tasks per
-GPU, even if you do not use all the cores on the compute node. 
-
-<LI>The <A HREF = "package.html">package gpu</A> command has several options for tuning
-performance.  Neighbor lists can be built on the GPU or CPU.  Force
-calculations can be dynamically balanced across the CPU cores and
-GPUs.  GPU-specific settings can be made which can be optimized
-for different hardware.  See the <A HREF = "package.html">package</A> command
-doc page for details. 
-
-<LI>As described by the <A HREF = "package.html">package gpu</A> command, GPU
-accelerated pair styles can perform computations asynchronously with
-CPU computations. The "Pair" time reported by LAMMPS will be the
-maximum of the time required to complete the CPU pair style
-computations and the time required to complete the GPU pair style
-computations. Any time spent for GPU-enabled pair styles for
-computations that run simultaneously with <A HREF = "bond_style.html">bond</A>,
-<A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>,
-<A HREF = "improper_style.html">improper</A>, and <A HREF = "kspace_style.html">long-range</A>
-calculations will not be included in the "Pair" time. 
-
-<LI>When the <I>mode</I> setting for the package gpu command is force/neigh,
-the time for neighbor list calculations on the GPU will be added into
-the "Pair" time, not the "Neigh" time.  An additional breakdown of the
-times required for various tasks on the GPU (data copy, neighbor
-calculations, force computations, etc) is output only with the LAMMPS
-screen output (not in the log file) at the end of each run.  These
-timings represent total time spent on the GPU for each routine,
-regardless of asynchronous CPU calculations. 
-
-<LI>The output section "GPU Time Info (average)" reports "Max Mem / Proc".
-This is the maximum memory used at one time on the GPU for data
-storage by a single MPI process. 
-</UL>
-<P><B>Restrictions:</B>
-</P>
-<P>None.
-</P>
-<HR>
-
-<H4><A NAME = "acc_7"></A>5.7 USER-CUDA package 
-</H4>
-<P>The USER-CUDA package was developed by Christian Trott (Sandia) while
-at U Technology Ilmenau in Germany.  It provides NVIDIA GPU versions
-of many pair styles, many fixes, a few computes, and for long-range
-Coulombics via the PPPM command.  It has the following general
-features:
-</P>
-<UL><LI>The package is designed to allow an entire LAMMPS calculation, for
-many timesteps, to run entirely on the GPU (except for inter-processor
-MPI communication), so that atom-based data (e.g. coordinates, forces)
-do not have to move back-and-forth between the CPU and GPU. 
-
-<LI>The speed-up advantage of this approach is typically better when the
-number of atoms per GPU is large 
-
-<LI>Data will stay on the GPU until a timestep where a non-USER-CUDA fix
-or compute is invoked.  Whenever a non-GPU operation occurs (fix,
-compute, output), data automatically moves back to the CPU as needed.
-This may incur a performance penalty, but should otherwise work
-transparently. 
-
-<LI>Neighbor lists are constructed on the GPU. 
-
-<LI>The package only supports use of a single MPI task, running on a
-single CPU (core), assigned to each GPU. 
-</UL>
-<P>Here is a quick overview of how to use the USER-CUDA package:
-</P>
-<UL><LI>build the library in lib/cuda for your GPU hardware with desired precision
-<LI>include the USER-CUDA package and build LAMMPS
-<LI>use the mpirun command to specify 1 MPI task per GPU (on each node)
-<LI>enable the USER-CUDA package via the "-c on" command-line switch
-<LI>specify the # of GPUs per node
-<LI>use USER-CUDA styles in your input script 
-</UL>
-<P>The latter two steps can be done using the "-pk cuda" and "-sf cuda"
-<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the <A HREF = "package.html">package cuda</A> or <A HREF = "suffix.html">suffix cuda</A> commands
-respectively to your input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>To use this package, you need to have one or more NVIDIA GPUs and
-install the NVIDIA Cuda software on your system:
-</P>
-<P>Your NVIDIA GPU needs to support Compute Capability 1.3. This list may
-help you to find out the Compute Capability of your card:
-</P>
-<P>http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
-</P>
-<P>Install the Nvidia Cuda Toolkit (version 3.2 or higher) and the
-corresponding GPU drivers.  The Nvidia Cuda SDK is not required, but
-we recommend it also be installed.  You can then make sure its sample
-projects can be compiled without problems.
-</P>
-<P><B>Building LAMMPS with the USER-CUDA package:</B>
-</P>
-<P>This requires two steps (a,b): build the USER-CUDA library, then build
-LAMMPS with the USER-CUDA package.
-</P>
-<P>(a) Build the USER-CUDA library
-</P>
-<P>The USER-CUDA library is in lammps/lib/cuda.  If your <I>CUDA</I> toolkit
-is not installed in the default system directory <I>/usr/local/cuda</I> edit
-the file <I>lib/cuda/Makefile.common</I> accordingly.
-</P>
-<P>To set options for the library build, type "make OPTIONS", where
-<I>OPTIONS</I> are one or more of the following. The settings will be
-written to the <I>lib/cuda/Makefile.defaults</I> and used when
-the library is built.
-</P>
-<PRE><I>precision=N</I> to set the precision level
-  N = 1 for single precision (default)
-  N = 2 for double precision
-  N = 3 for positions in double precision
-  N = 4 for positions and velocities in double precision
-<I>arch=M</I> to set GPU compute capability
-  M = 35 for Kepler GPUs
-  M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default)
-  M = 21 for CC2.1 (GF104/114,  e.g. GTX560, GTX460, GTX450)
-  M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
-<I>prec_timer=0/1</I> to use hi-precision timers
-  0 = do not use them (default)
-  1 = use them
-  this is usually only useful for Mac machines 
-<I>dbg=0/1</I> to activate debug mode
-  0 = no debug mode (default)
-  1 = yes debug mode
-  this is only useful for developers
-<I>cufft=1</I> for use of the CUDA FFT library
-  0 = no CUFFT support (default)
-  in the future other CUDA-enabled FFT libraries might be supported 
-</PRE>
-<P>To build the library, simply type:
-</P>
-<PRE>make 
-</PRE>
-<P>If successful, it will produce the files libcuda.a and Makefile.lammps.
-</P>
-<P>Note that if you change any of the options (like precision), you need
-to re-build the entire library.  Do a "make clean" first, followed by
-"make".
-</P>
-<P>(b) Build LAMMPS with the USER-CUDA package
-</P>
-<PRE>cd lammps/src
-make yes-user-cuda
-make machine 
-</PRE>
-<P>No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-</P>
-<P>Note that if you change the USER-CUDA library precision (discussed
-above) and rebuild the USER-CUDA library, then you also need to
-re-install the USER-CUDA package and re-build LAMMPS, so that all
-affected files are re-compiled and linked to the new USER-CUDA
-library.
-</P>
-<P><B>Run with the USER-CUDA package from the command line:</B>
-</P>
-<P>The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-</P>
-<P>When using the USER-CUDA package, you must use exactly one MPI task
-per physical GPU.
-</P>
-<P>You must use the "-c on" <A HREF = "Section_start.html#start_7">command-line
-switch</A> to enable the USER-CUDA package.
-</P>
-<P>Use the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "cuda" to styles that support it.  Use
-the "-pk cuda Ng" <A HREF = "Section_start.html#start_7">command-line switch</A> to
-set Ng = # of GPUs per node.
-</P>
-<PRE>lmp_machine -c on -sf cuda -pk cuda 1 -in in.script                       # 1 MPI task uses 1 GPU
-mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script          # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node
-mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script  # ditto on 12 16-core nodes 
-</PRE>
-<P>The "-pk" switch must be used (unless the <A HREF = "package.html">package cuda</A>
-command is used in the input script) to set the number of GPUs/node to
-use.  It also allows for setting of additional options.  Its syntax is
-the same as the "package cuda" command.  See the
-<A HREF = "package.html">package</A> command doc page for details.
-</P>
-<P><B>Or run with the USER-CUDA package by editing an input script:</B>
-</P>
-<P>The discussion above for the mpirun/mpiexec command and the requirement
-of one MPI task per GPU is the same.
-</P>
-<P>You must still use the "-c on" <A HREF = "Section_start.html#start_7">command-line
-switch</A> to enable the USER-CUDA package.
-</P>
-<P>Use the <A HREF = "suffix.html">suffix cuda</A> command, or you can explicitly add a
-"cuda" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/cuda 2.5 
-</PRE>
-<P>You must use the <A HREF = "package.html">package cuda</A> command to set the
-number of GPUs/node, unless the "-pk" <A HREF = "Section_start.html#start_7">command-line
-switch</A> was used.  The command also
-allows for setting of additional options.
-</P>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>The performance of a GPU versus a multi-core CPU is a function of your
-hardware, which pair style is used, the number of atoms/GPU, and the
-precision used on the GPU (double, single, mixed).
-</P>
-<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
-LAMMPS web site for performance of the USER-CUDA package on different
-hardware.
-</P>
-<P><B>Guidelines for best performance:</B>
-</P>
-<UL><LI>The USER-CUDA package offers more speed-up relative to CPU performance
-when the number of atoms per GPU is large, e.g. on the order of tens
-or hundreds of 1000s. 
-
-<LI>As noted above, this package will continue to run a simulation
-entirely on the GPU(s) (except for inter-processor MPI communication),
-for multiple timesteps, until a CPU calculation is required, either by
-a fix or compute that is non-GPU-ized, or until output is performed
-(thermo or dump snapshot or restart file).  The less often this
-occurs, the faster your simulation will run. 
-</UL>
-<P><B>Restrictions:</B>
-</P>
-<P>None.
-</P>
-<HR>
-
-<H4><A NAME = "acc_8"></A>5.8 KOKKOS package 
-</H4>
-<P>The KOKKOS package was developed primarily by Christian Trott
-(Sandia) with contributions of various styles by others, including
-Sikandar Mashayak (UIUC).  The underlying Kokkos library was written
-primarily by Carter Edwards, Christian Trott, and Dan Sunderland (all
-Sandia).
-</P>
-<P>The KOKKOS package contains versions of pair, fix, and atom styles
-that use data structures and macros provided by the Kokkos library,
-which is included with LAMMPS in lib/kokkos.
-</P>
-<P>The Kokkos library is part of
-<A HREF = "http://trilinos.sandia.gov/packages/kokkos">Trilinos</A> and is a
-templated C++ library that provides two key abstractions for an
-application like LAMMPS.  First, it allows a single implementation of
-an application kernel (e.g. a pair style) to run efficiently on
-different kinds of hardware, such as a GPU, Intel Phi, or many-core
-chip.
-</P>
-<P>The Kokkos library also provides data abstractions to adjust (at
-compile time) the memory layout of basic data structures like 2d and
-3d arrays and allow the transparent utilization of special hardware
-load and store operations.  Such data structures are used in LAMMPS to
-store atom coordinates or forces or neighbor lists.  The layout is
-chosen to optimize performance on different platforms.  Again this
-functionality is hidden from the developer, and does not affect how
-the kernel is coded.
-</P>
-<P>These abstractions are set at build time, when LAMMPS is compiled with
-the KOKKOS package installed.  This is done by selecting a "host" and
-"device" to build for, compatible with the compute nodes in your
-machine (one on a desktop machine or 1000s on a supercomputer).
-</P>
-<P>All Kokkos operations occur within the context of an individual MPI
-task running on a single node of the machine.  The total number of MPI
-tasks used by LAMMPS (one or multiple per compute node) is set in the
-usual manner via the mpirun or mpiexec commands, and is independent of
-Kokkos.
-</P>
-<P>Kokkos provides support for two different modes of execution per MPI
-task.  This means that computational tasks (pairwise interactions,
-neighbor list builds, time integration, etc) can be parallelized for
-one or the other of the two modes.  The first mode is called the
-"host" and is one or more threads running on one or more physical CPUs
-(within the node).  Currently, both multi-core CPUs and an Intel Phi
-processor (running in native mode, not offload mode like the
-USER-INTEL package) are supported.  The second mode is called the
-"device" and is an accelerator chip of some kind.  Currently only an
-NVIDIA GPU is supported.  If your compute node does not have a GPU,
-then there is only one mode of execution, i.e. the host and device are
-the same.
-</P>
-<P>Here is a quick overview of how to use the KOKKOS package
-for GPU acceleration:
-</P>
-<UL><LI>specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support
-<LI>include the KOKKOS package and build LAMMPS
-<LI>enable the KOKKOS package and its hardware options via the "-k on" command-line switch
-<LI>use KOKKOS styles in your input script 
-</UL>
-<P>The latter two steps can be done using the "-k on", "-pk kokkos" and
-"-sf kk" <A HREF = "Section_start.html#start_7">command-line switches</A>
-respectively.  Or the effect of the "-pk" or "-sf" switches can be
-duplicated by adding the <A HREF = "package.html">package kokkos</A> or <A HREF = "suffix.html">suffix
-kk</A> commands respectively to your input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>The KOKKOS package can be used to build and run LAMMPS on the
-following kinds of hardware:
-</P>
-<UL><LI>CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles)
-<LI>CPU-only: one or a few MPI tasks per node with additional threading via OpenMP
-<LI>Phi: on one or more Intel Phi coprocessors (per node)
-<LI>GPU: on the GPUs of a node with additional OpenMP threading on the CPUs 
-</UL>
-<P>Note that Intel Xeon Phi coprocessors are supported in "native" mode,
-not "offload" mode like the USER-INTEL package supports.
-</P>
-<P>Only NVIDIA GPUs are currently supported.
-</P>
-<P>IMPORTANT NOTE: For good performance of the KOKKOS package on GPUs,
-you must have Kepler generation GPUs (or later).  The Kokkos library
-exploits texture cache options not supported by Tesla generation GPUs
-(or older).
-</P>
-<P>To build the KOKKOS package for GPUs, NVIDIA Cuda software must be
-installed on your system.  See the discussion above for the USER-CUDA
-and GPU packages for details of how to check and do this.
-</P>
-<P><B>Building LAMMPS with the KOKKOS package:</B>
-</P>
-<P>Unlike other acceleration packages discussed in this section, the
-Kokkos library in lib/kokkos does not have to be pre-built before
-building LAMMPS itself.  Instead, options for the Kokkos library are
-specified at compile time, when LAMMPS itself is built.  This can be
-done in one of two ways, as discussed below.
-</P>
-<P>Here are examples of how to build LAMMPS for the different compute-node
-configurations listed above.
-</P>
-<P>CPU-only (run all-MPI or with OpenMP threading):
-</P>
-<PRE>cd lammps/src
-make yes-kokkos
-make g++ OMP=yes 
-</PRE>
-<P>Intel Xeon Phi:
-</P>
-<PRE>cd lammps/src
-make yes-kokkos
-make g++ OMP=yes MIC=yes 
-</PRE>
-<P>CPUs and GPUs:
-</P>
-<PRE>cd lammps/src
-make yes-kokkos
-make cuda CUDA=yes 
-</PRE>
-<P>These examples set the KOKKOS-specific OMP, MIC, CUDA variables on the
-make command line which requires a GNU-compatible make command.  Try
-"gmake" if your system's standard make complains.  
-</P>
-<P>IMPORTANT NOTE: If you build using make line variables and re-build
-LAMMPS twice with different KOKKOS options and the *same* target,
-e.g. g++ in the first two examples above, then you *must* perform a
-"make clean-all" or "make clean-machine" before each build.  This is
-to force all the KOKKOS-dependent files to be re-compiled with the new
-options.
-</P>
-<P>You can also hardwire these make variables in the specified machine
-makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above,
-with a line like:
-</P>
-<PRE>MIC = yes 
-</PRE>
-<P>Note that if you build LAMMPS multiple times in this manner, using
-different KOKKOS options (defined in different machine makefiles), you
-do not have to worry about doing a "clean" in between.  This is
-because the targets will be different.
-</P>
-<P>IMPORTANT NOTE: The 3rd example above for a GPU, uses a different
-machine makefile, in this case src/MAKE/Makefile.cuda, which is
-included in the LAMMPS distribution.  To build the KOKKOS package for
-a GPU, this makefile must use the NVIDIA "nvcc" compiler.  And it must
-have a CCFLAGS -arch setting that is appropriate for your NVIDIA
-hardware and installed software.  Typical values for -arch are given
-in <A HREF = "Section_start.html#start_3_4">Section 2.3.4</A> of the manual, as well
-as other settings that must be included in the machine makefile, if
-you create your own.
-</P>
-<P>There are other allowed options when building with the KOKKOS package.
-As above, they can be set either as variables on the make command line
-or in the machine makefile in the src/MAKE directory.  See <A HREF = "Section_start.html#start_3_4">Section
-2.3.4</A> of the manual for details.
-</P>
-<P>IMPORTANT NOTE: Currently, there are no precision options with the
-KOKKOS package.  All compilation and computation is performed in
-double precision.
-</P>
-<P><B>Run with the KOKKOS package from the command line:</B>
-</P>
-<P>The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-</P>
-<P>When using KOKKOS built with host=OMP, you need to choose how many
-OpenMP threads per MPI task will be used (via the "-k" command-line
-switch discussed below).  Note that the product of MPI tasks * OpenMP
-threads/task should not exceed the physical number of cores (on a
-node), otherwise performance will suffer.
-</P>
-<P>When using the KOKKOS package built with device=CUDA, you must use
-exactly one MPI task per physical GPU.
-</P>
-<P>When using the KOKKOS package built with host=MIC for Intel Xeon Phi
-coprocessor support you need to insure there are one or more MPI tasks
-per coprocessor, and choose the number of coprocessor threads to use
-per MPI task (via the "-k" command-line switch discussed below).  The
-product of MPI tasks * coprocessor threads/task should not exceed the
-maximum number of threads the coprocessor is designed to run,
-otherwise performance will suffer.  This value is 240 for current
-generation Xeon Phi(TM) chips, which is 60 physical cores * 4
-threads/core.  Note that with the KOKKOS package you do not need to
-specify how many Phi coprocessors there are per node; each
-coprocessor is simply treated as running some number of MPI tasks.
-</P>
-<P>You must use the "-k on" <A HREF = "Section_start.html#start_7">command-line
-switch</A> to enable the KOKKOS package.  It
-takes additional arguments for hardware settings appropriate to your
-system.  Those arguments are <A HREF = "Section_start.html#start_7">documented
-here</A>.  The two most commonly used arguments
-are:
-</P>
-<PRE>-k on t Nt
--k on g Ng 
-</PRE>
-<P>The "t Nt" option applies to host=OMP (even if device=CUDA) and
-host=MIC.  For host=OMP, it specifies how many OpenMP threads per MPI
-task to use within a node.  For host=MIC, it specifies how many Xeon Phi
-threads per MPI task to use within a node.  The default is Nt = 1.
-Note that for host=OMP this is effectively MPI-only mode which may be
-fine.  But for host=MIC you will typically end up using far less than
-all the 240 available threads, which could give very poor performance.
-</P>
-<P>The "g Ng" option applies to device=CUDA.  It specifies how many GPUs
-per compute node to use.  The default is 1, so this only needs to be
-specified if you have 2 or more GPUs per compute node.
-</P>
-<P>The "-k on" switch also issues a default <A HREF = "package.html">package kokkos neigh full
-comm host</A> command which sets various KOKKOS options to
-default values, as discussed on the <A HREF = "package.html">package</A> command doc
-page.
-</P>
-<P>Use the "-sf kk" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "kk" to styles that support it.  Use
-the "-pk kokkos" <A HREF = "Section_start.html#start_7">command-line switch</A> if
-you wish to override any of the default values set by the <A HREF = "package.html">package
-kokkos</A> command invoked by the "-k on" switch.
-</P>
-<PRE>host=OMP, dual hex-core nodes (12 threads/node):
-mpirun -np 12 lmp_g++ -in in.lj                           # MPI-only mode with no Kokkos
-mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj              # MPI-only mode with Kokkos
-mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj          # one MPI task, 12 threads
-mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj           # two MPI tasks, 6 threads/task 
-mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj   # ditto on 16 nodes 
-</PRE>
-<PRE>host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):
-mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj           # 1 MPI task on 1 Phi, 1*240 = 240
-mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj            # 30 MPI tasks on 1 Phi, 30*8 = 240
-mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj           # 12 MPI tasks on 1 Phi, 12*20 = 240
-mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj   # ditto on 8 Phis 
-</PRE>
-<PRE>host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:
-mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj          # one MPI task, 6 threads on CPU
-mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj   # ditto on 4 nodes 
-</PRE>
-<PRE>host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs:
-mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj           # two MPI tasks, 8 threads per CPU
-mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # ditto on 16 nodes 
-</PRE>
-<P><B>Or run with the KOKKOS package by editing an input script:</B>
-</P>
-<P>The discussion above for the mpirun/mpiexec command and setting
-appropriate thread and GPU values for host=OMP or host=MIC or
-device=CUDA are the same.
-</P>
-<P>You must still use the "-k on" <A HREF = "Section_start.html#start_7">command-line
-switch</A> to enable the KOKKOS package, and
-specify its additional arguments for hardware options appropriate to
-your system, as documented above.
-</P>
-<P>Use the <A HREF = "suffix.html">suffix kk</A> command, or you can explicitly add a
-"kk" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/kk 2.5 
-</PRE>
-<P>You only need to use the <A HREF = "package.html">package kokkos</A> command if you
-wish to change any of its option defaults.
-</P>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>The performance of KOKKOS running in different modes is a function of
-your hardware, which KOKKOS-enabled styles are used, and the problem
-size.
-</P>
-<P>Generally speaking, the following rules of thumb apply:
-</P>
-<UL><LI>When running on CPUs only, with a single thread per MPI task,
-performance of a KOKKOS style is somewhere between the standard
-(un-accelerated) styles (MPI-only mode), and those provided by the
-USER-OMP package.  However the difference between all 3 is small (less
-than 20%). 
-
-<LI>When running on CPUs only, with multiple threads per MPI task,
-performance of a KOKKOS style is a bit slower than the USER-OMP
-package. 
-
-<LI>When running on GPUs, KOKKOS is typically faster than the USER-CUDA
-and GPU packages. 
-
-<LI>When running on Intel Xeon Phi, KOKKOS is not as fast as
-the USER-INTEL package, which is optimized for that hardware. 
-</UL>
-<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
-LAMMPS web site for performance of the KOKKOS package on different
-hardware.
-</P>
-<P><B>Guidelines for best performance:</B>
-</P>
-<P>Here are guidelines for using the KOKKOS package on the different
-hardware configurations listed above.
-</P>
-<P>Many of the guidelines use the <A HREF = "package.html">package kokkos</A> command.
-See its doc page for details and default settings.  Experimenting with
-its options can provide a speed-up for specific calculations.
-</P>
-<P><B>Running on a multi-core CPU:</B>
-</P>
-<P>If N is the number of physical cores/node, then the number of MPI
-tasks/node * number of threads/task should not exceed N, and should
-typically equal N.  Note that the default threads/task is 1, as set by
-the "t" keyword of the "-k" <A HREF = "Section_start.html#start_7">command-line
-switch</A>.  If you do not change this, no
-additional parallelism (beyond MPI) will be invoked on the host
-CPU(s).
-</P>
-<P>You can compare the performance running in different modes:
-</P>
-<UL><LI>run with 1 MPI task/node and N threads/task
-<LI>run with N MPI tasks/node and 1 thread/task
-<LI>run with settings in between these extremes 
-</UL>
-<P>Examples of mpirun commands in these modes are shown above.
-</P>
-<P>When using KOKKOS to perform multi-threading, it is important for
-performance to bind both MPI tasks to physical cores, and threads to
-physical cores, so they do not migrate during a simulation.
-</P>
-<P>If you are not certain MPI tasks are being bound (check the defaults
-for your MPI installation), binding can be forced with these flags:
-</P>
-<PRE>OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
-Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... 
-</PRE>
-<P>For binding threads with the KOKKOS OMP option, use thread affinity
-environment variables to force binding.  With OpenMP 3.1 (gcc 4.7 or
-later, intel 12 or later) setting the environment variable
-OMP_PROC_BIND=true should be sufficient.  For binding threads with the
-KOKKOS pthreads option, compile LAMMPS with the KOKKOS HWLOC=yes option, as
-discussed in <A HREF = "Section_start.html#start_3_4">Section 2.3.4</A> of the
-manual.
-</P>
-<P><B>Running on GPUs:</B>
-</P>
-<P>Insure the -arch setting in the machine makefile you are using,
-e.g. src/MAKE/Makefile.cuda, is correct for your GPU hardware/software
-(see <A HREF = "Section_start.html#start_3_4">this section</A> of the manual for
-details).
-</P>
-<P>The -np setting of the mpirun command should set the number of MPI
-tasks/node to be equal to the # of physical GPUs on the node. 
-</P>
-<P>Use the "-k" <A HREF = "Section_start.html#start_7">command-line switch</A> to
-specify the number of GPUs per node, and the number of threads per MPI
-task.  As above for multi-core CPUs (and no GPU), if N is the number
-of physical cores/node, then the number of MPI tasks/node * number of
-threads/task should not exceed N.  With one GPU (and one MPI task) it
-may be faster to use less than all the available cores, by setting
-threads/task to a smaller value.  This is because using all the cores
-on a dual-socket node will incur extra cost to copy memory from the
-2nd socket to the GPU.
-</P>
-<P>Examples of mpirun commands that follow these rules are shown above.
-</P>
-<P>IMPORTANT NOTE: When using a GPU, you will achieve the best
-performance if your input script does not use any fix or compute
-styles which are not yet Kokkos-enabled.  This allows data to stay on
-the GPU for multiple timesteps, without being copied back to the host
-CPU.  Invoking a non-Kokkos fix or compute, or performing I/O for
-<A HREF = "thermo_style.html">thermo</A> or <A HREF = "dump.html">dump</A> output will cause data
-to be copied back to the CPU.
-</P>
-<P>You cannot yet assign multiple MPI tasks to the same GPU with the
-KOKKOS package.  We plan to support this in the future, similar to the
-GPU package in LAMMPS.
-</P>
-<P>You cannot yet use both the host (multi-threaded) and device (GPU)
-together to compute pairwise interactions with the KOKKOS package.  We
-hope to support this in the future, similar to the GPU package in
-LAMMPS.
-</P>
-<P><B>Running on an Intel Phi:</B>
-</P>
-<P>Kokkos only uses Intel Phi processors in their "native" mode, i.e.
-not hosted by a CPU.
-</P>
-<P>As illustrated above, build LAMMPS with OMP=yes (the default) and
-MIC=yes.  The latter insures code is correctly compiled for the Intel
-Phi.  The OMP setting means OpenMP will be used for parallelization on
-the Phi, which is currently the best option within Kokkos.  In the
-future, other options may be added.
-</P>
-<P>Current-generation Intel Phi chips have either 61 or 57 cores.  One
-core should be excluded for running the OS, leaving 60 or 56 cores.
-Each core is hyperthreaded, so there are effectively N = 240 (4*60) or
-N = 224 (4*56) cores to run on.
-</P>
-<P>The -np setting of the mpirun command sets the number of MPI
-tasks/node.  The "-k on t Nt" command-line switch sets the number of
-threads/task as Nt.  The product of these 2 values should be N, i.e.
-240 or 224.  Also, the number of threads/task should be a multiple of
-4 so that logical threads from more than one MPI task do not run on
-the same physical core.
-</P>
-<P>Examples of mpirun commands that follow these rules are shown above.
-</P>
-<P><B>Restrictions:</B>
-</P>
-<P>As noted above, if using GPUs, the number of MPI tasks per compute
-node should equal to the number of GPUs per compute node.  In the
-future Kokkos will support assigning multiple MPI tasks to a single
-GPU.
-</P>
-<P>Currently Kokkos does not support AMD GPUs due to limits in the
-available backend programming models.  Specifically, Kokkos requires
-extensive C++ support from the Kernel language.  This is expected to
-change in the future.
-</P>
-<HR>
-
-<H4><A NAME = "acc_9"></A>5.9 USER-INTEL package 
-</H4>
-<P>The USER-INTEL package was developed by Mike Brown at Intel
-Corporation.  It provides a capability to accelerate simulations by
-offloading neighbor list and non-bonded force calculations to Intel(R)
-Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package).
-Additionally, it supports running simulations in single, mixed, or
-double precision with vectorization, even if a coprocessor is not
-present, i.e. on an Intel(R) CPU.  The same C++ code is used for both
-cases.  When offloading to a coprocessor, the routine is run twice,
-once with an offload flag.
-</P>
-<P>The USER-INTEL package can be used in tandem with the USER-OMP
-package.  This is useful when offloading pair style computations to
-coprocessors, so that other styles not supported by the USER-INTEL
-package, e.g. bond, angle, dihedral, improper, and long-range
-electrostatics, can be run simultaneously in threaded mode on CPU
-cores.  Since less MPI tasks than CPU cores will typically be invoked
-when running with coprocessors, this enables the extra cores to be
-utilized for useful computation.
-</P>
-<P>If LAMMPS is built with both the USER-INTEL and USER-OMP packages
-intsalled, this mode of operation is made easier to use, because the
-"-suffix intel" <A HREF = "Section_start.html#start_7">command-line switch</A> or
-the <A HREF = "suffix.html">suffix intel</A> command will both set a second-choice
-suffix to "omp" so that styles from the USER-OMP package will be used
-if available, after first testing if a style from the USER-INTEL
-package is available.
-</P>
-<P>Here is a quick overview of how to use the USER-INTEL package
-for CPU acceleration:
-</P>
-<UL><LI>specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost
-<LI>specify -fopenmp with LINKFLAGS in your Makefile.machine
-<LI>include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS
-<LI>if using the USER-OMP package, specify how many threads per MPI task to use
-<LI>use USER-INTEL styles in your input script 
-</UL>
-<P>Using the USER-INTEL package to offload work to the Intel(R)
-Xeon Phi(TM) coprocessor is the same except for these additional
-steps:
-</P>
-<UL><LI>add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine
-<LI>add the flag -offload to LINKFLAGS in your Makefile.machine
-<LI>specify how many threads per coprocessor to use 
-</UL>
-<P>The latter two steps in the first case and the last step in the
-coprocessor case can be done using the "-pk omp" and "-sf intel" and
-"-pk intel" <A HREF = "Section_start.html#start_7">command-line switches</A>
-respectively.  Or the effect of the "-pk" or "-sf" switches can be
-duplicated by adding the <A HREF = "package.html">package omp</A> or <A HREF = "suffix.html">suffix
-intel</A> or <A HREF = "package.html">package intel</A> commands
-respectively to your input script.
-</P>
-<P><B>Required hardware/software:</B>
-</P>
-<P>To use the offload option, you must have one or more Intel(R) Xeon
-Phi(TM) coprocessors.
-</P>
-<P>Optimizations for vectorization have only been tested with the
-Intel(R) compiler.  Use of other compilers may not result in
-vectorization or give poor performance.
-</P>
-<P>Use of an Intel C++ compiler is reccommended, but not required.  The
-compiler must support the OpenMP interface.
-</P>
-<P><B>Building LAMMPS with the USER-INTEL package:</B>
-</P>
-<P>Include the package(s) and build LAMMPS:  
-</P>
-<PRE>cd lammps/src
-make yes-user-intel
-make yes-user-omp (if desired)
-make machine 
-</PRE>
-<P>If the USER-OMP package is also installed, you can use styles from
-both packages, as described below.
-</P>
-<P>The lo-level src/MAKE/Makefile.machine needs a flag for OpenMP support
-in both the CCFLAGS and LINKFLAGS variables, which is <I>-openmp</I> for
-Intel compilers.  You also need to add -DLAMMPS_MEMALIGN=64 and
--restrict to CCFLAGS.
-</P>
-<P>If you are compiling on the same architecture that will be used for
-the runs, adding the flag <I>-xHost</I> to CCFLAGS will enable
-vectorization with the Intel(R) compiler.
-</P>
-<P>In order to build with support for an Intel(R) coprocessor, the flag
-<I>-offload</I> should be added to the LINKFLAGS line and the flag
--DLMP_INTEL_OFFLOAD should be added to the CCFLAGS line.
-</P>
-<P>Note that the machine makefiles Makefile.intel and
-Makefile.intel_offload are included in the src/MAKE directory with
-options that perform well with the Intel(R) compiler. The latter file
-has support for offload to coprocessors; the former does not.
-</P>
-<P>If using an Intel compiler, it is recommended that Intel(R) Compiler
-2013 SP1 update 1 be used.  Newer versions have some performance
-issues that are being addressed. If using Intel(R) MPI, version 5 or
-higher is recommended.
-</P>
-<P><B>Running with the USER-INTEL package from the command line:</B>
-</P>
-<P>The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-</P>
-<P>If LAMMPS was also built with the USER-OMP package, you need to choose
-how many OpenMP threads per MPI task will be used by the USER-OMP
-package.  Note that the product of MPI tasks * OpenMP threads/task
-should not exceed the physical number of cores (on a node), otherwise
-performance will suffer.
-</P>
-<P>If LAMMPS was built with coprocessor support for the USER-INTEL
-package, you need to specify the number of coprocessor/node and the
-number of threads to use on the coprocessor per MPI task.  Note that
-coprocessor threads (which run on the coprocessor) are totally
-independent from OpenMP threads (which run on the CPU).  The product
-of MPI tasks * coprocessor threads/task should not exceed the maximum
-number of threads the coproprocessor is designed to run, otherwise
-performance will suffer.  This value is 240 for current generation
-Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core.  The
-threads/core value can be set to a smaller value if desired by an
-option on the <A HREF = "package.html">package intel</A> command, in which case the
-maximum number of threads is also reduced.
-</P>
-<P>Use the "-sf intel" <A HREF = "Section_start.html#start_7">command-line switch</A>,
-which will automatically append "intel" to styles that support it.  If
-a style does not support it, a "omp" suffix is tried next.  Use the
-"-pk omp Nt" <A HREF = "Section_start.html#start_7">command-line switch</A>, to set
-Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with
-the USER-OMP package.  Use the "-pk intel Nphi" <A HREF = "Section_start.html#start_7">command-line
-switch</A> to set Nphi = # of Xeon Phi(TM)
-coprocessors/node, if LAMMPS was built with coprocessor support.
-</P>
-<PRE>CPU-only without USER-OMP (but using Intel vectorization on CPU):
-lmp_machine -sf intel -in in.script                 # 1 MPI task
-mpirun -np 32 lmp_machine -sf intel -in in.script   # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) 
-</PRE>
-<PRE>CPU-only with USER-OMP (and Intel vectorization on CPU):
-lmp_machine -sf intel -pk intel 16 0 -in in.script                # 1 MPI task on a 16-core node
-mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script    # 4 MPI tasks each with 4 threads on a single 16-core node
-mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script   # ditto on 8 16-core nodes 
-</PRE>
-<PRE>CPUs + Xeon Phi(TM) coprocessors with USER-OMP:
-lmp_machine -sf intel -pk intel 16 1 -in in.script                                  # 1 MPI task, 240 threads on 1 coprocessor
-mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script            # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, 
-                                                                                    # each MPI task uses 60 threads on 1 coprocessor
-mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script   # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, 
-                                                                                    # each MPI task uses 120 threads on one of 2 coprocessors 
-</PRE>
-<P>Note that if the "-sf intel" switch is used, it also issues two
-default commands: <A HREF = "package.html">package omp 0</A> and <A HREF = "package.html">package intel
-1</A> command.  These set the number of OpenMP threads per
-MPI task via the OMP_NUM_THREADS environment variable, and the number
-of Xeon Phi(TM) coprocessors/node to 1.  The former is ignored if
-LAMMPS was not built with the USER-OMP package.  The latter is ignored
-is LAMMPS was not built with coprocessor support, except for its
-optional precision setting.
-</P>
-<P>Using the "-pk omp" switch explicitly allows for direct setting of the
-number of OpenMP threads per MPI task, and additional options.  Using
-the "-pk intel" switch explicitly allows for direct setting of the
-number of coprocessors/node, and additional options.  The syntax for
-these two switches is the same as the <A HREF = "package.html">package omp</A> and
-<A HREF = "package.html">package intel</A> commands.  See the <A HREF = "package.html">package</A>
-command doc page for details, including the default values used for
-all its options if these switches are not specified, and how to set
-the number of OpenMP threads via the OMP_NUM_THREADS environment
-variable if desired.
-</P>
-<P><B>Or run with the USER-INTEL package by editing an input script:</B>
-</P>
-<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-OpenMP threads per MPI task, and coprocessor threads per MPI task is
-the same.
-</P>
-<P>Use the <A HREF = "suffix.html">suffix intel</A> command, or you can explicitly add an
-"intel" suffix to individual styles in your input script, e.g.
-</P>
-<PRE>pair_style lj/cut/intel 2.5 
-</PRE>
-<P>You must also use the <A HREF = "package.html">package omp</A> command to enable the
-USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf
-intel" or "-pk omp" <A HREF = "Section_start.html#start_7">command-line switches</A>
-were used.  It specifies how many OpenMP threads per MPI task to use,
-as well as other options.  Its doc page explains how to set the number
-of threads via an environment variable if desired.
-</P>
-<P>You must also use the <A HREF = "package.html">package intel</A> command to enable
-coprocessor support within the USER-INTEL package (assuming LAMMPS was
-built with coprocessor support) unless the "-sf intel" or "-pk intel"
-<A HREF = "Section_start.html#start_7">command-line switches</A> were used.  It
-specifies how many coprocessors/node to use, as well as other
-coprocessor options.
-</P>
-<P><B>Speed-ups to expect:</B>
-</P>
-<P>If LAMMPS was not built with coprocessor support when including the
-USER-INTEL package, then acclerated styles will run on the CPU using
-vectorization optimizations and the specified precision.  This may
-give a substantial speed-up for a pair style, particularly if mixed or
-single precision is used.
-</P>
-<P>If LAMMPS was built with coproccesor support, the pair styles will run
-on one or more Intel(R) Xeon Phi(TM) coprocessors (per node).  The
-performance of a Xeon Phi versus a multi-core CPU is a function of
-your hardware, which pair style is used, the number of
-atoms/coprocessor, and the precision used on the coprocessor (double,
-single, mixed).
-</P>
-<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
-LAMMPS web site for performance of the USER-INTEL package on different
-hardware.
-</P>
-<P><B>Guidelines for best performance on an Intel(R) Xeon Phi(TM)
-coprocessor:</B>
-</P>
-<UL><LI>The default for the <A HREF = "package.html">package intel</A> command is to have
-all the MPI tasks on a given compute node use a single Xeon Phi(TM)
-coprocessor.  In general, running with a large number of MPI tasks on
-each node will perform best with offload.  Each MPI task will
-automatically get affinity to a subset of the hardware threads
-available on the coprocessor.  For example, if your card has 61 cores,
-with 60 cores available for offload and 4 hardware threads per core
-(240 total threads), running with 24 MPI tasks per node will cause
-each MPI task to use a subset of 10 threads on the coprocessor.  Fine
-tuning of the number of threads to use per MPI task or the number of
-threads to use per core can be accomplished with keyword settings of
-the <A HREF = "package.html">package intel</A> command. 
-
-<LI>If desired, only a fraction of the pair style computation can be
-offloaded to the coprocessors.  This is accomplished by using the
-<I>balance</I> keyword in the <A HREF = "package.html">package intel</A> command.  A
-balance of 0 runs all calculations on the CPU.  A balance of 1 runs
-all calculations on the coprocessor.  A balance of 0.5 runs half of
-the calculations on the coprocessor.  Setting the balance to -1 (the
-default) will enable dynamic load balancing that continously adjusts
-the fraction of offloaded work throughout the simulation.  This option
-typically produces results within 5 to 10 percent of the optimal fixed
-balance. 
-
-<LI>When using offload with CPU hyperthreading disabled, it may help
-performance to use fewer MPI tasks and OpenMP threads than available
-cores.  This is due to the fact that additional threads are generated
-internally to handle the asynchronous offload tasks. 
-
-<LI>If running short benchmark runs with dynamic load balancing, adding a
-short warm-up run (10-20 steps) will allow the load-balancer to find a
-near-optimal setting that will carry over to additional runs. 
-
-<LI>If pair computations are being offloaded to an Intel(R) Xeon Phi(TM)
-coprocessor, a diagnostic line is printed to the screen (not to the
-log file), during the setup phase of a run, indicating that offload
-mode is being used and indicating the number of coprocessor threads
-per MPI task.  Additionally, an offload timing summary is printed at
-the end of each run.  When offloading, the frequency for <A HREF = "atom_modify.html">atom
-sorting</A> is changed to 1 so that the per-atom data is
-effectively sorted at every rebuild of the neighbor lists. 
-
-<LI>For simulations with long-range electrostatics or bond, angle,
-dihedral, improper calculations, computation and data transfer to the
-coprocessor will run concurrently with computations and MPI
-communications for these calculations on the host CPU.  The USER-INTEL
-package has two modes for deciding which atoms will be handled by the
-coprocessor.  This choice is controlled with the <I>ghost</I> keyword of
-the <A HREF = "package.html">package intel</A> command.  When set to 0, ghost atoms
-(atoms at the borders between MPI tasks) are not offloaded to the
-card.  This allows for overlap of MPI communication of forces with
-computation on the coprocessor when the <A HREF = "newton.html">newton</A> setting
-is "on".  The default is dependent on the style being used, however,
-better performance may be achieved by setting this option
-explictly. 
-</UL>
-<P><B>Restrictions:</B>
-</P>
-<P>When offloading to a coprocessor, <A HREF = "pair_hybrid.html">hybrid</A> styles
-that require skip lists for neighbor builds cannot be offloaded.
-Using <A HREF = "pair_hybrid.html">hybrid/overlay</A> is allowed.  Only one intel
-accelerated style may be used with hybrid styles.
-<A HREF = "special_bonds.html">Special_bonds</A> exclusion lists are not currently
-supported with offload, however, the same effect can often be
-accomplished by setting cutoffs for excluded atom types to 0.  None of
-the pair styles in the USER-INTEL package currently support the
-"inner", "middle", "outer" options for rRESPA integration via the
-<A HREF = "run_style.html">run_style respa</A> command; only the "pair" option is
-supported.
-</P>
-<HR>
-
-<H4><A NAME = "acc_10"></A>5.10 Comparison of GPU and USER-CUDA and KOKKOS packages 
-</H4>
 <P>All 3 of these packages accelerate a LAMMPS calculation using NVIDIA
 hardware, but they do it in different ways.
 </P>
-<P>NOTE: this section still needs to be re-worked with additional KOKKOS
-information.
-</P>
 <P>As a consequence, for a particular simulation on specific hardware,
 one package may be faster than the other.  We give guidelines below,
 but the best way to determine which package is faster for your input
diff --git a/doc/Section_accelerate.txt b/doc/Section_accelerate.txt
index 81b646b931..b99c2da7c3 100644
--- a/doc/Section_accelerate.txt
+++ b/doc/Section_accelerate.txt
@@ -14,22 +14,29 @@ This section describes various methods for improving LAMMPS
 performance for different classes of problems running on different
 kinds of machines.
 
-5.1 "Measuring performance"_#acc_1
-5.2 "General strategies"_#acc_2
-5.3 "Packages with optimized styles"_#acc_3
-5.4 "OPT package"_#acc_4
-5.5 "USER-OMP package"_#acc_5
-5.6 "GPU package"_#acc_6
-5.7 "USER-CUDA package"_#acc_7
-5.8 "KOKKOS package"_#acc_8
-5.9 "USER-INTEL package"_#acc_9
-5.10 "Comparison of USER-CUDA, GPU, and KOKKOS packages"_#acc_10 :all(b)
+There are two thrusts to the discussion that follows.  The
+first is to use code options that implement alternate algorithms
+that can speed up a simulation.  The second is to use one
+of the several accelerator packages provided with LAMMPS that
+contain code optimized for certain kinds of hardware, including
+multi-core CPUs, GPUs, and Intel Xeon Phi coprocessors.
+
+5.1 "Measuring performance"_#acc_1 :ulb,l
+5.2 "Algorithms and code options to boost performance"_#acc_2 :l
+5.3 "Accelerator packages with optimized styles"_#acc_3 :l
+    5.3.1 "USER-CUDA package"_accelerate_cuda.html :ulb,l
+    5.3.2 "GPU package"_accelerate_gpu.html :l
+    5.3.3 "USER-INTEL package"_accelerate_intel.html :l
+    5.3.4 "KOKKOS package"_accelerate_kokkos.html :l
+    5.3.5 "USER-OMP package"_accelerate_omp.html :l
+    5.3.6 "OPT package"_accelerate_opt.html :l,ule
+5.4 "Comparison of various accelerator packages"_#acc_4 :l,ule
 
 The "Benchmark page"_http://lammps.sandia.gov/bench.html of the LAMMPS
 web site gives performance results for the various accelerator
-packages discussed in this section for several of the standard LAMMPS
-benchmarks, as a function of problem size and number of compute nodes,
-on different hardware platforms.
+packages discussed in Section 5.3, for several of the standard LAMMPS
+benchmark problems, as a function of problem size and number of
+compute nodes, on different hardware platforms.
 
 :line
 :line
@@ -100,11 +107,9 @@ rRESPA
 Staggered PPPM
 single vs double PPPM
 partial charge PPPM
-verlet/split
-processor mapping via processors numa command
-load-balancing: balance and fix balance
-processor command for layout
-OMP when lots of cores :ul
+verlet/split run style
+processor command for proc layout and numa layout
+load-balancing: balance and fix balance :ul
 
 2-FFT PPPM, also called {analytic differentiation} or {ad} PPPM, uses
 2 FFTs instead of the 4 FFTs used by the default {ik differentiation}
@@ -142,26 +147,28 @@ Accelerated versions of various "pair_style"_pair_style.html,
 "fixes"_fix.html, "computes"_compute.html, and other commands have
 been added to LAMMPS, which will typically run faster than the
 standard non-accelerated versions.  Some require appropriate hardware
-on your system, e.g. GPUs or Intel Xeon Phi chips.
+to be present on your system, e.g. GPUs or Intel Xeon Phi
+coprocessors.
 
-All of these commands are in packages provided with LAMMPS, as
-explained "here"_Section_packages.html.  Currently, there are 6 such
-accelerator packages in LAMMPS, either as standard or user packages:
+All of these commands are in packages provided with LAMMPS.  An
+overview of packages is given in "Section
+packages"_Section_packages.html.  Currently, there are 6 accelerator
+packages in LAMMPS, either as standard or user packages:
 
-"USER-CUDA"_#acc_7 : for NVIDIA GPUs
-"GPU"_acc_6 : for NVIDIA GPUs as well as OpenCL support
-"USER-INTEL"_acc_9 : for Intel CPUs and Intel Xeon Phi
-"KOKKOS"_acc_8 : for GPUs, Intel Xeon Phi, and OpenMP threading
-"USER-OMP"_acc_5 : for OpenMP threading
-"OPT"_acc_4 : generic CPU optimizations :tb(s=:)
+"USER-CUDA"_accelerate_cuda.html : for NVIDIA GPUs
+"GPU"_accelerate_gpu.html : for NVIDIA GPUs as well as OpenCL support
+"USER-INTEL"_accelerate_intel.html : for Intel CPUs and Intel Xeon Phi
+"KOKKOS"_accelerate_kokkos.html : for GPUs, Intel Xeon Phi, and OpenMP threading
+"USER-OMP"_accelerate_omp.html : for OpenMP threading
+"OPT"_accelerate_opt.html : generic CPU optimizations :tb(s=:)
 
 Any accelerated style has the same name as the corresponding standard
 style, except that a suffix is appended.  Otherwise, the syntax for
-the command that specifies the style is identical, their functionality
-is the same, and the numerical results it produces should also be the
+the command that uses the style is identical, its functionality is
+the same, and the numerical results it produces should also be the
 same, except for precision and round-off effects.
 
-For example, all of these styles are variants of the basic
+For example, all of these styles are accelerated variants of the
 Lennard-Jones "pair_style lj/cut"_pair_lj.html:
 
 "pair_style lj/cut/cuda"_pair_lj.html
@@ -171,25 +178,51 @@ Lennard-Jones "pair_style lj/cut"_pair_lj.html:
 "pair_style lj/cut/omp"_pair_lj.html
 "pair_style lj/cut/opt"_pair_lj.html :ul
 
-Assuming LAMMPS was built with the appropriate package, a simulation
-using accelerated styles from the package can be run without modifying
-your input script, by specifying "command-line
-switches"_Section_start.html#start_7.  The details of how to do this
-vary from package to package and are explained below.  There is also a
-"suffix"_suffix.html command and a "package"_package.html command that
-accomplish the same thing and can be used within an input script if
-preferred.  The "suffix"_suffix.html command allows more precise
-control of whether an accelerated or unaccelerated version of a style
-is used at various points within an input script.
+To see what accelerated styles are currently available, see
+"Section_commands 5"_Section_commands.html#cmd_5 of the manual.  The
+doc pages for individual commands (e.g. "pair lj/cut"_pair_lj.html or
+"fix nve"_fix_nve.html) also list any accelerated variants available
+for that style.
 
-To see what styles are currently available in each of the accelerated
-packages, see "Section_commands 5"_Section_commands.html#cmd_5 of the
-manual.  The doc page for individual commands (e.g. "pair
-lj/cut"_pair_lj.html or "fix nve"_fix_nve.html) also lists any
-accelerated variants available for that style.
+To use an accelerator package in LAMMPS, and one or more of the styles
+it provides, follow these general steps.  Details vary from package to
+package and are explained in the individual accelerator sub-section
+doc pages, listed above:
+
+build the accelerator library |
+  only for USER-CUDA and GPU packages |
+install the accelerator package |
+  make yes-opt, make yes-user-intel, etc |
+add compile/link flags to Makefile.machine |
+  in src/MAKE, <br>
+  only for USER-INTEL, KOKKOS, USER-OMP packages |
+re-build LAMMPS |
+  make machine |
+run a LAMMPS simulation |
+  lmp_machine < in.script |
+enable the accelerator package |
+  via "-c on" and "-k on" "command-line switches"_Section_start.html#start_7, <br>
+  only for USER-CUDA and KOKKOS packages |
+set any needed options for the package |
+  via "-pk" "command-line switch"_Section_start.html#start_7 or
+  "package"_package.html command, <br>
+  only if defaults need to be changed |
+use accelerated styles in your input script |
+  via "-sf" "command-line switch"_Section_start.html#start_7 or
+  "suffix"_suffix.html command :tb(c=2,s=|)
+
+The first 4 steps typically only need to be done once, to create an
+executable that uses one or more accelerator packages.  We are working
+to create a "make" tool that will perform all 4 of these steps in a
+single command.
+
+The last 4 steps can all be done from the command-line when LAMMPS is
+launched, without changing your input script.  Or you can add
+"package"_package.html and "suffix"_suffix.html commands to your input
+script.
 
 The examples directory has several sub-directories with scripts and
-README files for using the accelerator packages:
+README files for how to use the following accelerator packages:
 
 examples/cuda for USER-CUDA package
 examples/gpu for GPU package
@@ -199,13 +232,18 @@ examples/kokkos for KOKKOS package :ul
 Likewise, the bench directory has FERMI and KEPLER sub-directories
 with scripts and README files for using all the accelerator packages.
 
+As mentioned above, the "Benchmark
+page"_http://lammps.sandia.gov/bench.html of the LAMMPS web site gives
+performance results for the various accelerator packages for several
+of the standard LAMMPS benchmark problems, as a function of problem
+size and number of compute nodes, on different hardware platforms.
+
 Here is a brief summary of what the various packages provide.  Details
-are in individual sections below.
+are in the individual package sub-sections listed above.
 
 Styles with a "cuda" or "gpu" suffix are part of the USER-CUDA or GPU
-packages, and can be run on NVIDIA GPUs associated with your CPUs.
-The speed-up on a GPU depends on a variety of factors, as discussed
-below. :ulb,l
+packages, and can be run on NVIDIA GPUs.  The speed-up on a GPU
+depends on a variety of factors, as discussed below. :ulb,l
 
 Styles with an "intel" suffix are part of the USER-INTEL
 package. These styles support vectorized single and mixed precision
@@ -230,1410 +268,29 @@ Styles with an "opt" suffix are part of the OPT package and typically
 speed-up the pairwise calculations of your simulation by 5-25% on a
 CPU. :l,ule
 
-The following sections explain:
+The individual accelerator package sub-sections explain:
 
 what hardware and software the accelerated package requires
 how to build LAMMPS with the accelerated package
-how to run with the accelerated package via either command-line switches or modifying the input script
+how to run with the accelerated package either via command-line switches or by modifying the input script
 speed-ups to expect
 guidelines for best performance
 restrictions :ul
 
-The final section compares and contrasts the USER-CUDA, GPU, and
-KOKKOS packages, since they all enable use of NVIDIA GPUs.
-
 :line
 
-5.4 OPT package :h4,link(acc_4)
+5.4 Comparison of various accelerator packages :h4,link(acc_4)
 
-The OPT package was developed by James Fischer (High Performance
-Technologies), David Richie, and Vincent Natoli (Stone Ridge
-Technologies).  It contains a handful of pair styles whose compute()
-methods were rewritten in C++ templated form to reduce the overhead
-due to if tests and other conditional code.
+NOTE: this section still needs to be re-worked with additional KOKKOS
+and USER-INTEL information.
 
-Here is a quick overview of how to use the OPT package:
-
-include the OPT package and build LAMMPS
-use OPT pair styles in your input script :ul
-
-The last step can be done using the "-sf opt" "command-line
-switch"_Section_start.html#start_7.  Or the effect of the "-sf" switch
-can be duplicated by adding a "suffix opt"_suffix.html command to your
-input script.
-
-[Required hardware/software:]
-
-None.
-
-[Building LAMMPS with the OPT package:]
-
-Include the package and build LAMMPS:
-
-cd lammps/src
-make yes-opt
-make machine :pre
-
-No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-
-[Run with the OPT package from the command line:]
-
-Use the "-sf opt" "command-line switch"_Section_start.html#start_7,
-which will automatically append "opt" to styles that support it.
-
-lmp_machine -sf opt -in in.script
-mpirun -np 4 lmp_machine -sf opt -in in.script :pre
-
-[Or run with the OPT package by editing an input script:]
-
-Use the "suffix opt"_suffix.html command, or you can explicitly add an
-"opt" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/opt 2.5 :pre
-
-[Speed-ups to expect:]
-
-You should see a reduction in the "Pair time" value printed at the end
-of a run.  On most machines for reasonable problem sizes, it will be a
-5 to 20% savings.
-
-[Guidelines for best performance:]
-
-None.  Just try out an OPT pair style to see how it performs.
-
-[Restrictions:]
-
-None.
-
-:line
-
-5.5 USER-OMP package :h4,link(acc_5)
-
-The USER-OMP package was developed by Axel Kohlmeyer at Temple
-University.  It provides multi-threaded versions of most pair styles,
-nearly all bonded styles (bond, angle, dihedral, improper), several
-Kspace styles, and a few fix styles.  The package currently
-uses the OpenMP interface for multi-threading.
-
-Here is a quick overview of how to use the USER-OMP package:
-
-use the -fopenmp flag for compiling and linking in your Makefile.machine
-include the USER-OMP package and build LAMMPS
-use the mpirun command to set the number of MPI tasks/node
-specify how many threads per MPI task to use
-use USER-OMP styles in your input script :ul
-
-The latter two steps can be done using the "-pk omp" and "-sf omp"
-"command-line switches"_Section_start.html#start_7 respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the "package omp"_package.html or "suffix omp"_suffix.html commands
-respectively to your input script.
-
-[Required hardware/software:]
-
-Your compiler must support the OpenMP interface.  You should have one
-or more multi-core CPUs so that multiple threads can be launched by an
-MPI task running on a CPU.
-
-[Building LAMMPS with the USER-OMP package:]
-
-Include the package and build LAMMPS:
-
-cd lammps/src
-make yes-user-omp
-make machine :pre
-
-Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both
-the CCFLAGS and LINKFLAGS variables.  For GNU and Intel compilers,
-this flag is "-fopenmp".  Without this flag the USER-OMP styles will
-still be compiled and work, but will not support multi-threading.
-
-[Run with the USER-OMP package from the command line:]
-
-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-
-You need to choose how many threads per MPI task will be used by the
-USER-OMP package.  Note that the product of MPI tasks * threads/task
-should not exceed the physical number of cores (on a node), otherwise
-performance will suffer.
-
-Use the "-sf omp" "command-line switch"_Section_start.html#start_7,
-which will automatically append "omp" to styles that support it.  Use
-the "-pk omp Nt" "command-line switch"_Section_start.html#start_7, to
-set Nt = # of OpenMP threads per MPI task to use.
-
-lmp_machine -sf omp -pk omp 16 -in in.script                       # 1 MPI task on a 16-core node
-mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script           # 4 MPI tasks each with 4 threads on a single 16-core node
-mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script   # ditto on 8 16-core nodes :pre
-
-Note that if the "-sf omp" switch is used, it also issues a default
-"package omp 0"_package.html command, which sets the number of threads
-per MPI task via the OMP_NUM_THREADS environment variable.
-
-Using the "-pk" switch explicitly allows for direct setting of the
-number of threads and additional options.  Its syntax is the same as
-the "package omp" command.  See the "package"_package.html command doc
-page for details, including the default values used for all its
-options if it is not specified, and how to set the number of threads
-via the OMP_NUM_THREADS environment variable if desired.
-
-[Or run with the USER-OMP package by editing an input script:]
-
-The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and threads/MPI task is the same.
-
-Use the "suffix omp"_suffix.html command, or you can explicitly add an
-"omp" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/omp 2.5 :pre
-
-You must also use the "package omp"_package.html command to enable the
-USER-OMP package, unless the "-sf omp" or "-pk omp" "command-line
-switches"_Section_start.html#start_7 were used.  It specifies how many
-threads per MPI task to use, as well as other options.  Its doc page
-explains how to set the number of threads via an environment variable
-if desired.
-
-[Speed-ups to expect:]
-
-Depending on which styles are accelerated, you should look for a
-reduction in the "Pair time", "Bond time", "KSpace time", and "Loop
-time" values printed at the end of a run.  
-
-You may see a small performance advantage (5 to 20%) when running a
-USER-OMP style (in serial or parallel) with a single thread per MPI
-task, versus running standard LAMMPS with its standard
-(un-accelerated) styles (in serial or all-MPI parallelization with 1
-task/core).  This is because many of the USER-OMP styles contain
-similar optimizations to those used in the OPT package, as described
-above.
-
-With multiple threads/task, the optimal choice of MPI tasks/node and
-OpenMP threads/task can vary a lot and should always be tested via
-benchmark runs for a specific simulation running on a specific
-machine, paying attention to guidelines discussed in the next
-sub-section.
-
-A description of the multi-threading strategy used in the USER-OMP
-package and some performance examples are "presented
-here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1
-
-[Guidelines for best performance:]
-
-For many problems on current generation CPUs, running the USER-OMP
-package with a single thread/task is faster than running with multiple
-threads/task.  This is because the MPI parallelization in LAMMPS is
-often more efficient than multi-threading as implemented in the
-USER-OMP package.  The parallel efficiency (in a threaded sense) also
-varies for different USER-OMP styles.
-
-Using multiple threads/task can be more effective under the following
-circumstances:
-
-Individual compute nodes have a significant number of CPU cores but
-the CPU itself has limited memory bandwidth, e.g. for Intel Xeon 53xx
-(Clovertown) and 54xx (Harpertown) quad core processors. Running one
-MPI task per CPU core will result in significant performance
-degradation, so that running with 4 or even only 2 MPI tasks per node
-is faster.  Running in hybrid MPI+OpenMP mode will reduce the
-inter-node communication bandwidth contention in the same way, but
-offers an additional speedup by utilizing the otherwise idle CPU
-cores. :ulb,l
-
-The interconnect used for MPI communication does not provide
-sufficient bandwidth for a large number of MPI tasks per node.  For
-example, this applies to running over gigabit ethernet or on Cray XT4
-or XT5 series supercomputers.  As in the aforementioned case, this
-effect worsens when using an increasing number of nodes. :l
-
-The system has a spatially inhomogeneous particle density which does
-not map well to the "domain decomposition scheme"_processors.html or
-"load-balancing"_balance.html options that LAMMPS provides.  This is
-because multi-threading achives parallelism over the number of
-particles, not via their distribution in space. :l
-
-A machine is being used in "capability mode", i.e. near the point
-where MPI parallelism is maxed out.  For example, this can happen when
-using the "PPPM solver"_kspace_style.html for long-range
-electrostatics on large numbers of nodes.  The scaling of the KSpace
-calculation (see the "kspace_style"_kspace_style.html command) becomes
-the performance-limiting factor.  Using multi-threading allows less
-MPI tasks to be invoked and can speed-up the long-range solver, while
-increasing overall performance by parallelizing the pairwise and
-bonded calculations via OpenMP.  Likewise additional speedup can be
-sometimes be achived by increasing the length of the Coulombic cutoff
-and thus reducing the work done by the long-range solver.  Using the
-"run_style verlet/split"_run_style.html command, which is compatible
-with the USER-OMP package, is an alternative way to reduce the number
-of MPI tasks assigned to the KSpace calculation. :l,ule
-
-Additional performance tips are as follows:
-
-The best parallel efficiency from {omp} styles is typically achieved
-when there is at least one MPI task per physical processor,
-i.e. socket or die. :ulb,l
-
-It is usually most efficient to restrict threading to a single
-socket, i.e. use one or more MPI task per socket. :l
-
-Several current MPI implementation by default use a processor affinity
-setting that restricts each MPI task to a single CPU core.  Using
-multi-threading in this mode will force the threads to share that core
-and thus is likely to be counterproductive.  Instead, binding MPI
-tasks to a (multi-core) socket, should solve this issue. :l,ule
-
-[Restrictions:]
-
-None.
-
-:line
-
-5.6 GPU package :h4,link(acc_6)
-
-The GPU package was developed by Mike Brown at ORNL and his
-collaborators, particularly Trung Nguyen (ORNL).  It provides GPU
-versions of many pair styles, including the 3-body Stillinger-Weber
-pair style, and for "kspace_style pppm"_kspace_style.html for
-long-range Coulombics.  It has the following general features:
-
-It is designed to exploit common GPU hardware configurations where one
-or more GPUs are coupled to many cores of one or more multi-core CPUs,
-e.g. within a node of a parallel machine. :ulb,l
-
-Atom-based data (e.g. coordinates, forces) moves back-and-forth
-between the CPU(s) and GPU every timestep. :l
-
-Neighbor lists can be built on the CPU or on the GPU :l
-
-The charge assignement and force interpolation portions of PPPM can be
-run on the GPU.  The FFT portion, which requires MPI communication
-between processors, runs on the CPU. :l
-
-Asynchronous force computations can be performed simultaneously on the
-CPU(s) and GPU. :l
-
-It allows for GPU computations to be performed in single or double
-precision, or in mixed-mode precision, where pairwise forces are
-computed in single precision, but accumulated into double-precision
-force vectors. :l
-
-LAMMPS-specific code is in the GPU package.  It makes calls to a
-generic GPU library in the lib/gpu directory.  This library provides
-NVIDIA support as well as more general OpenCL support, so that the
-same functionality can eventually be supported on a variety of GPU
-hardware. :l,ule
-
-Here is a quick overview of how to use the GPU package:
-
-build the library in lib/gpu for your GPU hardware wity desired precision
-include the GPU package and build LAMMPS
-use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU
-specify the # of GPUs per node
-use GPU styles in your input script :ul
-
-The latter two steps can be done using the "-pk gpu" and "-sf gpu"
-"command-line switches"_Section_start.html#start_7 respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the "package gpu"_package.html or "suffix gpu"_suffix.html commands
-respectively to your input script.
-
-[Required hardware/software:]
-
-To use this package, you currently need to have an NVIDIA GPU and
-install the NVIDIA Cuda software on your system:
-
-Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
-Go to http://www.nvidia.com/object/cuda_get.html
-Install a driver and toolkit appropriate for your system (SDK is not necessary)
-Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties :ul
-
-[Building LAMMPS with the GPU package:]
-
-This requires two steps (a,b): build the GPU library, then build
-LAMMPS with the GPU package.
-
-(a) Build the GPU library
-
-The GPU library is in lammps/lib/gpu.  Select a Makefile.machine (in
-lib/gpu) appropriate for your system.  You should pay special
-attention to 3 settings in this makefile.
-
-CUDA_HOME = needs to be where NVIDIA Cuda software is installed on your system
-CUDA_ARCH = needs to be appropriate to your GPUs
-CUDA_PREC = precision (double, mixed, single) you desire :ul
-
-See lib/gpu/Makefile.linux.double for examples of the ARCH settings
-for different GPU choices, e.g. Fermi vs Kepler.  It also lists the
-possible precision settings:
-
-CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
-CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
-CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double :pre
-
-The last setting is the mixed mode referred to above.  Note that your
-GPU must support double precision to use either the 2nd or 3rd of
-these settings.
-
-To build the library, type:
-
-make -f Makefile.machine :pre
-
-If successful, it will produce the files libgpu.a and Makefile.lammps.
-
-The latter file has 3 settings that need to be appropriate for the
-paths and settings for the CUDA system software on your machine.
-Makefile.lammps is a copy of the file specified by the EXTRAMAKE
-setting in Makefile.machine.  You can change EXTRAMAKE or create your
-own Makefile.lammps.machine if needed.
-
-Note that to change the precision of the GPU library, you need to
-re-build the entire library.  Do a "clean" first, e.g. "make -f
-Makefile.linux clean", followed by the make command above.
-
-(b) Build LAMMPS with the GPU package
-
-cd lammps/src
-make yes-gpu
-make machine :pre
-
-No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-
-Note that if you change the GPU library precision (discussed above)
-and rebuild the GPU library, then you also need to re-install the GPU
-package and re-build LAMMPS, so that all affected files are
-re-compiled and linked to the new GPU library.
-
-[Run with the GPU package from the command line:]
-
-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-
-When using the GPU package, you cannot assign more than one GPU to a
-single MPI task.  However multiple MPI tasks can share the same GPU,
-and in many cases it will be more efficient to run this way.  Likewise
-it may be more efficient to use less MPI tasks/node than the available
-# of CPU cores.  Assignment of multiple MPI tasks to a GPU will happen
-automatically if you create more MPI tasks/node than there are
-GPUs/mode.  E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
-shared by 4 MPI tasks.
-
-Use the "-sf gpu" "command-line switch"_Section_start.html#start_7,
-which will automatically append "gpu" to styles that support it.  Use
-the "-pk gpu Ng" "command-line switch"_Section_start.html#start_7 to
-set Ng = # of GPUs/node to use.
-
-lmp_machine -sf gpu -pk gpu 1 -in in.script                         # 1 MPI task uses 1 GPU
-mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script           # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
-mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script   # ditto on 4 16-core nodes :pre
-
-Note that if the "-sf gpu" switch is used, it also issues a default
-"package gpu 1"_package.html command, which sets the number of
-GPUs/node to use to 1.
-
-Using the "-pk" switch explicitly allows for direct setting of the
-number of GPUs/node to use and additional options.  Its syntax is the
-same as same as the "package gpu" command.  See the
-"package"_package.html command doc page for details, including the
-default values used for all its options if it is not specified.
-
-[Or run with the GPU package by editing an input script:]
-
-The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-and use of multiple MPI tasks/GPU is the same.
-
-Use the "suffix gpu"_suffix.html command, or you can explicitly add an
-"gpu" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/gpu 2.5 :pre
-
-You must also use the "package gpu"_package.html command to enable the
-GPU package, unless the "-sf gpu" or "-pk gpu" "command-line
-switches"_Section_start.html#start_7 were used.  It specifies the
-number of GPUs/node to use, as well as other options.
-
-IMPORTANT NOTE: The input script must also use a newton pairwise
-setting of {off} in order to use GPU package pair styles.  This can be
-set via the "package gpu"_package.html or "newton"_newton.html
-commands.
-
-[Speed-ups to expect:]
-
-The performance of a GPU versus a multi-core CPU is a function of your
-hardware, which pair style is used, the number of atoms/GPU, and the
-precision used on the GPU (double, single, mixed).
-
-See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
-LAMMPS web site for performance of the GPU package on various
-hardware, including the Titan HPC platform at ORNL.
-
-You should also experiment with how many MPI tasks per GPU to use to
-give the best performance for your problem and machine.  This is also
-a function of the problem size and the pair style being using.
-Likewise, you should experiment with the precision setting for the GPU
-library to see if single or mixed precision will give accurate
-results, since they will typically be faster.
-
-[Guidelines for best performance:]
-
-Using multiple MPI tasks per GPU will often give the best performance,
-as allowed my most multi-core CPU/GPU configurations. :ulb,l
-
-If the number of particles per MPI task is small (e.g. 100s of
-particles), it can be more efficient to run with fewer MPI tasks per
-GPU, even if you do not use all the cores on the compute node. :l
-
-The "package gpu"_package.html command has several options for tuning
-performance.  Neighbor lists can be built on the GPU or CPU.  Force
-calculations can be dynamically balanced across the CPU cores and
-GPUs.  GPU-specific settings can be made which can be optimized
-for different hardware.  See the "packakge"_package.html command
-doc page for details. :l
-
-As described by the "package gpu"_package.html command, GPU
-accelerated pair styles can perform computations asynchronously with
-CPU computations. The "Pair" time reported by LAMMPS will be the
-maximum of the time required to complete the CPU pair style
-computations and the time required to complete the GPU pair style
-computations. Any time spent for GPU-enabled pair styles for
-computations that run simultaneously with "bond"_bond_style.html,
-"angle"_angle_style.html, "dihedral"_dihedral_style.html,
-"improper"_improper_style.html, and "long-range"_kspace_style.html
-calculations will not be included in the "Pair" time. :l
-
-When the {mode} setting for the package gpu command is force/neigh,
-the time for neighbor list calculations on the GPU will be added into
-the "Pair" time, not the "Neigh" time.  An additional breakdown of the
-times required for various tasks on the GPU (data copy, neighbor
-calculations, force computations, etc) are output only with the LAMMPS
-screen output (not in the log file) at the end of each run.  These
-timings represent total time spent on the GPU for each routine,
-regardless of asynchronous CPU calculations. :l
-
-The output section "GPU Time Info (average)" reports "Max Mem / Proc".
-This is the maximum memory used at one time on the GPU for data
-storage by a single MPI process. :l,ule
-
-[Restrictions:]
-
-None.
-
-:line
-
-5.7 USER-CUDA package :h4,link(acc_7)
-
-The USER-CUDA package was developed by Christian Trott (Sandia) while
-at U Technology Ilmenau in Germany.  It provides NVIDIA GPU versions
-of many pair styles, many fixes, a few computes, and for long-range
-Coulombics via the PPPM command.  It has the following general
-features:
-
-The package is designed to allow an entire LAMMPS calculation, for
-many timesteps, to run entirely on the GPU (except for inter-processor
-MPI communication), so that atom-based data (e.g. coordinates, forces)
-do not have to move back-and-forth between the CPU and GPU. :ulb,l
-
-The speed-up advantage of this approach is typically better when the
-number of atoms per GPU is large :l
-
-Data will stay on the GPU until a timestep where a non-USER-CUDA fix
-or compute is invoked.  Whenever a non-GPU operation occurs (fix,
-compute, output), data automatically moves back to the CPU as needed.
-This may incur a performance penalty, but should otherwise work
-transparently. :l
-
-Neighbor lists are constructed on the GPU. :l
-
-The package only supports use of a single MPI task, running on a
-single CPU (core), assigned to each GPU. :l,ule
-
-Here is a quick overview of how to use the USER-CUDA package:
-
-build the library in lib/cuda for your GPU hardware with desired precision
-include the USER-CUDA package and build LAMMPS
-use the mpirun command to specify 1 MPI task per GPU (on each node)
-enable the USER-CUDA package via the "-c on" command-line switch
-specify the # of GPUs per node
-use USER-CUDA styles in your input script :ul
-
-The latter two steps can be done using the "-pk cuda" and "-sf cuda"
-"command-line switches"_Section_start.html#start_7 respectively.  Or
-the effect of the "-pk" or "-sf" switches can be duplicated by adding
-the "package cuda"_package.html or "suffix cuda"_suffix.html commands
-respectively to your input script.
-
-[Required hardware/software:]
-
-To use this package, you need to have one or more NVIDIA GPUs and
-install the NVIDIA Cuda software on your system:
-
-Your NVIDIA GPU needs to support Compute Capability 1.3. This list may
-help you to find out the Compute Capability of your card:
-
-http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
-
-Install the Nvidia Cuda Toolkit (version 3.2 or higher) and the
-corresponding GPU drivers.  The Nvidia Cuda SDK is not required, but
-we recommend it also be installed.  You can then make sure its sample
-projects can be compiled without problems.
-
-[Building LAMMPS with the USER-CUDA package:]
-
-This requires two steps (a,b): build the USER-CUDA library, then build
-LAMMPS with the USER-CUDA package.
-
-(a) Build the USER-CUDA library
-
-The USER-CUDA library is in lammps/lib/cuda.  If your {CUDA} toolkit
-is not installed in the default system directoy {/usr/local/cuda} edit
-the file {lib/cuda/Makefile.common} accordingly.
-
-To set options for the library build, type "make OPTIONS", where
-{OPTIONS} are one or more of the following. The settings will be
-written to the {lib/cuda/Makefile.defaults} and used when
-the library is built.
-
-{precision=N} to set the precision level
-  N = 1 for single precision (default)
-  N = 2 for double precision
-  N = 3 for positions in double precision
-  N = 4 for positions and velocities in double precision
-{arch=M} to set GPU compute capability
-  M = 35 for Kepler GPUs
-  M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default)
-  M = 21 for CC2.1 (GF104/114,  e.g. GTX560, GTX460, GTX450)
-  M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
-{prec_timer=0/1} to use hi-precision timers
-  0 = do not use them (default)
-  1 = use them
-  this is usually only useful for Mac machines 
-{dbg=0/1} to activate debug mode
-  0 = no debug mode (default)
-  1 = yes debug mode
-  this is only useful for developers
-{cufft=1} for use of the CUDA FFT library
-  0 = no CUFFT support (default)
-  in the future other CUDA-enabled FFT libraries might be supported :pre
-
-To build the library, simply type:
-
-make :pre
-
-If successful, it will produce the files libcuda.a and Makefile.lammps.
-
-Note that if you change any of the options (like precision), you need
-to re-build the entire library.  Do a "make clean" first, followed by
-"make".
-
-(b) Build LAMMPS with the USER-CUDA package
-
-cd lammps/src
-make yes-user-cuda
-make machine :pre
-
-No additional compile/link flags are needed in your Makefile.machine
-in src/MAKE.
-
-Note that if you change the USER-CUDA library precision (discussed
-above) and rebuild the USER-CUDA library, then you also need to
-re-install the USER-CUDA package and re-build LAMMPS, so that all
-affected files are re-compiled and linked to the new USER-CUDA
-library.
-
-[Run with the USER-CUDA package from the command line:]
-
-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-
-When using the USER-CUDA package, you must use exactly one MPI task
-per physical GPU.
-
-You must use the "-c on" "command-line
-switch"_Section_start.html#start_7 to enable the USER-CUDA package.
-
-Use the "-sf cuda" "command-line switch"_Section_start.html#start_7,
-which will automatically append "cuda" to styles that support it.  Use
-the "-pk cuda Ng" "command-line switch"_Section_start.html#start_7 to
-set Ng = # of GPUs per node.
-
-lmp_machine -c on -sf cuda -pk cuda 1 -in in.script                       # 1 MPI task uses 1 GPU
-mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script          # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node
-mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script  # ditto on 12 16-core nodes :pre
-
-The "-pk" switch must be used (unless the "package cuda"_package.html
-command is used in the input script) to set the number of GPUs/node to
-use.  It also allows for setting of additional options.  Its syntax is
-the same as same as the "package cuda" command.  See the
-"package"_package.html command doc page for details.
-
-[Or run with the USER-CUDA package by editing an input script:]
-
-The discussion above for the mpirun/mpiexec command and the requirement
-of one MPI task per GPU is the same.
-
-You must still use the "-c on" "command-line
-switch"_Section_start.html#start_7 to enable the USER-CUDA package.
-
-Use the "suffix cuda"_suffix.html command, or you can explicitly add a
-"cuda" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/cuda 2.5 :pre
-
-You must use the "package cuda"_package.html command to set the the
-number of GPUs/node, unless the "-pk" "command-line
-switch"_Section_start.html#start_7 was used.  The command also
-allows for setting of additional options.
-
-[Speed-ups to expect:]
-
-The performance of a GPU versus a multi-core CPU is a function of your
-hardware, which pair style is used, the number of atoms/GPU, and the
-precision used on the GPU (double, single, mixed).
-
-See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
-LAMMPS web site for performance of the USER-CUDA package on different
-hardware.
-
-[Guidelines for best performance:]
-
-The USER-CUDA package offers more speed-up relative to CPU performance
-when the number of atoms per GPU is large, e.g. on the order of tens
-or hundreds of 1000s. :ulb,l
-
-As noted above, this package will continue to run a simulation
-entirely on the GPU(s) (except for inter-processor MPI communication),
-for multiple timesteps, until a CPU calculation is required, either by
-a fix or compute that is non-GPU-ized, or until output is performed
-(thermo or dump snapshot or restart file).  The less often this
-occurs, the faster your simulation will run. :l,ule
-
-[Restrictions:]
-
-None.
-
-:line
-
-5.8 KOKKOS package :h4,link(acc_8)
-
-The KOKKOS package was developed primaritly by Christian Trott
-(Sandia) with contributions of various styles by others, including
-Sikandar Mashayak (UIUC).  The underlying Kokkos library was written
-primarily by Carter Edwards, Christian Trott, and Dan Sunderland (all
-Sandia).
-
-The KOKKOS package contains versions of pair, fix, and atom styles
-that use data structures and macros provided by the Kokkos library,
-which is included with LAMMPS in lib/kokkos.
-
-The Kokkos library is part of
-"Trilinos"_http://trilinos.sandia.gov/packages/kokkos and is a
-templated C++ library that provides two key abstractions for an
-application like LAMMPS.  First, it allows a single implementation of
-an application kernel (e.g. a pair style) to run efficiently on
-different kinds of hardware, such as a GPU, Intel Phi, or many-core
-chip.
-
-The Kokkos library also provides data abstractions to adjust (at
-compile time) the memory layout of basic data structures like 2d and
-3d arrays and allow the transparent utilization of special hardware
-load and store operations.  Such data structures are used in LAMMPS to
-store atom coordinates or forces or neighbor lists.  The layout is
-chosen to optimize performance on different platforms.  Again this
-functionality is hidden from the developer, and does not affect how
-the kernel is coded.
-
-These abstractions are set at build time, when LAMMPS is compiled with
-the KOKKOS package installed.  This is done by selecting a "host" and
-"device" to build for, compatible with the compute nodes in your
-machine (one on a desktop machine or 1000s on a supercomputer).
-
-All Kokkos operations occur within the context of an individual MPI
-task running on a single node of the machine.  The total number of MPI
-tasks used by LAMMPS (one or multiple per compute node) is set in the
-usual manner via the mpirun or mpiexec commands, and is independent of
-Kokkos.
-
-Kokkos provides support for two different modes of execution per MPI
-task.  This means that computational tasks (pairwise interactions,
-neighbor list builds, time integration, etc) can be parallelized for
-one or the other of the two modes.  The first mode is called the
-"host" and is one or more threads running on one or more physical CPUs
-(within the node).  Currently, both multi-core CPUs and an Intel Phi
-processor (running in native mode, not offload mode like the
-USER-INTEL package) are supported.  The second mode is called the
-"device" and is an accelerator chip of some kind.  Currently only an
-NVIDIA GPU is supported.  If your compute node does not have a GPU,
-then there is only one mode of execution, i.e. the host and device are
-the same.
-
-Here is a quick overview of how to use the KOKKOS package
-for GPU acceleration:
-
-specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support
-include the KOKKOS package and build LAMMPS
-enable the KOKKOS package and its hardware options via the "-k on" command-line switch
-use KOKKOS styles in your input script :ul
-
-The latter two steps can be done using the "-k on", "-pk kokkos" and
-"-sf kk" "command-line switches"_Section_start.html#start_7
-respectively.  Or the effect of the "-pk" or "-sf" switches can be
-duplicated by adding the "package kokkos"_package.html or "suffix
-kk"_suffix.html commands respectively to your input script.
-
-[Required hardware/software:]
-
-The KOKKOS package can be used to build and run LAMMPS on the
-following kinds of hardware:
-
-CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles)
-CPU-only: one or a few MPI tasks per node with additional threading via OpenMP
-Phi: on one or more Intel Phi coprocessors (per node)
-GPU: on the GPUs of a node with additional OpenMP threading on the CPUs :ul
-
-Note that Intel Xeon Phi coprocessors are supported in "native" mode,
-not "offload" mode like the USER-INTEL package supports.
-
-Only NVIDIA GPUs are currently supported.
-
-IMPORTANT NOTE: For good performance of the KOKKOS package on GPUs,
-you must have Kepler generation GPUs (or later).  The Kokkos library
-exploits texture cache options not supported by Telsa generation GPUs
-(or older).
-
-To build the KOKKOS package for GPUs, NVIDIA Cuda software must be
-installed on your system.  See the discussion above for the USER-CUDA
-and GPU packages for details of how to check and do this.
-
-[Building LAMMPS with the KOKKOS package:]
-
-Unlike other acceleration packages discussed in this section, the
-Kokkos library in lib/kokkos does not have to be pre-built before
-building LAMMPS itself.  Instead, options for the Kokkos library are
-specified at compile time, when LAMMPS itself is built.  This can be
-done in one of two ways, as discussed below.
-
-Here are examples of how to build LAMMPS for the different compute-node
-configurations listed above.
-
-CPU-only (run all-MPI or with OpenMP threading):
-
-cd lammps/src
-make yes-kokkos
-make g++ OMP=yes :pre
-
-Intel Xeon Phi:
-
-cd lammps/src
-make yes-kokkos
-make g++ OMP=yes MIC=yes :pre
-
-CPUs and GPUs:
-
-cd lammps/src
-make yes-kokkos
-make cuda CUDA=yes :pre
-
-These examples set the KOKKOS-specific OMP, MIC, CUDA variables on the
-make command line which requires a GNU-compatible make command.  Try
-"gmake" if your system's standard make complains.  
-
-IMPORTANT NOTE: If you build using make line variables and re-build
-LAMMPS twice with different KOKKOS options and the *same* target,
-e.g. g++ in the first two examples above, then you *must* perform a
-"make clean-all" or "make clean-machine" before each build.  This is
-to force all the KOKKOS-dependent files to be re-compiled with the new
-options.
-
-You can also hardwire these make variables in the specified machine
-makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above,
-with a line like:
-
-MIC = yes :pre
-
-Note that if you build LAMMPS multiple times in this manner, using
-different KOKKOS options (defined in different machine makefiles), you
-do not have to worry about doing a "clean" in between.  This is
-because the targets will be different.
-
-IMPORTANT NOTE: The 3rd example above for a GPU, uses a different
-machine makefile, in this case src/MAKE/Makefile.cuda, which is
-included in the LAMMPS distribution.  To build the KOKKOS package for
-a GPU, this makefile must use the NVIDA "nvcc" compiler.  And it must
-have a CCFLAGS -arch setting that is appropriate for your NVIDIA
-hardware and installed software.  Typical values for -arch are given
-in "Section 2.3.4"_Section_start.html#start_3_4 of the manual, as well
-as other settings that must be included in the machine makefile, if
-you create your own.
-
-There are other allowed options when building with the KOKKOS package.
-As above, They can be set either as variables on the make command line
-or in the machine makefile in the src/MAKE directory.  See "Section
-2.3.4"_Section_start.html#start_3_4 of the manual for details.
-
-IMPORTANT NOTE: Currently, there are no precision options with the
-KOKKOS package.  All compilation and computation is performed in
-double precision.
-
-[Run with the KOKKOS package from the command line:]
-
-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-
-When using KOKKOS built with host=OMP, you need to choose how many
-OpenMP threads per MPI task will be used (via the "-k" command-line
-switch discussed below).  Note that the product of MPI tasks * OpenMP
-threads/task should not exceed the physical number of cores (on a
-node), otherwise performance will suffer.
-
-When using the KOKKOS package built with device=CUDA, you must use
-exactly one MPI task per physical GPU.
-
-When using the KOKKOS package built with host=MIC for Intel Xeon Phi
-coprocessor support you need to insure there are one or more MPI tasks
-per coprocessor, and choose the number of coprocessor threads to use
-per MPI task (via the "-k" command-line switch discussed below).  The
-product of MPI tasks * coprocessor threads/task should not exceed the
-maximum number of threads the coproprocessor is designed to run,
-otherwise performance will suffer.  This value is 240 for current
-generation Xeon Phi(TM) chips, which is 60 physical cores * 4
-threads/core.  Note that with the KOKKOS package you do not need to
-specify how many Phi coprocessors there are per node; each
-coprocessors is simply treated as running some number of MPI tasks.
-
-You must use the "-k on" "command-line
-switch"_Section_start.html#start_7 to enable the KOKKOS package.  It
-takes additional arguments for hardware settings appropriate to your
-system.  Those arguments are "documented
-here"_Section_start.html#start_7.  The two most commonly used arguments
-are:
-
--k on t Nt
--k on g Ng :pre
-
-The "t Nt" option applies to host=OMP (even if device=CUDA) and
-host=MIC.  For host=OMP, it specifies how many OpenMP threads per MPI
-task to use with a node.  For host=MIC, it specifies how many Xeon Phi
-threads per MPI task to use within a node.  The default is Nt = 1.
-Note that for host=OMP this is effectively MPI-only mode which may be
-fine.  But for host=MIC you will typically end up using far less than
-all the 240 available threads, which could give very poor performance.
-
-The "g Ng" option applies to device=CUDA.  It specifies how many GPUs
-per compute node to use.  The default is 1, so this only needs to be
-specified is you have 2 or more GPUs per compute node.
-
-The "-k on" switch also issues a default "package kokkos neigh full
-comm host"_package.html command which sets various KOKKOS options to
-default values, as discussed on the "package"_package.html command doc
-page.
-
-Use the "-sf kk" "command-line switch"_Section_start.html#start_7,
-which will automatically append "kk" to styles that support it.  Use
-the "-pk kokkos" "command-line switch"_Section_start.html#start_7 if
-you wish to override any of the default values set by the "package
-kokkos"_package.html command invoked by the "-k on" switch.
-
-host=OMP, dual hex-core nodes (12 threads/node):
-mpirun -np 12 lmp_g++ -in in.lj                           # MPI-only mode with no Kokkos
-mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj              # MPI-only mode with Kokkos
-mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj          # one MPI task, 12 threads
-mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj           # two MPI tasks, 6 threads/task 
-mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj   # ditto on 16 nodes :pre
-
-host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):
-mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj           # 1 MPI task on 1 Phi, 1*240 = 240
-mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj            # 30 MPI tasks on 1 Phi, 30*8 = 240
-mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj           # 12 MPI tasks on 1 Phi, 12*20 = 240
-mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj   # ditto on 8 Phis
-
-
-host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:
-mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj          # one MPI task, 6 threads on CPU
-mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj   # ditto on 4 nodes :pre
-
-host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs:
-mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj           # two MPI tasks, 8 threads per CPU
-mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # ditto on 16 nodes :pre
-
-[Or run with the KOKKOS package by editing an input script:]
-
-The discussion above for the mpirun/mpiexec command and setting
-appropriate thread and GPU values for host=OMP or host=MIC or
-device=CUDA are the same.
-
-You must still use the "-k on" "command-line
-switch"_Section_start.html#start_7 to enable the KOKKOS package, and
-specify its additional arguments for hardware options appopriate to
-your system, as documented above.
-
-Use the "suffix kk"_suffix.html command, or you can explicitly add a
-"kk" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/kk 2.5 :pre
-
-You only need to use the "package kokkos"_package.html command if you
-wish to change any of its option defaults.
-
-[Speed-ups to expect:]
-
-The performance of KOKKOS running in different modes is a function of
-your hardware, which KOKKOS-enable styles are used, and the problem
-size.
-
-Generally speaking, the following rules of thumb apply:
-
-When running on CPUs only, with a single thread per MPI task,
-performance of a KOKKOS style is somewhere between the standard
-(un-accelerated) styles (MPI-only mode), and those provided by the
-USER-OMP package.  However the difference between all 3 is small (less
-than 20%). :ulb,l
-
-When running on CPUs only, with multiple threads per MPI task,
-performance of a KOKKOS style is a bit slower than the USER-OMP
-package. :l
-
-When running on GPUs, KOKKOS is typically faster than the USER-CUDA
-and GPU packages. :l
-
-When running on Intel Xeon Phi, KOKKOS is not as fast as
-the USER-INTEL package, which is optimized for that hardware. :l,ule
-
-See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
-LAMMPS web site for performance of the KOKKOS package on different
-hardware.
-
-[Guidelines for best performance:]
-
-Here are guidline for using the KOKKOS package on the different
-hardware configurations listed above.
-
-Many of the guidelines use the "package kokkos"_package.html command
-See its doc page for details and default settings.  Experimenting with
-its options can provide a speed-up for specific calculations.
-
-[Running on a multi-core CPU:]
-
-If N is the number of physical cores/node, then the number of MPI
-tasks/node * number of threads/task should not exceed N, and should
-typically equal N.  Note that the default threads/task is 1, as set by
-the "t" keyword of the "-k" "command-line
-switch"_Section_start.html#start_7.  If you do not change this, no
-additional parallelism (beyond MPI) will be invoked on the host
-CPU(s).
-
-You can compare the performance running in different modes:
-  
-run with 1 MPI task/node and N threads/task
-run with N MPI tasks/node and 1 thread/task
-run with settings in between these extremes :ul
-
-Examples of mpirun commands in these modes are shown above.
-
-When using KOKKOS to perform multi-threading, it is important for
-performance to bind both MPI tasks to physical cores, and threads to
-physical cores, so they do not migrate during a simulation.
-
-If you are not certain MPI tasks are being bound (check the defaults
-for your MPI installation), binding can be forced with these flags:
-
-OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
-Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... :pre
-
-For binding threads with the KOKKOS OMP option, use thread affinity
-environment variables to force binding.  With OpenMP 3.1 (gcc 4.7 or
-later, intel 12 or later) setting the environment variable
-OMP_PROC_BIND=true should be sufficient.  For binding threads with the
-KOKKOS pthreads option, compile LAMMPS the KOKKOS HWLOC=yes option, as
-discussed in "Section 2.3.4"_Sections_start.html#start_3_4 of the
-manual.
-
-[Running on GPUs:]
-
-Insure the -arch setting in the machine makefile you are using,
-e.g. src/MAKE/Makefile.cuda, is correct for your GPU hardware/software
-(see "this section"_Section_start.html#start_3_4 of the manual for
-details).
-
-The -np setting of the mpirun command should set the number of MPI
-tasks/node to be equal to the # of physical GPUs on the node. 
-
-Use the "-k" "command-line switch"_Section_commands.html#start_7 to
-specify the number of GPUs per node, and the number of threads per MPI
-task.  As above for multi-core CPUs (and no GPU), if N is the number
-of physical cores/node, then the number of MPI tasks/node * number of
-threads/task should not exceed N.  With one GPU (and one MPI task) it
-may be faster to use less than all the available cores, by setting
-threads/task to a smaller value.  This is because using all the cores
-on a dual-socket node will incur extra cost to copy memory from the
-2nd socket to the GPU.
-
-Examples of mpirun commands that follow these rules are shown above.
-
-IMPORTANT NOTE: When using a GPU, you will achieve the best
-performance if your input script does not use any fix or compute
-styles which are not yet Kokkos-enabled.  This allows data to stay on
-the GPU for multiple timesteps, without being copied back to the host
-CPU.  Invoking a non-Kokkos fix or compute, or performing I/O for
-"thermo"_thermo_style.html or "dump"_dump.html output will cause data
-to be copied back to the CPU.
-
-You cannot yet assign multiple MPI tasks to the same GPU with the
-KOKKOS package.  We plan to support this in the future, similar to the
-GPU package in LAMMPS.
-
-You cannot yet use both the host (multi-threaded) and device (GPU)
-together to compute pairwise interactions with the KOKKOS package.  We
-hope to support this in the future, similar to the GPU package in
-LAMMPS.
-
-[Running on an Intel Phi:]
-
-Kokkos only uses Intel Phi processors in their "native" mode, i.e.
-not hosted by a CPU.
-
-As illustrated above, build LAMMPS with OMP=yes (the default) and
-MIC=yes.  The latter insures code is correctly compiled for the Intel
-Phi.  The OMP setting means OpenMP will be used for parallelization on
-the Phi, which is currently the best option within Kokkos.  In the
-future, other options may be added.
-
-Current-generation Intel Phi chips have either 61 or 57 cores.  One
-core should be excluded for running the OS, leaving 60 or 56 cores.
-Each core is hyperthreaded, so there are effectively N = 240 (4*60) or
-N = 224 (4*56) cores to run on.
-
-The -np setting of the mpirun command sets the number of MPI
-tasks/node.  The "-k on t Nt" command-line switch sets the number of
-threads/task as Nt.  The product of these 2 values should be N, i.e.
-240 or 224.  Also, the number of threads/task should be a multiple of
-4 so that logical threads from more than one MPI task do not run on
-the same physical core.
-
-Examples of mpirun commands that follow these rules are shown above.
-
-[Restrictions:]
-
-As noted above, if using GPUs, the number of MPI tasks per compute
-node should equal to the number of GPUs per compute node.  In the
-future Kokkos will support assigning multiple MPI tasks to a single
-GPU.
-
-Currently Kokkos does not support AMD GPUs due to limits in the
-available backend programming models.  Specifically, Kokkos requires
-extensive C++ support from the Kernel language.  This is expected to
-change in the future.
-
-:line
-
-5.9 USER-INTEL package :h4,link(acc_9)
-
-The USER-INTEL package was developed by Mike Brown at Intel
-Corporation.  It provides a capability to accelerate simulations by
-offloading neighbor list and non-bonded force calculations to Intel(R)
-Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package).
-Additionally, it supports running simulations in single, mixed, or
-double precision with vectorization, even if a coprocessor is not
-present, i.e. on an Intel(R) CPU.  The same C++ code is used for both
-cases.  When offloading to a coprocessor, the routine is run twice,
-once with an offload flag.
-
-The USER-INTEL package can be used in tandem with the USER-OMP
-package.  This is useful when offloading pair style computations to
-coprocessors, so that other styles not supported by the USER-INTEL
-package, e.g. bond, angle, dihedral, improper, and long-range
-electrostatics, can be run simultaneously in threaded mode on CPU
-cores.  Since less MPI tasks than CPU cores will typically be invoked
-when running with coprocessors, this enables the extra cores to be
-utilized for useful computation.
-
-If LAMMPS is built with both the USER-INTEL and USER-OMP packages
-intsalled, this mode of operation is made easier to use, because the
-"-suffix intel" "command-line switch"_Section_start.html#start_7 or
-the "suffix intel"_suffix.html command will both set a second-choice
-suffix to "omp" so that styles from the USER-OMP package will be used
-if available, after first testing if a style from the USER-INTEL
-package is available.
-
-Here is a quick overview of how to use the USER-INTEL package
-for CPU acceleration:
-
-specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost
-specify -fopenmp with LINKFLAGS in your Makefile.machine
-include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS
-if using the USER-OMP package, specify how many threads per MPI task to use
-use USER-INTEL styles in your input script :ul
-
-Using the USER-INTEL package to offload work to the Intel(R)
-Xeon Phi(TM) coprocessor is the same except for these additional
-steps:
-
-add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine
-add the flag -offload to LINKFLAGS in your Makefile.machine
-specify how many threads per coprocessor to use :ul
-
-The latter two steps in the first case and the last step in the
-coprocessor case can be done using the "-pk omp" and "-sf intel" and
-"-pk intel" "command-line switches"_Section_start.html#start_7
-respectively.  Or the effect of the "-pk" or "-sf" switches can be
-duplicated by adding the "package omp"_package.html or "suffix
-intel"_suffix.html or "package intel"_package.html commands
-respectively to your input script.
-
-[Required hardware/software:]
-
-To use the offload option, you must have one or more Intel(R) Xeon
-Phi(TM) coprocessors.
-
-Optimizations for vectorization have only been tested with the
-Intel(R) compiler.  Use of other compilers may not result in
-vectorization or give poor performance.
-
-Use of an Intel C++ compiler is reccommended, but not required.  The
-compiler must support the OpenMP interface.
-
-[Building LAMMPS with the USER-INTEL package:]
-
-Include the package(s) and build LAMMPS:  
-
-cd lammps/src
-make yes-user-intel
-make yes-user-omp (if desired)
-make machine :pre
-
-If the USER-OMP package is also installed, you can use styles from
-both packages, as described below.
-
-The lo-level src/MAKE/Makefile.machine needs a flag for OpenMP support
-in both the CCFLAGS and LINKFLAGS variables, which is {-openmp} for
-Intel compilers.  You also need to add -DLAMMPS_MEMALIGN=64 and
--restrict to CCFLAGS.
-
-If you are compiling on the same architecture that will be used for
-the runs, adding the flag {-xHost} to CCFLAGS will enable
-vectorization with the Intel(R) compiler.
-
-In order to build with support for an Intel(R) coprocessor, the flag
-{-offload} should be added to the LINKFLAGS line and the flag
--DLMP_INTEL_OFFLOAD should be added to the CCFLAGS line.
-
-Note that the machine makefiles Makefile.intel and
-Makefile.intel_offload are included in the src/MAKE directory with
-options that perform well with the Intel(R) compiler. The latter file
-has support for offload to coprocessors; the former does not.
-
-If using an Intel compiler, it is recommended that Intel(R) Compiler
-2013 SP1 update 1 be used.  Newer versions have some performance
-issues that are being addressed. If using Intel(R) MPI, version 5 or
-higher is recommended.
-
-[Running with the USER-INTEL package from the command line:]
-
-The mpirun or mpiexec command sets the total number of MPI tasks used
-by LAMMPS (one or multiple per compute node) and the number of MPI
-tasks used per node.  E.g. the mpirun command does this via its -np
-and -ppn switches.
-
-If LAMMPS was also built with the USER-OMP package, you need to choose
-how many OpenMP threads per MPI task will be used by the USER-OMP
-package.  Note that the product of MPI tasks * OpenMP threads/task
-should not exceed the physical number of cores (on a node), otherwise
-performance will suffer.
-
-If LAMMPS was built with coprocessor support for the USER-INTEL
-package, you need to specify the number of coprocessor/node and the
-number of threads to use on the coprocessor per MPI task.  Note that
-coprocessor threads (which run on the coprocessor) are totally
-independent from OpenMP threads (which run on the CPU).  The product
-of MPI tasks * coprocessor threads/task should not exceed the maximum
-number of threads the coproprocessor is designed to run, otherwise
-performance will suffer.  This value is 240 for current generation
-Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core.  The
-threads/core value can be set to a smaller value if desired by an
-option on the "package intel"_package.html command, in which case the
-maximum number of threads is also reduced.
-
-Use the "-sf intel" "command-line switch"_Section_start.html#start_7,
-which will automatically append "intel" to styles that support it.  If
-a style does not support it, a "omp" suffix is tried next.  Use the
-"-pk omp Nt" "command-line switch"_Section_start.html#start_7, to set
-Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with
-the USER-OMP package.  Use the "-pk intel Nphi" "command-line
-switch"_Section_start.html#start_7 to set Nphi = # of Xeon Phi(TM)
-coprocessors/node, if LAMMPS was built with coprocessor support.
-
-CPU-only without USER-OMP (but using Intel vectorization on CPU):
-lmp_machine -sf intel -in in.script                 # 1 MPI task
-mpirun -np 32 lmp_machine -sf intel -in in.script   # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) :pre
-
-CPU-only with USER-OMP (and Intel vectorization on CPU):
-lmp_machine -sf intel -pk intel 16 0 -in in.script                # 1 MPI task on a 16-core node
-mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script    # 4 MPI tasks each with 4 threads on a single 16-core node
-mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script   # ditto on 8 16-core nodes :pre
-
-CPUs + Xeon Phi(TM) coprocessors with USER-OMP:
-lmp_machine -sf intel -pk intel 16 1 -in in.script                                  # 1 MPI task, 240 threads on 1 coprocessor
-mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script            # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, 
-                                                                                    # each MPI task uses 60 threads on 1 coprocessor
-mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script   # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, 
-                                                                                    # each MPI task uses 120 threads on one of 2 coprocessors :pre
-
-Note that if the "-sf intel" switch is used, it also issues two
-default commands: "package omp 0"_package.html and "package intel
-1"_package.html command.  These set the number of OpenMP threads per
-MPI task via the OMP_NUM_THREADS environment variable, and the number
-of Xeon Phi(TM) coprocessors/node to 1.  The former is ignored if
-LAMMPS was not built with the USER-OMP package.  The latter is ignored
-is LAMMPS was not built with coprocessor support, except for its
-optional precision setting.
-
-Using the "-pk omp" switch explicitly allows for direct setting of the
-number of OpenMP threads per MPI task, and additional options.  Using
-the "-pk intel" switch explicitly allows for direct setting of the
-number of coprocessors/node, and additional options.  The syntax for
-these two switches is the same as the "package omp"_package.html and
-"package intel"_package.html commands.  See the "package"_package.html
-command doc page for details, including the default values used for
-all its options if these switches are not specified, and how to set
-the number of OpenMP threads via the OMP_NUM_THREADS environment
-variable if desired.
-
-[Or run with the USER-INTEL package by editing an input script:]
-
-The discussion above for the mpirun/mpiexec command, MPI tasks/node,
-OpenMP threads per MPI task, and coprocessor threads per MPI task is
-the same.
-
-Use the "suffix intel"_suffix.html command, or you can explicitly add an
-"intel" suffix to individual styles in your input script, e.g.
-
-pair_style lj/cut/intel 2.5 :pre
-
-You must also use the "package omp"_package.html command to enable the
-USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf
-intel" or "-pk omp" "command-line switches"_Section_start.html#start_7
-were used.  It specifies how many OpenMP threads per MPI task to use,
-as well as other options.  Its doc page explains how to set the number
-of threads via an environment variable if desired.
-
-You must also use the "package intel"_package.html command to enable
-coprocessor support within the USER-INTEL package (assuming LAMMPS was
-built with coprocessor support) unless the "-sf intel" or "-pk intel"
-"command-line switches"_Section_start.html#start_7 were used.  It
-specifies how many coprocessors/node to use, as well as other
-coprocessor options.
-
-[Speed-ups to expect:]
-
-If LAMMPS was not built with coprocessor support when including the
-USER-INTEL package, then acclerated styles will run on the CPU using
-vectorization optimizations and the specified precision.  This may
-give a substantial speed-up for a pair style, particularly if mixed or
-single precision is used.
-
-If LAMMPS was built with coproccesor support, the pair styles will run
-on one or more Intel(R) Xeon Phi(TM) coprocessors (per node).  The
-performance of a Xeon Phi versus a multi-core CPU is a function of
-your hardware, which pair style is used, the number of
-atoms/coprocessor, and the precision used on the coprocessor (double,
-single, mixed).
-
-See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
-LAMMPS web site for performance of the USER-INTEL package on different
-hardware.
-
-[Guidelines for best performance on an Intel(R) Xeon Phi(TM)
-coprocessor:]
-
-The default for the "package intel"_package.html command is to have
-all the MPI tasks on a given compute node use a single Xeon Phi(TM)
-coprocessor.  In general, running with a large number of MPI tasks on
-each node will perform best with offload.  Each MPI task will
-automatically get affinity to a subset of the hardware threads
-available on the coprocessor.  For example, if your card has 61 cores,
-with 60 cores available for offload and 4 hardware threads per core
-(240 total threads), running with 24 MPI tasks per node will cause
-each MPI task to use a subset of 10 threads on the coprocessor.  Fine
-tuning of the number of threads to use per MPI task or the number of
-threads to use per core can be accomplished with keyword settings of
-the "package intel"_package.html command. :ulb,l
-
-If desired, only a fraction of the pair style computation can be
-offloaded to the coprocessors.  This is accomplished by using the
-{balance} keyword in the "package intel"_package.html command.  A
-balance of 0 runs all calculations on the CPU.  A balance of 1 runs
-all calculations on the coprocessor.  A balance of 0.5 runs half of
-the calculations on the coprocessor.  Setting the balance to -1 (the
-default) will enable dynamic load balancing that continously adjusts
-the fraction of offloaded work throughout the simulation.  This option
-typically produces results within 5 to 10 percent of the optimal fixed
-balance. :l
-
-When using offload with CPU hyperthreading disabled, it may help
-performance to use fewer MPI tasks and OpenMP threads than available
-cores.  This is due to the fact that additional threads are generated
-internally to handle the asynchronous offload tasks. :l
-
-If running short benchmark runs with dynamic load balancing, adding a
-short warm-up run (10-20 steps) will allow the load-balancer to find a
-near-optimal setting that will carry over to additional runs. :l
-
-If pair computations are being offloaded to an Intel(R) Xeon Phi(TM)
-coprocessor, a diagnostic line is printed to the screen (not to the
-log file), during the setup phase of a run, indicating that offload
-mode is being used and indicating the number of coprocessor threads
-per MPI task.  Additionally, an offload timing summary is printed at
-the end of each run.  When offloading, the frequency for "atom
-sorting"_atom_modify.html is changed to 1 so that the per-atom data is
-effectively sorted at every rebuild of the neighbor lists. :l
-
-For simulations with long-range electrostatics or bond, angle,
-dihedral, improper calculations, computation and data transfer to the
-coprocessor will run concurrently with computations and MPI
-communications for these calculations on the host CPU.  The USER-INTEL
-package has two modes for deciding which atoms will be handled by the
-coprocessor.  This choice is controlled with the {ghost} keyword of
-the "package intel"_package.html command.  When set to 0, ghost atoms
-(atoms at the borders between MPI tasks) are not offloaded to the
-card.  This allows for overlap of MPI communication of forces with
-computation on the coprocessor when the "newton"_newton.html setting
-is "on".  The default is dependent on the style being used, however,
-better performance may be achieved by setting this option
-explictly. :l,ule
-
-[Restrictions:]
-
-When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
-that require skip lists for neighbor builds cannot be offloaded.
-Using "hybrid/overlay"_pair_hybrid.html is allowed.  Only one intel
-accelerated style may be used with hybrid styles.
-"Special_bonds"_special_bonds.html exclusion lists are not currently
-supported with offload, however, the same effect can often be
-accomplished by setting cutoffs for excluded atom types to 0.  None of
-the pair styles in the USER-INTEL package currently support the
-"inner", "middle", "outer" options for rRESPA integration via the
-"run_style respa"_run_style.html command; only the "pair" option is
-supported.
-
-:line
-
-5.10 Comparison of GPU and USER-CUDA and KOKKOS packages :h4,link(acc_10)
+The next section compares and contrasts the various accelerator
+options, since there are multiple ways to perform OpenMP threading,
+run on GPUs, and run on Intel Xeon Phi coprocessors.
 
 All 3 of these packages accelerate a LAMMPS calculation using NVIDIA
 hardware, but they do it in different ways.
 
-NOTE: this section still needs to be re-worked with additional KOKKOS
-information.
-
 As a consequence, for a particular simulation on specific hardware,
 one package may be faster than the other.  We give guidelines below,
 but the best way to determine which package is faster for your input
diff --git a/doc/Section_start.html b/doc/Section_start.html
index 53c16d660d..80a09fdce3 100644
--- a/doc/Section_start.html
+++ b/doc/Section_start.html
@@ -1158,11 +1158,12 @@ letter abbreviation can be used:
 </P>
 <UL><LI>-c or -cuda
 <LI>-e or -echo
-<LI>-i or -in
 <LI>-h or -help
+<LI>-i or -in
 <LI>-k or -kokkos
 <LI>-l or -log
 <LI>-nc or -nocite
+<LI>-pk or -package
 <LI>-p or -partition
 <LI>-pl or -plog
 <LI>-ps or -pscreen
@@ -1198,6 +1199,15 @@ can be useful to figure out which line of your script is causing an
 input error.  The default value is <I>log</I>.  The echo style can also be
 set by using the <A HREF = "echo.html">echo</A> command in the input script itself.
 </P>
+<PRE>-help 
+</PRE>
+<P>Print a brief help summary and a list of options compiled into this
+executable for each LAMMPS style (atom_style, fix, compute,
+pair_style, bond_style, etc).  This can tell you if the command you
+want to use was included via the appropriate package at compile time.
+LAMMPS will print the info and immediately exit if this switch is
+used.
+</P>
 <PRE>-in file 
 </PRE>
 <P>Specify a file to use as an input script.  This is an optional switch
@@ -1210,15 +1220,6 @@ Note that this is a required switch when running LAMMPS in
 multi-partition mode, since multiple processors cannot all read from
 stdin.
 </P>
-<PRE>-help 
-</PRE>
-<P>Print a brief help summary and a list of options compiled into this
-executable for each LAMMPS style (atom_style, fix, compute,
-pair_style, bond_style, etc).  This can tell you if the command you
-want to use was included via the appropriate package at compile time.
-LAMMPS will print the info and immediately exit if this switch is
-used.
-</P>
 <PRE>-kokkos on/off keyword/value ... 
 </PRE>
 <P>Explicitly enable or disable KOKKOS support, as provided by the KOKKOS
@@ -1326,6 +1327,20 @@ references for specific cite-able features used during a LAMMPS run.
 See the <A HREF = "http://lammps.sandia.gov/cite.html">citation page</A> for more
 details.
 </P>
+<PRE>-package style args .... 
+</PRE>
+<P>Invoke the <A HREF = "package.html">package</A> command with style and args.  The
+syntax is the same as if the command appeared at the top of the input
+script.  For example "-package gpu 2" or "-pk gpu 2" is the same as
+<A HREF = "package.html">package gpu 2</A> in the input script.  The possible styles
+and args are documented on the <A HREF = "package.html">package</A> doc page.  This
+switch can be used multiple times, e.g. to set options for the
+USER-INTEL and USER-OMP packages which can be used together.
+</P>
+<P>Along with the "-suffix" command-line switch, this is a convenient
+mechanism for invoking accelerator packages and their options without
+having to edit an input script.
+</P>
 <PRE>-partition 8x2 4 5 ... 
 </PRE>
 <P>Invoke LAMMPS in multi-partition mode.  When LAMMPS is run on P
@@ -1484,7 +1499,7 @@ multi-partition mode, if the specified file is "none", then no screen
 output is performed. Option -pscreen will override the name of the
 partition screen files file.N.
 </P>
-<PRE>-suffix style args 
+<PRE>-suffix style 
 </PRE>
 <P>Use variants of various styles if they exist.  The specified style can
 be <I>cuda</I>, <I>gpu</I>, <I>intel</I>, <I>kk</I>, <I>omp</I>, or <I>opt</I>.  These refer to
@@ -1494,51 +1509,61 @@ package, the "gpu" style to the GPU package, the "intel" style to the
 USER-INTEL package, the "kk" style to the KOKKOS package, the "opt"
 style to the OPT package, and the "omp" style to the USER-OMP package.
 </P>
+<P>Along with the "-package" command-line switch, this is a convenient
+mechanism for invoking accelerator packages and their options without
+having to edit an input script.
+</P>
 <P>As an example, all of the packages provide a <A HREF = "pair_lj.html">pair_style
 lj/cut</A> variant, with style names lj/cut/cuda,
-lj/cut/gpu, lj/cut/intel, lj/cut/kk, lj/cut/omp, or lj/cut/opt.  A
-variant styles can be specified explicitly in your input script,
-e.g. pair_style lj/cut/gpu.  If the -suffix switch is used, you do not
-need to modify your input script.  The specified suffix
-(cuda,gpu,intel,kk,omp,opt) is automatically appended whenever your
-input script command creates a new <A HREF = "atom_style.html">atom</A>,
-<A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or
-<A HREF = "run_style.html">run</A> style.  If the variant version does not exist,
-the standard version is created.
+lj/cut/gpu, lj/cut/intel, lj/cut/kk, lj/cut/omp, and lj/cut/opt.  A
+variant style can be specified explicitly in your input script,
+e.g. pair_style lj/cut/gpu.  If the -suffix switch is used, the
+specified suffix (cuda,gpu,intel,kk,omp,opt) is automatically appended
+whenever your input script command creates a new
+<A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>,
+<A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style.  If the variant
+version does not exist, the standard version is created.
 </P>
 <P>For the GPU package, using this command-line switch also invokes the
-default GPU settings, as if the command "package gpu force/neigh 0 0
-1" were used at the top of your input script.  These settings can be
-changed by using the <A HREF = "package.html">package gpu</A> command in your script
-if desired.
+default GPU settings, as if the command "package gpu 1" were used at
+the top of your input script.  These settings can be changed by using
+the "-package gpu" command-line switch or the <A HREF = "package.html">package
+gpu</A> command in your script.
 </P>
-<P>For the USER-INTEL package, using this command-line switch also invokes the
-default USER-INTEL settings, as if the command "package intel * mixed
-balance -1" were used at the top of your input script.  These settings
-can be changed by using the <A HREF = "package.html">package intel</A> command in
-your script if desired.  If the USER-OMP package is installed, the
-intel suffix will make the omp suffix a second choice, if a requested
-style is not available in the USER-INTEL package.
+<P>For the USER-INTEL package, using this command-line switch also
+invokes the default USER-INTEL settings, as if the command "package
+intel 1" were used at the top of your input script.  These settings
+can be changed by using the "-package intel" command-line switch or
+the <A HREF = "package.html">package intel</A> command in your script.  If the
+USER-OMP package is also installed, the intel suffix will make the omp
+suffix a second choice, if a requested style is not available in the
+USER-INTEL package.  It will also invoke the default USER-OMP
+settings, as if the command "package omp 0" were used at the top of
+your input script.  These settings can be changed by using the
+"-package omp" command-line switch or the <A HREF = "package.html">package omp</A>
+command in your script.
 </P>
 <P>For the KOKKOS package, using this command-line switch also invokes
-the default KOKKOS settings, as if the command "package kokkos neigh
-full comm/exchange host comm/forward host " were used at the top of
-your input script.  These settings can be changed by using the
-<A HREF = "package.html">package kokkos</A> command in your script if desired.
+the default KOKKOS settings, as if the command "package kokkos" were
+used at the top of your input script.  These settings can be changed
+by using the "-package kokkos" command-line switch or the <A HREF = "package.html">package
+kokkos</A> command in your script.
 </P>
 <P>For the OMP package, using this command-line switch also invokes the
-default OMP settings, as if the command "package omp *" were used at
+default OMP settings, as if the command "package omp 0" were used at
 the top of your input script.  These settings can be changed by using
-the <A HREF = "package.html">package omp</A> command in your script if desired.
+the "-package omp" command-line switch or the <A HREF = "package.html">package
+omp</A> command in your script.
 </P>
-<P>The <A HREF = "suffix.html">suffix</A> command can also be used to set a suffix and
-it can also turn off or back on any suffix setting made via the
-command line.
+<P>The <A HREF = "suffix.html">suffix</A> command can also be used within an input
+script to set a suffix, or to turn off or back on any suffix setting
+made via the command line.
 </P>
 <PRE>-var name value1 value2 ... 
 </PRE>
 <P>Specify a variable that will be defined for substitution purposes when
-the input script is read.  "Name" is the variable name which can be a
+the input script is read.  This switch can be used multiple times to
+define multiple variables.  "Name" is the variable name which can be a
 single character (referenced as $x in the input script) or a full
 string (referenced as ${abc}).  An <A HREF = "variable.html">index-style
 variable</A> will be created and populated with the
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index 0408eb60d9..8c643fd243 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -1152,11 +1152,12 @@ letter abbreviation can be used:
 
 -c or -cuda
 -e or -echo
--i or -in
 -h or -help
+-i or -in
 -k or -kokkos
 -l or -log
 -nc or -nocite
+-pk or -package
 -p or -partition
 -pl or -plog
 -ps or -pscreen
@@ -1192,6 +1193,15 @@ can be useful to figure out which line of your script is causing an
 input error.  The default value is {log}.  The echo style can also be
 set by using the "echo"_echo.html command in the input script itself.
 
+-help :pre
+
+Print a brief help summary and a list of options compiled into this
+executable for each LAMMPS style (atom_style, fix, compute,
+pair_style, bond_style, etc).  This can tell you if the command you
+want to use was included via the appropriate package at compile time.
+LAMMPS will print the info and immediately exit if this switch is
+used.
+
 -in file :pre
 
 Specify a file to use as an input script.  This is an optional switch
@@ -1204,15 +1214,6 @@ Note that this is a required switch when running LAMMPS in
 multi-partition mode, since multiple processors cannot all read from
 stdin.
 
--help :pre
-
-Print a brief help summary and a list of options compiled into this
-executable for each LAMMPS style (atom_style, fix, compute,
-pair_style, bond_style, etc).  This can tell you if the command you
-want to use was included via the appropriate package at compile time.
-LAMMPS will print the info and immediately exit if this switch is
-used.
-
 -kokkos on/off keyword/value ... :pre
 
 Explicitly enable or disable KOKKOS support, as provided by the KOKKOS
@@ -1320,6 +1321,20 @@ references for specific cite-able features used during a LAMMPS run.
 See the "citation page"_http://lammps.sandia.gov/cite.html for more
 details.
 
+-package style args .... :pre
+
+Invoke the "package"_package.html command with style and args.  The
+syntax is the same as if the command appeared at the top of the input
+script.  For example "-package gpu 2" or "-pk gpu 2" is the same as
+"package gpu 2"_package.html in the input script.  The possible styles
+and args are documented on the "package"_package.html doc page.  This
+switch can be used multiple times, e.g. to set options for the
+USER-INTEL and USER-OMP packages which can be used together.
+
+Along with the "-suffix" command-line switch, this is a convenient
+mechanism for invoking accelerator packages and their options without
+having to edit an input script.
+
 -partition 8x2 4 5 ... :pre
 
 Invoke LAMMPS in multi-partition mode.  When LAMMPS is run on P
@@ -1478,7 +1493,7 @@ multi-partition mode, if the specified file is "none", then no screen
 output is performed. Option -pscreen will override the name of the
 partition screen files file.N.
 
--suffix style args :pre
+-suffix style :pre
 
 Use variants of various styles if they exist.  The specified style can
 be {cuda}, {gpu}, {intel}, {kk}, {omp}, or {opt}.  These refer to
@@ -1488,51 +1503,61 @@ package, the "gpu" style to the GPU package, the "intel" style to the
 USER-INTEL package, the "kk" style to the KOKKOS package, the "opt"
 style to the OPT package, and the "omp" style to the USER-OMP package.
 
+Along with the "-package" command-line switch, this is a convenient
+mechanism for invoking accelerator packages and their options without
+having to edit an input script.
+
 As an example, all of the packages provide a "pair_style
 lj/cut"_pair_lj.html variant, with style names lj/cut/cuda,
-lj/cut/gpu, lj/cut/intel, lj/cut/kk, lj/cut/omp, or lj/cut/opt.  A
-variant styles can be specified explicitly in your input script,
-e.g. pair_style lj/cut/gpu.  If the -suffix switch is used, you do not
-need to modify your input script.  The specified suffix
-(cuda,gpu,intel,kk,omp,opt) is automatically appended whenever your
-input script command creates a new "atom"_atom_style.html,
-"pair"_pair_style.html, "fix"_fix.html, "compute"_compute.html, or
-"run"_run_style.html style.  If the variant version does not exist,
-the standard version is created.
+lj/cut/gpu, lj/cut/intel, lj/cut/kk, lj/cut/omp, and lj/cut/opt.  A
+variant style can be specified explicitly in your input script,
+e.g. pair_style lj/cut/gpu.  If the -suffix switch is used, the
+specified suffix (cuda,gpu,intel,kk,omp,opt) is automatically appended
+whenever your input script command creates a new
+"atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html,
+"compute"_compute.html, or "run"_run_style.html style.  If the variant
+version does not exist, the standard version is created.
 
 For the GPU package, using this command-line switch also invokes the
-default GPU settings, as if the command "package gpu force/neigh 0 0
-1" were used at the top of your input script.  These settings can be
-changed by using the "package gpu"_package.html command in your script
-if desired.
+default GPU settings, as if the command "package gpu 1" were used at
+the top of your input script.  These settings can be changed by using
+the "-package gpu" command-line switch or the "package
+gpu"_package.html command in your script.
 
-For the USER-INTEL package, using this command-line switch also invokes the
-default USER-INTEL settings, as if the command "package intel * mixed
-balance -1" were used at the top of your input script.  These settings
-can be changed by using the "package intel"_package.html command in
-your script if desired.  If the USER-OMP package is installed, the
-intel suffix will make the omp suffix a second choice, if a requested
-style is not available in the USER-INTEL package.
+For the USER-INTEL package, using this command-line switch also
+invokes the default USER-INTEL settings, as if the command "package
+intel 1" were used at the top of your input script.  These settings
+can be changed by using the "-package intel" command-line switch or
+the "package intel"_package.html command in your script.  If the
+USER-OMP package is also installed, the intel suffix will make the omp
+suffix a second choice, if a requested style is not available in the
+USER-INTEL package.  It will also invoke the default USER-OMP
+settings, as if the command "package omp 0" were used at the top of
+your input script.  These settings can be changed by using the
+"-package omp" command-line switch or the "package omp"_package.html
+command in your script.
 
 For the KOKKOS package, using this command-line switch also invokes
-the default KOKKOS settings, as if the command "package kokkos neigh
-full comm/exchange host comm/forward host " were used at the top of
-your input script.  These settings can be changed by using the
-"package kokkos"_package.html command in your script if desired.
+the default KOKKOS settings, as if the command "package kokkos" were
+used at the top of your input script.  These settings can be changed
+by using the "-package kokkos" command-line switch or the "package
+kokkos"_package.html command in your script.
 
 For the OMP package, using this command-line switch also invokes the
-default OMP settings, as if the command "package omp *" were used at
+default OMP settings, as if the command "package omp 0" were used at
 the top of your input script.  These settings can be changed by using
-the "package omp"_package.html command in your script if desired.
+the "-package omp" command-line switch or the "package
+omp"_package.html command in your script.
 
-The "suffix"_suffix.html command can also be used to set a suffix and
-it can also turn off or back on any suffix setting made via the
-command line.
+The "suffix"_suffix.html command can also be used within an input
+script to set a suffix, or to turn off or back on any suffix setting
+made via the command line.
 
 -var name value1 value2 ... :pre
 
 Specify a variable that will be defined for substitution purposes when
-the input script is read.  "Name" is the variable name which can be a
+the input script is read.  This switch can be used multiple times to
+define multiple variables.  "Name" is the variable name which can be a
 single character (referenced as $x in the input script) or a full
 string (referenced as $\{abc\}).  An "index-style
 variable"_variable.html will be created and populated with the
diff --git a/doc/accelerate_cuda.html b/doc/accelerate_cuda.html
new file mode 100644
index 0000000000..4d005e3f93
--- /dev/null
+++ b/doc/accelerate_cuda.html
@@ -0,0 +1,218 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.1 USER-CUDA package 
+</H4>
+<P>The USER-CUDA package was developed by Christian Trott (Sandia) while
+at U Technology Ilmenau in Germany.  It provides NVIDIA GPU versions
+of many pair styles, many fixes, a few computes, and long-range
+Coulombics via the PPPM command.  It has the following general
+features:
+</P>
+<UL><LI>The package is designed to allow an entire LAMMPS calculation, for
+many timesteps, to run entirely on the GPU (except for inter-processor
+MPI communication), so that atom-based data (e.g. coordinates, forces)
+do not have to move back-and-forth between the CPU and GPU. 
+
+<LI>The speed-up advantage of this approach is typically better when the
+number of atoms per GPU is large. 
+
+<LI>Data will stay on the GPU until a timestep where a non-USER-CUDA fix
+or compute is invoked.  Whenever a non-GPU operation occurs (fix,
+compute, output), data automatically moves back to the CPU as needed.
+This may incur a performance penalty, but should otherwise work
+transparently. 
+
+<LI>Neighbor lists are constructed on the GPU. 
+
+<LI>The package only supports use of a single MPI task, running on a
+single CPU (core), assigned to each GPU. 
+</UL>
+<P>Here is a quick overview of how to use the USER-CUDA package:
+</P>
+<UL><LI>build the library in lib/cuda for your GPU hardware with desired precision
+<LI>include the USER-CUDA package and build LAMMPS
+<LI>use the mpirun command to specify 1 MPI task per GPU (on each node)
+<LI>enable the USER-CUDA package via the "-c on" command-line switch
+<LI>specify the # of GPUs per node
+<LI>use USER-CUDA styles in your input script 
+</UL>
+<P>The latter two steps can be done using the "-pk cuda" and "-sf cuda"
+<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the <A HREF = "package.html">package cuda</A> or <A HREF = "suffix.html">suffix cuda</A> commands
+respectively to your input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>To use this package, you need to have one or more NVIDIA GPUs and
+install the NVIDIA Cuda software on your system:
+</P>
+<P>Your NVIDIA GPU needs to support Compute Capability 1.3. This list may
+help you to find out the Compute Capability of your card:
+</P>
+<P>http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
+</P>
+<P>Install the Nvidia Cuda Toolkit (version 3.2 or higher) and the
+corresponding GPU drivers.  The Nvidia Cuda SDK is not required, but
+we recommend it also be installed.  You can then make sure its sample
+projects can be compiled without problems.
+</P>
+<P><B>Building LAMMPS with the USER-CUDA package:</B>
+</P>
+<P>This requires two steps (a,b): build the USER-CUDA library, then build
+LAMMPS with the USER-CUDA package.
+</P>
+<P>(a) Build the USER-CUDA library
+</P>
+<P>The USER-CUDA library is in lammps/lib/cuda.  If your <I>CUDA</I> toolkit
+is not installed in the default system directory <I>/usr/local/cuda</I>, edit
+the file <I>lib/cuda/Makefile.common</I> accordingly.
+</P>
+<P>To set options for the library build, type "make OPTIONS", where
+<I>OPTIONS</I> are one or more of the following. The settings will be
+written to the <I>lib/cuda/Makefile.defaults</I> and used when
+the library is built.
+</P>
+<PRE><I>precision=N</I> to set the precision level
+  N = 1 for single precision (default)
+  N = 2 for double precision
+  N = 3 for positions in double precision
+  N = 4 for positions and velocities in double precision
+<I>arch=M</I> to set GPU compute capability
+  M = 35 for Kepler GPUs
+  M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default)
+  M = 21 for CC2.1 (GF104/114,  e.g. GTX560, GTX460, GTX450)
+  M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
+<I>prec_timer=0/1</I> to use hi-precision timers
+  0 = do not use them (default)
+  1 = use them
+  this is usually only useful for Mac machines 
+<I>dbg=0/1</I> to activate debug mode
+  0 = no debug mode (default)
+  1 = yes debug mode
+  this is only useful for developers
+<I>cufft=1</I> for use of the CUDA FFT library
+  0 = no CUFFT support (default)
+  in the future other CUDA-enabled FFT libraries might be supported 
+</PRE>
+<P>To build the library, simply type:
+</P>
+<PRE>make 
+</PRE>
+<P>If successful, it will produce the files libcuda.a and Makefile.lammps.
+</P>
+<P>Note that if you change any of the options (like precision), you need
+to re-build the entire library.  Do a "make clean" first, followed by
+"make".
+</P>
+<P>(b) Build LAMMPS with the USER-CUDA package
+</P>
+<PRE>cd lammps/src
+make yes-user-cuda
+make machine 
+</PRE>
+<P>No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+</P>
+<P>Note that if you change the USER-CUDA library precision (discussed
+above) and rebuild the USER-CUDA library, then you also need to
+re-install the USER-CUDA package and re-build LAMMPS, so that all
+affected files are re-compiled and linked to the new USER-CUDA
+library.
+</P>
+<P><B>Run with the USER-CUDA package from the command line:</B>
+</P>
+<P>The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+</P>
+<P>When using the USER-CUDA package, you must use exactly one MPI task
+per physical GPU.
+</P>
+<P>You must use the "-c on" <A HREF = "Section_start.html#start_7">command-line
+switch</A> to enable the USER-CUDA package.
+The "-c on" switch also issues a default <A HREF = "package.html">package cuda 1</A>
+command which sets various USER-CUDA options to default values, as
+discussed on the <A HREF = "package.html">package</A> command doc page.
+</P>
+<P>Use the "-sf cuda" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "cuda" to styles that support it.  Use
+the "-pk cuda Ng" <A HREF = "Section_start.html#start_7">command-line switch</A> to
+set Ng = # of GPUs per node to a different value than the default set
+by the "-c on" switch (1 GPU) or change other <A HREF = "package.html">package
+cuda</A> options.
+</P>
+<PRE>lmp_machine -c on -sf cuda -pk cuda 1 -in in.script                       # 1 MPI task uses 1 GPU
+mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script          # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node
+mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script  # ditto on 12 16-core nodes 
+</PRE>
+<P>The syntax for the "-pk" switch is the same as the "package
+cuda" command.  See the <A HREF = "package.html">package</A> command doc page for
+details, including the default values used for all its options if it
+is not specified.
+</P>
+<P>Note that the default for the <A HREF = "package.html">package cuda</A> command is
+to set the Newton flag to "off" for both pairwise and bonded
+interactions.  This typically gives fastest performance.  If the
+<A HREF = "newton.html">newton</A> command is used in the input script, it can
+override these defaults.
+</P>
+<P><B>Or run with the USER-CUDA package by editing an input script:</B>
+</P>
+<P>The discussion above for the mpirun/mpiexec command and the requirement
+of one MPI task per GPU is the same.
+</P>
+<P>You must still use the "-c on" <A HREF = "Section_start.html#start_7">command-line
+switch</A> to enable the USER-CUDA package.
+</P>
+<P>Use the <A HREF = "suffix.html">suffix cuda</A> command, or you can explicitly add a
+"cuda" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/cuda 2.5 
+</PRE>
+<P>You only need to use the <A HREF = "package.html">package cuda</A> command if you
+wish to change any of its option defaults, including the number of
+GPUs/node (default = 1), as set by the "-c on" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.
+</P>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>The performance of a GPU versus a multi-core CPU is a function of your
+hardware, which pair style is used, the number of atoms/GPU, and the
+precision used on the GPU (double, single, mixed).
+</P>
+<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
+LAMMPS web site for performance of the USER-CUDA package on different
+hardware.
+</P>
+<P><B>Guidelines for best performance:</B>
+</P>
+<UL><LI>The USER-CUDA package offers more speed-up relative to CPU performance
+when the number of atoms per GPU is large, e.g. on the order of tens
+or hundreds of 1000s. 
+
+<LI>As noted above, this package will continue to run a simulation
+entirely on the GPU(s) (except for inter-processor MPI communication),
+for multiple timesteps, until a CPU calculation is required, either by
+a fix or compute that is non-GPU-ized, or until output is performed
+(thermo or dump snapshot or restart file).  The less often this
+occurs, the faster your simulation will run. 
+</UL>
+<P><B>Restrictions:</B>
+</P>
+<P>None.
+</P>
+</HTML>
diff --git a/doc/accelerate_cuda.txt b/doc/accelerate_cuda.txt
new file mode 100644
index 0000000000..d88094ecbf
--- /dev/null
+++ b/doc/accelerate_cuda.txt
@@ -0,0 +1,213 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.1 USER-CUDA package :h4
+
+The USER-CUDA package was developed by Christian Trott (Sandia) while
+at U Technology Ilmenau in Germany.  It provides NVIDIA GPU versions
+of many pair styles, many fixes, a few computes, and long-range
+Coulombics via the PPPM command.  It has the following general
+features:
+
+The package is designed to allow an entire LAMMPS calculation, for
+many timesteps, to run entirely on the GPU (except for inter-processor
+MPI communication), so that atom-based data (e.g. coordinates, forces)
+do not have to move back-and-forth between the CPU and GPU. :ulb,l
+
+The speed-up advantage of this approach is typically better when the
+number of atoms per GPU is large. :l
+
+Data will stay on the GPU until a timestep where a non-USER-CUDA fix
+or compute is invoked.  Whenever a non-GPU operation occurs (fix,
+compute, output), data automatically moves back to the CPU as needed.
+This may incur a performance penalty, but should otherwise work
+transparently. :l
+
+Neighbor lists are constructed on the GPU. :l
+
+The package only supports use of a single MPI task, running on a
+single CPU (core), assigned to each GPU. :l,ule
+
+Here is a quick overview of how to use the USER-CUDA package:
+
+build the library in lib/cuda for your GPU hardware with desired precision
+include the USER-CUDA package and build LAMMPS
+use the mpirun command to specify 1 MPI task per GPU (on each node)
+enable the USER-CUDA package via the "-c on" command-line switch
+specify the # of GPUs per node
+use USER-CUDA styles in your input script :ul
+
+The latter two steps can be done using the "-pk cuda" and "-sf cuda"
+"command-line switches"_Section_start.html#start_7 respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the "package cuda"_package.html or "suffix cuda"_suffix.html commands
+respectively to your input script.
+
+[Required hardware/software:]
+
+To use this package, you need to have one or more NVIDIA GPUs and
+install the NVIDIA Cuda software on your system:
+
+Your NVIDIA GPU needs to support Compute Capability 1.3. This list may
+help you to find out the Compute Capability of your card:
+
+http://en.wikipedia.org/wiki/Comparison_of_Nvidia_graphics_processing_units
+
+Install the Nvidia Cuda Toolkit (version 3.2 or higher) and the
+corresponding GPU drivers.  The Nvidia Cuda SDK is not required, but
+we recommend it also be installed.  You can then make sure its sample
+projects can be compiled without problems.
+
+[Building LAMMPS with the USER-CUDA package:]
+
+This requires two steps (a,b): build the USER-CUDA library, then build
+LAMMPS with the USER-CUDA package.
+
+(a) Build the USER-CUDA library
+
+The USER-CUDA library is in lammps/lib/cuda.  If your {CUDA} toolkit
+is not installed in the default system directory {/usr/local/cuda}, edit
+the file {lib/cuda/Makefile.common} accordingly.
+
+To set options for the library build, type "make OPTIONS", where
+{OPTIONS} are one or more of the following. The settings will be
+written to the {lib/cuda/Makefile.defaults} and used when
+the library is built.
+
+{precision=N} to set the precision level
+  N = 1 for single precision (default)
+  N = 2 for double precision
+  N = 3 for positions in double precision
+  N = 4 for positions and velocities in double precision
+{arch=M} to set GPU compute capability
+  M = 35 for Kepler GPUs
+  M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470) (default)
+  M = 21 for CC2.1 (GF104/114,  e.g. GTX560, GTX460, GTX450)
+  M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
+{prec_timer=0/1} to use hi-precision timers
+  0 = do not use them (default)
+  1 = use them
+  this is usually only useful for Mac machines 
+{dbg=0/1} to activate debug mode
+  0 = no debug mode (default)
+  1 = yes debug mode
+  this is only useful for developers
+{cufft=1} for use of the CUDA FFT library
+  0 = no CUFFT support (default)
+  in the future other CUDA-enabled FFT libraries might be supported :pre
+
+To build the library, simply type:
+
+make :pre
+
+If successful, it will produce the files libcuda.a and Makefile.lammps.
+
+Note that if you change any of the options (like precision), you need
+to re-build the entire library.  Do a "make clean" first, followed by
+"make".
+
+(b) Build LAMMPS with the USER-CUDA package
+
+cd lammps/src
+make yes-user-cuda
+make machine :pre
+
+No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+
+Note that if you change the USER-CUDA library precision (discussed
+above) and rebuild the USER-CUDA library, then you also need to
+re-install the USER-CUDA package and re-build LAMMPS, so that all
+affected files are re-compiled and linked to the new USER-CUDA
+library.
+
+[Run with the USER-CUDA package from the command line:]
+
+The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+
+When using the USER-CUDA package, you must use exactly one MPI task
+per physical GPU.
+
+You must use the "-c on" "command-line
+switch"_Section_start.html#start_7 to enable the USER-CUDA package.
+The "-c on" switch also issues a default "package cuda 1"_package.html
+command which sets various USER-CUDA options to default values, as
+discussed on the "package"_package.html command doc page.
+
+Use the "-sf cuda" "command-line switch"_Section_start.html#start_7,
+which will automatically append "cuda" to styles that support it.  Use
+the "-pk cuda Ng" "command-line switch"_Section_start.html#start_7 to
+set Ng = # of GPUs per node to a different value than the default set
+by the "-c on" switch (1 GPU) or change other "package
+cuda"_package.html options.
+
+lmp_machine -c on -sf cuda -pk cuda 1 -in in.script                       # 1 MPI task uses 1 GPU
+mpirun -np 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script          # 2 MPI tasks use 2 GPUs on a single 16-core (or whatever) node
+mpirun -np 24 -ppn 2 lmp_machine -c on -sf cuda -pk cuda 2 -in in.script  # ditto on 12 16-core nodes :pre
+
+The syntax for the "-pk" switch is the same as the "package
+cuda" command.  See the "package"_package.html command doc page for
+details, including the default values used for all its options if it
+is not specified.
+
+Note that the default for the "package cuda"_package.html command is
+to set the Newton flag to "off" for both pairwise and bonded
+interactions.  This typically gives fastest performance.  If the
+"newton"_newton.html command is used in the input script, it can
+override these defaults.
+
+[Or run with the USER-CUDA package by editing an input script:]
+
+The discussion above for the mpirun/mpiexec command and the requirement
+of one MPI task per GPU is the same.
+
+You must still use the "-c on" "command-line
+switch"_Section_start.html#start_7 to enable the USER-CUDA package.
+
+Use the "suffix cuda"_suffix.html command, or you can explicitly add a
+"cuda" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/cuda 2.5 :pre
+
+You only need to use the "package cuda"_package.html command if you
+wish to change any of its option defaults, including the number of
+GPUs/node (default = 1), as set by the "-c on" "command-line
+switch"_Section_start.html#start_7.
+
+[Speed-ups to expect:]
+
+The performance of a GPU versus a multi-core CPU is a function of your
+hardware, which pair style is used, the number of atoms/GPU, and the
+precision used on the GPU (double, single, mixed).
+
+See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
+LAMMPS web site for performance of the USER-CUDA package on different
+hardware.
+
+[Guidelines for best performance:]
+
+The USER-CUDA package offers more speed-up relative to CPU performance
+when the number of atoms per GPU is large, e.g. on the order of tens
+or hundreds of 1000s. :ulb,l
+
+As noted above, this package will continue to run a simulation
+entirely on the GPU(s) (except for inter-processor MPI communication),
+for multiple timesteps, until a CPU calculation is required, either by
+a fix or compute that is non-GPU-ized, or until output is performed
+(thermo or dump snapshot or restart file).  The less often this
+occurs, the faster your simulation will run. :l,ule
+
+[Restrictions:]
+
+None.
diff --git a/doc/accelerate_gpu.html b/doc/accelerate_gpu.html
new file mode 100644
index 0000000000..d09eb331c8
--- /dev/null
+++ b/doc/accelerate_gpu.html
@@ -0,0 +1,248 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.2 GPU package 
+</H4>
+<P>The GPU package was developed by Mike Brown at ORNL and his
+collaborators, particularly Trung Nguyen (ORNL).  It provides GPU
+versions of many pair styles, including the 3-body Stillinger-Weber
+pair style, and for <A HREF = "kspace_style.html">kspace_style pppm</A> for
+long-range Coulombics.  It has the following general features:
+</P>
+<UL><LI>It is designed to exploit common GPU hardware configurations where one
+or more GPUs are coupled to many cores of one or more multi-core CPUs,
+e.g. within a node of a parallel machine. 
+
+<LI>Atom-based data (e.g. coordinates, forces) moves back-and-forth
+between the CPU(s) and GPU every timestep. 
+
+<LI>Neighbor lists can be built on the CPU or on the GPU 
+
+<LI>The charge assignment and force interpolation portions of PPPM can be
+run on the GPU.  The FFT portion, which requires MPI communication
+between processors, runs on the CPU. 
+
+<LI>Asynchronous force computations can be performed simultaneously on the
+CPU(s) and GPU. 
+
+<LI>It allows for GPU computations to be performed in single or double
+precision, or in mixed-mode precision, where pairwise forces are
+computed in single precision, but accumulated into double-precision
+force vectors. 
+
+<LI>LAMMPS-specific code is in the GPU package.  It makes calls to a
+generic GPU library in the lib/gpu directory.  This library provides
+NVIDIA support as well as more general OpenCL support, so that the
+same functionality can eventually be supported on a variety of GPU
+hardware. 
+</UL>
+<P>Here is a quick overview of how to use the GPU package:
+</P>
+<UL><LI>build the library in lib/gpu for your GPU hardware with desired precision
+<LI>include the GPU package and build LAMMPS
+<LI>use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU
+<LI>specify the # of GPUs per node
+<LI>use GPU styles in your input script 
+</UL>
+<P>The latter two steps can be done using the "-pk gpu" and "-sf gpu"
+<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the <A HREF = "package.html">package gpu</A> or <A HREF = "suffix.html">suffix gpu</A> commands
+respectively to your input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>To use this package, you currently need to have an NVIDIA GPU and
+install the NVIDIA Cuda software on your system:
+</P>
+<UL><LI>Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
+<LI>Go to http://www.nvidia.com/object/cuda_get.html
+<LI>Install a driver and toolkit appropriate for your system (SDK is not necessary)
+<LI>Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties 
+</UL>
+<P><B>Building LAMMPS with the GPU package:</B>
+</P>
+<P>This requires two steps (a,b): build the GPU library, then build
+LAMMPS with the GPU package.
+</P>
+<P>(a) Build the GPU library
+</P>
+<P>The GPU library is in lammps/lib/gpu.  Select a Makefile.machine (in
+lib/gpu) appropriate for your system.  You should pay special
+attention to 3 settings in this makefile.
+</P>
+<UL><LI>CUDA_HOME = needs to be where NVIDIA Cuda software is installed on your system
+<LI>CUDA_ARCH = needs to be appropriate to your GPUs
+<LI>CUDA_PREC = precision (double, mixed, single) you desire 
+</UL>
+<P>See lib/gpu/Makefile.linux.double for examples of the ARCH settings
+for different GPU choices, e.g. Fermi vs Kepler.  It also lists the
+possible precision settings:
+</P>
+<PRE>CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
+CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
+CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double 
+</PRE>
+<P>The last setting is the mixed mode referred to above.  Note that your
+GPU must support double precision to use either the 2nd or 3rd of
+these settings.
+</P>
+<P>To build the library, type:
+</P>
+<PRE>make -f Makefile.machine 
+</PRE>
+<P>If successful, it will produce the files libgpu.a and Makefile.lammps.
+</P>
+<P>The latter file has 3 settings that need to be appropriate for the
+paths and settings for the CUDA system software on your machine.
+Makefile.lammps is a copy of the file specified by the EXTRAMAKE
+setting in Makefile.machine.  You can change EXTRAMAKE or create your
+own Makefile.lammps.machine if needed.
+</P>
+<P>Note that to change the precision of the GPU library, you need to
+re-build the entire library.  Do a "clean" first, e.g. "make -f
+Makefile.linux clean", followed by the make command above.
+</P>
+<P>(b) Build LAMMPS with the GPU package
+</P>
+<PRE>cd lammps/src
+make yes-gpu
+make machine 
+</PRE>
+<P>No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+</P>
+<P>Note that if you change the GPU library precision (discussed above)
+and rebuild the GPU library, then you also need to re-install the GPU
+package and re-build LAMMPS, so that all affected files are
+re-compiled and linked to the new GPU library.
+</P>
+<P><B>Run with the GPU package from the command line:</B>
+</P>
+<P>The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+</P>
+<P>When using the GPU package, you cannot assign more than one GPU to a
+single MPI task.  However multiple MPI tasks can share the same GPU,
+and in many cases it will be more efficient to run this way.  Likewise
+it may be more efficient to use fewer MPI tasks/node than the available
+# of CPU cores.  Assignment of multiple MPI tasks to a GPU will happen
+automatically if you create more MPI tasks/node than there are
+GPUs/node.  E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
+shared by 4 MPI tasks.
+</P>
+<P>Use the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "gpu" to styles that support it.  Use
+the "-pk gpu Ng" <A HREF = "Section_start.html#start_7">command-line switch</A> to
+set Ng = # of GPUs/node to use.
+</P>
+<PRE>lmp_machine -sf gpu -pk gpu 1 -in in.script                         # 1 MPI task uses 1 GPU
+mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script           # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
+mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script   # ditto on 4 16-core nodes 
+</PRE>
+<P>Note that if the "-sf gpu" switch is used, it also issues a default
+<A HREF = "package.html">package gpu 1</A> command, which sets the number of
+GPUs/node to 1.
+</P>
+<P>Using the "-pk" switch explicitly allows for setting of the number of
+GPUs/node to use and additional options.  Its syntax is the same as
+the "package gpu" command.  See the <A HREF = "package.html">package</A>
+command doc page for details, including the default values used for
+all its options if it is not specified.
+</P>
+<P>Note that the default for the <A HREF = "package.html">package gpu</A> command is to
+set the Newton flag to "off" for pairwise interactions.  It does not
+affect the setting for bonded interactions (LAMMPS default is "on").
+The "off" setting for pairwise interaction is currently required for
+GPU package pair styles.
+</P>
+<P><B>Or run with the GPU package by editing an input script:</B>
+</P>
+<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+and use of multiple MPI tasks/GPU is the same.
+</P>
+<P>Use the <A HREF = "suffix.html">suffix gpu</A> command, or you can explicitly add a
+"gpu" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/gpu 2.5 
+</PRE>
+<P>You must also use the <A HREF = "package.html">package gpu</A> command to enable the
+GPU package, unless the "-sf gpu" or "-pk gpu" <A HREF = "Section_start.html#start_7">command-line
+switches</A> were used.  It specifies the
+number of GPUs/node to use, as well as other options.
+</P>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>The performance of a GPU versus a multi-core CPU is a function of your
+hardware, which pair style is used, the number of atoms/GPU, and the
+precision used on the GPU (double, single, mixed).
+</P>
+<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
+LAMMPS web site for performance of the GPU package on various
+hardware, including the Titan HPC platform at ORNL.
+</P>
+<P>You should also experiment with how many MPI tasks per GPU to use to
+give the best performance for your problem and machine.  This is also
+a function of the problem size and the pair style being used.
+Likewise, you should experiment with the precision setting for the GPU
+library to see if single or mixed precision will give accurate
+results, since they will typically be faster.
+</P>
+<P><B>Guidelines for best performance:</B>
+</P>
+<UL><LI>Using multiple MPI tasks per GPU will often give the best performance,
+as allowed by most multi-core CPU/GPU configurations. 
+
+<LI>If the number of particles per MPI task is small (e.g. 100s of
+particles), it can be more efficient to run with fewer MPI tasks per
+GPU, even if you do not use all the cores on the compute node. 
+
+<LI>The <A HREF = "package.html">package gpu</A> command has several options for tuning
+performance.  Neighbor lists can be built on the GPU or CPU.  Force
+calculations can be dynamically balanced across the CPU cores and
+GPUs.  GPU-specific settings can be made which can be optimized
+for different hardware.  See the <A HREF = "package.html">package</A> command
+doc page for details. 
+
+<LI>As described by the <A HREF = "package.html">package gpu</A> command, GPU
+accelerated pair styles can perform computations asynchronously with
+CPU computations. The "Pair" time reported by LAMMPS will be the
+maximum of the time required to complete the CPU pair style
+computations and the time required to complete the GPU pair style
+computations. Any time spent for GPU-enabled pair styles for
+computations that run simultaneously with <A HREF = "bond_style.html">bond</A>,
+<A HREF = "angle_style.html">angle</A>, <A HREF = "dihedral_style.html">dihedral</A>,
+<A HREF = "improper_style.html">improper</A>, and <A HREF = "kspace_style.html">long-range</A>
+calculations will not be included in the "Pair" time. 
+
+<LI>When the <I>mode</I> setting for the package gpu command is force/neigh,
+the time for neighbor list calculations on the GPU will be added into
+the "Pair" time, not the "Neigh" time.  An additional breakdown of the
+times required for various tasks on the GPU (data copy, neighbor
+calculations, force computations, etc) is output only with the LAMMPS
+screen output (not in the log file) at the end of each run.  These
+timings represent total time spent on the GPU for each routine,
+regardless of asynchronous CPU calculations. 
+
+<LI>The output section "GPU Time Info (average)" reports "Max Mem / Proc".
+This is the maximum memory used at one time on the GPU for data
+storage by a single MPI process. 
+</UL>
+<P><B>Restrictions:</B>
+</P>
+<P>None.
+</P>
+</HTML>
diff --git a/doc/accelerate_gpu.txt b/doc/accelerate_gpu.txt
new file mode 100644
index 0000000000..e221e2342c
--- /dev/null
+++ b/doc/accelerate_gpu.txt
@@ -0,0 +1,243 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.2 GPU package :h4
+
+The GPU package was developed by Mike Brown at ORNL and his
+collaborators, particularly Trung Nguyen (ORNL).  It provides GPU
+versions of many pair styles, including the 3-body Stillinger-Weber
+pair style, and for "kspace_style pppm"_kspace_style.html for
+long-range Coulombics.  It has the following general features:
+
+It is designed to exploit common GPU hardware configurations where one
+or more GPUs are coupled to many cores of one or more multi-core CPUs,
+e.g. within a node of a parallel machine. :ulb,l
+
+Atom-based data (e.g. coordinates, forces) moves back-and-forth
+between the CPU(s) and GPU every timestep. :l
+
+Neighbor lists can be built on the CPU or on the GPU :l
+
+The charge assignment and force interpolation portions of PPPM can be
+run on the GPU.  The FFT portion, which requires MPI communication
+between processors, runs on the CPU. :l
+
+Asynchronous force computations can be performed simultaneously on the
+CPU(s) and GPU. :l
+
+It allows for GPU computations to be performed in single or double
+precision, or in mixed-mode precision, where pairwise forces are
+computed in single precision, but accumulated into double-precision
+force vectors. :l
+
+LAMMPS-specific code is in the GPU package.  It makes calls to a
+generic GPU library in the lib/gpu directory.  This library provides
+NVIDIA support as well as more general OpenCL support, so that the
+same functionality can eventually be supported on a variety of GPU
+hardware. :l,ule
+
+Here is a quick overview of how to use the GPU package:
+
+build the library in lib/gpu for your GPU hardware with desired precision
+include the GPU package and build LAMMPS
+use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU
+specify the # of GPUs per node
+use GPU styles in your input script :ul
+
+The latter two steps can be done using the "-pk gpu" and "-sf gpu"
+"command-line switches"_Section_start.html#start_7 respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the "package gpu"_package.html or "suffix gpu"_suffix.html commands
+respectively to your input script.
+
+[Required hardware/software:]
+
+To use this package, you currently need to have an NVIDIA GPU and
+install the NVIDIA Cuda software on your system:
+
+Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
+Go to http://www.nvidia.com/object/cuda_get.html
+Install a driver and toolkit appropriate for your system (SDK is not necessary)
+Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties :ul
+
+[Building LAMMPS with the GPU package:]
+
+This requires two steps (a,b): build the GPU library, then build
+LAMMPS with the GPU package.
+
+(a) Build the GPU library
+
+The GPU library is in lammps/lib/gpu.  Select a Makefile.machine (in
+lib/gpu) appropriate for your system.  You should pay special
+attention to 3 settings in this makefile.
+
+CUDA_HOME = needs to be where NVIDIA Cuda software is installed on your system
+CUDA_ARCH = needs to be appropriate to your GPUs
+CUDA_PREC = precision (double, mixed, single) you desire :ul
+
+See lib/gpu/Makefile.linux.double for examples of the ARCH settings
+for different GPU choices, e.g. Fermi vs Kepler.  It also lists the
+possible precision settings:
+
+CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
+CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
+CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double :pre
+
+The last setting is the mixed mode referred to above.  Note that your
+GPU must support double precision to use either the 2nd or 3rd of
+these settings.
+
+To build the library, type:
+
+make -f Makefile.machine :pre
+
+If successful, it will produce the files libgpu.a and Makefile.lammps.
+
+The latter file has 3 settings that need to be appropriate for the
+paths and settings for the CUDA system software on your machine.
+Makefile.lammps is a copy of the file specified by the EXTRAMAKE
+setting in Makefile.machine.  You can change EXTRAMAKE or create your
+own Makefile.lammps.machine if needed.
+
+Note that to change the precision of the GPU library, you need to
+re-build the entire library.  Do a "clean" first, e.g. "make -f
+Makefile.linux clean", followed by the make command above.
+
+(b) Build LAMMPS with the GPU package
+
+cd lammps/src
+make yes-gpu
+make machine :pre
+
+No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+
+Note that if you change the GPU library precision (discussed above)
+and rebuild the GPU library, then you also need to re-install the GPU
+package and re-build LAMMPS, so that all affected files are
+re-compiled and linked to the new GPU library.
+
+[Run with the GPU package from the command line:]
+
+The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+
+When using the GPU package, you cannot assign more than one GPU to a
+single MPI task.  However multiple MPI tasks can share the same GPU,
+and in many cases it will be more efficient to run this way.  Likewise
+it may be more efficient to use fewer MPI tasks/node than the available
+# of CPU cores.  Assignment of multiple MPI tasks to a GPU will happen
+automatically if you create more MPI tasks/node than there are
+GPUs/node.  E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
+shared by 4 MPI tasks.
+
+Use the "-sf gpu" "command-line switch"_Section_start.html#start_7,
+which will automatically append "gpu" to styles that support it.  Use
+the "-pk gpu Ng" "command-line switch"_Section_start.html#start_7 to
+set Ng = # of GPUs/node to use.
+
+lmp_machine -sf gpu -pk gpu 1 -in in.script                         # 1 MPI task uses 1 GPU
+mpirun -np 12 lmp_machine -sf gpu -pk gpu 2 -in in.script           # 12 MPI tasks share 2 GPUs on a single 16-core (or whatever) node
+mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script   # ditto on 4 16-core nodes :pre
+
+Note that if the "-sf gpu" switch is used, it also issues a default
+"package gpu 1"_package.html command, which sets the number of
+GPUs/node to 1.
+
+Using the "-pk" switch explicitly allows for setting of the number of
+GPUs/node to use and additional options.  Its syntax is the same as
+the "package gpu" command.  See the "package"_package.html
+command doc page for details, including the default values used for
+all its options if it is not specified.
+
+Note that the default for the "package gpu"_package.html command is to
+set the Newton flag to "off" for pairwise interactions.  It does not
+affect the setting for bonded interactions (LAMMPS default is "on").
+The "off" setting for pairwise interaction is currently required for
+GPU package pair styles.
+
+[Or run with the GPU package by editing an input script:]
+
+The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+and use of multiple MPI tasks/GPU is the same.
+
+Use the "suffix gpu"_suffix.html command, or you can explicitly add a
+"gpu" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/gpu 2.5 :pre
+
+You must also use the "package gpu"_package.html command to enable the
+GPU package, unless the "-sf gpu" or "-pk gpu" "command-line
+switches"_Section_start.html#start_7 were used.  It specifies the
+number of GPUs/node to use, as well as other options.
+
+[Speed-ups to expect:]
+
+The performance of a GPU versus a multi-core CPU is a function of your
+hardware, which pair style is used, the number of atoms/GPU, and the
+precision used on the GPU (double, single, mixed).
+
+See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
+LAMMPS web site for performance of the GPU package on various
+hardware, including the Titan HPC platform at ORNL.
+
+You should also experiment with how many MPI tasks per GPU to use to
+give the best performance for your problem and machine.  This is also
+a function of the problem size and the pair style being used.
+Likewise, you should experiment with the precision setting for the GPU
+library to see if single or mixed precision will give accurate
+results, since they will typically be faster.
+
+[Guidelines for best performance:]
+
+Using multiple MPI tasks per GPU will often give the best performance,
+as allowed by most multi-core CPU/GPU configurations. :ulb,l
+
+If the number of particles per MPI task is small (e.g. 100s of
+particles), it can be more efficient to run with fewer MPI tasks per
+GPU, even if you do not use all the cores on the compute node. :l
+
+The "package gpu"_package.html command has several options for tuning
+performance.  Neighbor lists can be built on the GPU or CPU.  Force
+calculations can be dynamically balanced across the CPU cores and
+GPUs.  GPU-specific settings can be made which can be optimized
+for different hardware.  See the "package"_package.html command
+doc page for details. :l
+
+As described by the "package gpu"_package.html command, GPU
+accelerated pair styles can perform computations asynchronously with
+CPU computations. The "Pair" time reported by LAMMPS will be the
+maximum of the time required to complete the CPU pair style
+computations and the time required to complete the GPU pair style
+computations. Any time spent for GPU-enabled pair styles for
+computations that run simultaneously with "bond"_bond_style.html,
+"angle"_angle_style.html, "dihedral"_dihedral_style.html,
+"improper"_improper_style.html, and "long-range"_kspace_style.html
+calculations will not be included in the "Pair" time. :l
+
+When the {mode} setting for the package gpu command is force/neigh,
+the time for neighbor list calculations on the GPU will be added into
+the "Pair" time, not the "Neigh" time.  An additional breakdown of the
+times required for various tasks on the GPU (data copy, neighbor
+calculations, force computations, etc) is output only with the LAMMPS
+screen output (not in the log file) at the end of each run.  These
+timings represent total time spent on the GPU for each routine,
+regardless of asynchronous CPU calculations. :l
+
+The output section "GPU Time Info (average)" reports "Max Mem / Proc".
+This is the maximum memory used at one time on the GPU for data
+storage by a single MPI process. :l,ule
+
+[Restrictions:]
+
+None.
diff --git a/doc/accelerate_intel.html b/doc/accelerate_intel.html
new file mode 100644
index 0000000000..a96a13b1d3
--- /dev/null
+++ b/doc/accelerate_intel.html
@@ -0,0 +1,304 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.3 USER-INTEL package 
+</H4>
+<P>The USER-INTEL package was developed by Mike Brown at Intel
+Corporation.  It provides a capability to accelerate simulations by
+offloading neighbor list and non-bonded force calculations to Intel(R)
+Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package).
+Additionally, it supports running simulations in single, mixed, or
+double precision with vectorization, even if a coprocessor is not
+present, i.e. on an Intel(R) CPU.  The same C++ code is used for both
+cases.  When offloading to a coprocessor, the routine is run twice,
+once with an offload flag.
+</P>
+<P>The USER-INTEL package can be used in tandem with the USER-OMP
+package.  This is useful when offloading pair style computations to
+coprocessors, so that other styles not supported by the USER-INTEL
+package, e.g. bond, angle, dihedral, improper, and long-range
+electrostatics, can be run simultaneously in threaded mode on CPU
+cores.  Since fewer MPI tasks than CPU cores will typically be invoked
+when running with coprocessors, this enables the extra cores to be
+utilized for useful computation.
+</P>
+<P>If LAMMPS is built with both the USER-INTEL and USER-OMP packages
+installed, this mode of operation is made easier to use, because the
+"-suffix intel" <A HREF = "Section_start.html#start_7">command-line switch</A> or
+the <A HREF = "suffix.html">suffix intel</A> command will both set a second-choice
+suffix to "omp" so that styles from the USER-OMP package will be used
+if available, after first testing if a style from the USER-INTEL
+package is available.
+</P>
+<P>Here is a quick overview of how to use the USER-INTEL package
+for CPU acceleration:
+</P>
+<UL><LI>specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, and -restrict, -xHost
+<LI>specify -fopenmp with LINKFLAGS in your Makefile.machine
+<LI>include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS
+<LI>if using the USER-OMP package, specify how many threads per MPI task to use
+<LI>use USER-INTEL styles in your input script 
+</UL>
+<P>Using the USER-INTEL package to offload work to the Intel(R)
+Xeon Phi(TM) coprocessor is the same except for these additional
+steps:
+</P>
+<UL><LI>add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine
+<LI>add the flag -offload to LINKFLAGS in your Makefile.machine
+<LI>specify how many threads per coprocessor to use 
+</UL>
+<P>The latter two steps in the first case and the last step in the
+coprocessor case can be done using the "-pk omp" and "-sf intel" and
+"-pk intel" <A HREF = "Section_start.html#start_7">command-line switches</A>
+respectively.  Or the effect of the "-pk" or "-sf" switches can be
+duplicated by adding the <A HREF = "package.html">package omp</A> or <A HREF = "suffix.html">suffix
+intel</A> or <A HREF = "package.html">package intel</A> commands
+respectively to your input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>To use the offload option, you must have one or more Intel(R) Xeon
+Phi(TM) coprocessors.
+</P>
+<P>Optimizations for vectorization have only been tested with the
+Intel(R) compiler.  Use of other compilers may not result in
+vectorization or may give poor performance.
+</P>
+<P>Use of an Intel C++ compiler is recommended, but not required.  The
+compiler must support the OpenMP interface.
+</P>
+<P><B>Building LAMMPS with the USER-INTEL package:</B>
+</P>
+<P>Include the package(s) and build LAMMPS:  
+</P>
+<PRE>cd lammps/src
+make yes-user-intel
+make yes-user-omp (if desired)
+make machine 
+</PRE>
+<P>If the USER-OMP package is also installed, you can use styles from
+both packages, as described below.
+</P>
+<P>The low-level src/MAKE/Makefile.machine needs a flag for OpenMP support
+in both the CCFLAGS and LINKFLAGS variables, which is <I>-openmp</I> for
+Intel compilers.  You also need to add -DLAMMPS_MEMALIGN=64 and
+-restrict to CCFLAGS.
+</P>
+<P>If you are compiling on the same architecture that will be used for
+the runs, adding the flag <I>-xHost</I> to CCFLAGS will enable
+vectorization with the Intel(R) compiler.
+</P>
+<P>In order to build with support for an Intel(R) coprocessor, the flag
+<I>-offload</I> should be added to the LINKFLAGS line and the flag
+-DLMP_INTEL_OFFLOAD should be added to the CCFLAGS line.
+</P>
+<P>Note that the machine makefiles Makefile.intel and
+Makefile.intel_offload are included in the src/MAKE directory with
+options that perform well with the Intel(R) compiler. The latter file
+has support for offload to coprocessors; the former does not.
+</P>
+<P>If using an Intel compiler, it is recommended that Intel(R) Compiler
+2013 SP1 update 1 be used.  Newer versions have some performance
+issues that are being addressed. If using Intel(R) MPI, version 5 or
+higher is recommended.
+</P>
+<P><B>Running with the USER-INTEL package from the command line:</B>
+</P>
+<P>The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+</P>
+<P>If LAMMPS was also built with the USER-OMP package, you need to choose
+how many OpenMP threads per MPI task will be used by the USER-OMP
+package.  Note that the product of MPI tasks * OpenMP threads/task
+should not exceed the physical number of cores (on a node), otherwise
+performance will suffer.
+</P>
+<P>If LAMMPS was built with coprocessor support for the USER-INTEL
+package, you need to specify the number of coprocessor/node and the
+number of threads to use on the coprocessor per MPI task.  Note that
+coprocessor threads (which run on the coprocessor) are totally
+independent from OpenMP threads (which run on the CPU).  The product
+of MPI tasks * coprocessor threads/task should not exceed the maximum
+number of threads the coprocessor is designed to run, otherwise
+performance will suffer.  This value is 240 for current generation
+Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core.  The
+threads/core value can be set to a smaller value if desired by an
+option on the <A HREF = "package.html">package intel</A> command, in which case the
+maximum number of threads is also reduced.
+</P>
+<P>Use the "-sf intel" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "intel" to styles that support it.  If
+a style does not support it, an "omp" suffix is tried next.  Use the
+"-pk omp Nt" <A HREF = "Section_start.html#start_7">command-line switch</A>, to set
+Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with
+the USER-OMP package.  Use the "-pk intel Nphi" <A HREF = "Section_start.html#start_7">command-line
+switch</A> to set Nphi = # of Xeon Phi(TM)
+coprocessors/node, if LAMMPS was built with coprocessor support.
+</P>
+<PRE>CPU-only without USER-OMP (but using Intel vectorization on CPU):
+lmp_machine -sf intel -in in.script                 # 1 MPI task
+mpirun -np 32 lmp_machine -sf intel -in in.script   # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) 
+</PRE>
+<PRE>CPU-only with USER-OMP (and Intel vectorization on CPU):
+lmp_machine -sf intel -pk intel 16 0 -in in.script                # 1 MPI task on a 16-core node
+mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script    # 4 MPI tasks each with 4 threads on a single 16-core node
+mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script   # ditto on 8 16-core nodes 
+</PRE>
+<PRE>CPUs + Xeon Phi(TM) coprocessors with USER-OMP:
+lmp_machine -sf intel -pk intel 16 1 -in in.script                                  # 1 MPI task, 240 threads on 1 coprocessor
+mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script            # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, 
+                                                                                    # each MPI task uses 60 threads on 1 coprocessor
+mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script   # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, 
+                                                                                    # each MPI task uses 120 threads on one of 2 coprocessors 
+</PRE>
+<P>Note that if the "-sf intel" switch is used, it also issues two
+default commands: <A HREF = "package.html">package omp 0</A> and <A HREF = "package.html">package intel
+1</A>.  These set the number of OpenMP threads per
+MPI task via the OMP_NUM_THREADS environment variable, and the number
+of Xeon Phi(TM) coprocessors/node to 1.  The former is ignored if
+LAMMPS was not built with the USER-OMP package.  The latter is ignored
+if LAMMPS was not built with coprocessor support, except for its
+optional precision setting.
+</P>
+<P>Using the "-pk omp" switch explicitly allows for direct setting of the
+number of OpenMP threads per MPI task, and additional options.  Using
+the "-pk intel" switch explicitly allows for direct setting of the
+number of coprocessors/node, and additional options.  The syntax for
+these two switches is the same as the <A HREF = "package.html">package omp</A> and
+<A HREF = "package.html">package intel</A> commands.  See the <A HREF = "package.html">package</A>
+command doc page for details, including the default values used for
+all its options if these switches are not specified, and how to set
+the number of OpenMP threads via the OMP_NUM_THREADS environment
+variable if desired.
+</P>
+<P><B>Or run with the USER-INTEL package by editing an input script:</B>
+</P>
+<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+OpenMP threads per MPI task, and coprocessor threads per MPI task is
+the same.
+</P>
+<P>Use the <A HREF = "suffix.html">suffix intel</A> command, or you can explicitly add an
+"intel" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/intel 2.5 
+</PRE>
+<P>You must also use the <A HREF = "package.html">package omp</A> command to enable the
+USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf
+intel" or "-pk omp" <A HREF = "Section_start.html#start_7">command-line switches</A>
+were used.  It specifies how many OpenMP threads per MPI task to use,
+as well as other options.  Its doc page explains how to set the number
+of threads via an environment variable if desired.
+</P>
+<P>You must also use the <A HREF = "package.html">package intel</A> command to enable
+coprocessor support within the USER-INTEL package (assuming LAMMPS was
+built with coprocessor support) unless the "-sf intel" or "-pk intel"
+<A HREF = "Section_start.html#start_7">command-line switches</A> were used.  It
+specifies how many coprocessors/node to use, as well as other
+coprocessor options.
+</P>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>If LAMMPS was not built with coprocessor support when including the
+USER-INTEL package, then accelerated styles will run on the CPU using
+vectorization optimizations and the specified precision.  This may
+give a substantial speed-up for a pair style, particularly if mixed or
+single precision is used.
+</P>
+<P>If LAMMPS was built with coprocessor support, the pair styles will run
+on one or more Intel(R) Xeon Phi(TM) coprocessors (per node).  The
+performance of a Xeon Phi versus a multi-core CPU is a function of
+your hardware, which pair style is used, the number of
+atoms/coprocessor, and the precision used on the coprocessor (double,
+single, mixed).
+</P>
+<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
+LAMMPS web site for performance of the USER-INTEL package on different
+hardware.
+</P>
+<P><B>Guidelines for best performance on an Intel(R) Xeon Phi(TM)
+coprocessor:</B>
+</P>
+<UL><LI>The default for the <A HREF = "package.html">package intel</A> command is to have
+all the MPI tasks on a given compute node use a single Xeon Phi(TM)
+coprocessor.  In general, running with a large number of MPI tasks on
+each node will perform best with offload.  Each MPI task will
+automatically get affinity to a subset of the hardware threads
+available on the coprocessor.  For example, if your card has 61 cores,
+with 60 cores available for offload and 4 hardware threads per core
+(240 total threads), running with 24 MPI tasks per node will cause
+each MPI task to use a subset of 10 threads on the coprocessor.  Fine
+tuning of the number of threads to use per MPI task or the number of
+threads to use per core can be accomplished with keyword settings of
+the <A HREF = "package.html">package intel</A> command. 
+
+<LI>If desired, only a fraction of the pair style computation can be
+offloaded to the coprocessors.  This is accomplished by using the
+<I>balance</I> keyword in the <A HREF = "package.html">package intel</A> command.  A
+balance of 0 runs all calculations on the CPU.  A balance of 1 runs
+all calculations on the coprocessor.  A balance of 0.5 runs half of
+the calculations on the coprocessor.  Setting the balance to -1 (the
+default) will enable dynamic load balancing that continuously adjusts
+the fraction of offloaded work throughout the simulation.  This option
+typically produces results within 5 to 10 percent of the optimal fixed
+balance. 
+
+<LI>When using offload with CPU hyperthreading disabled, it may help
+performance to use fewer MPI tasks and OpenMP threads than available
+cores.  This is due to the fact that additional threads are generated
+internally to handle the asynchronous offload tasks. 
+
+<LI>If running short benchmark runs with dynamic load balancing, adding a
+short warm-up run (10-20 steps) will allow the load-balancer to find a
+near-optimal setting that will carry over to additional runs. 
+
+<LI>If pair computations are being offloaded to an Intel(R) Xeon Phi(TM)
+coprocessor, a diagnostic line is printed to the screen (not to the
+log file), during the setup phase of a run, indicating that offload
+mode is being used and indicating the number of coprocessor threads
+per MPI task.  Additionally, an offload timing summary is printed at
+the end of each run.  When offloading, the frequency for <A HREF = "atom_modify.html">atom
+sorting</A> is changed to 1 so that the per-atom data is
+effectively sorted at every rebuild of the neighbor lists. 
+
+<LI>For simulations with long-range electrostatics or bond, angle,
+dihedral, improper calculations, computation and data transfer to the
+coprocessor will run concurrently with computations and MPI
+communications for these calculations on the host CPU.  The USER-INTEL
+package has two modes for deciding which atoms will be handled by the
+coprocessor.  This choice is controlled with the <I>ghost</I> keyword of
+the <A HREF = "package.html">package intel</A> command.  When set to 0, ghost atoms
+(atoms at the borders between MPI tasks) are not offloaded to the
+card.  This allows for overlap of MPI communication of forces with
+computation on the coprocessor when the <A HREF = "newton.html">newton</A> setting
+is "on".  The default is dependent on the style being used, however,
+better performance may be achieved by setting this option
+explicitly. 
+</UL>
+<P><B>Restrictions:</B>
+</P>
+<P>When offloading to a coprocessor, <A HREF = "pair_hybrid.html">hybrid</A> styles
+that require skip lists for neighbor builds cannot be offloaded.
+Using <A HREF = "pair_hybrid.html">hybrid/overlay</A> is allowed.  Only one intel
+accelerated style may be used with hybrid styles.
+<A HREF = "special_bonds.html">Special_bonds</A> exclusion lists are not currently
+supported with offload, however, the same effect can often be
+accomplished by setting cutoffs for excluded atom types to 0.  None of
+the pair styles in the USER-INTEL package currently support the
+"inner", "middle", "outer" options for rRESPA integration via the
+<A HREF = "run_style.html">run_style respa</A> command; only the "pair" option is
+supported.
+</P>
+</HTML>
diff --git a/doc/accelerate_intel.txt b/doc/accelerate_intel.txt
new file mode 100644
index 0000000000..64e14d2528
--- /dev/null
+++ b/doc/accelerate_intel.txt
@@ -0,0 +1,299 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.3 USER-INTEL package :h4
+
+The USER-INTEL package was developed by Mike Brown at Intel
+Corporation.  It provides a capability to accelerate simulations by
+offloading neighbor list and non-bonded force calculations to Intel(R)
+Xeon Phi(TM) coprocessors (not native mode like the KOKKOS package).
+Additionally, it supports running simulations in single, mixed, or
+double precision with vectorization, even if a coprocessor is not
+present, i.e. on an Intel(R) CPU.  The same C++ code is used for both
+cases.  When offloading to a coprocessor, the routine is run twice,
+once with an offload flag.
+
+The USER-INTEL package can be used in tandem with the USER-OMP
+package.  This is useful when offloading pair style computations to
+coprocessors, so that other styles not supported by the USER-INTEL
+package, e.g. bond, angle, dihedral, improper, and long-range
+electrostatics, can be run simultaneously in threaded mode on CPU
+cores.  Since fewer MPI tasks than CPU cores will typically be invoked
+when running with coprocessors, this enables the extra cores to be
+utilized for useful computation.
+
+If LAMMPS is built with both the USER-INTEL and USER-OMP packages
+installed, this mode of operation is made easier to use, because the
+"-suffix intel" "command-line switch"_Section_start.html#start_7 or
+the "suffix intel"_suffix.html command will both set a second-choice
+suffix to "omp" so that styles from the USER-OMP package will be used
+if available, after first testing if a style from the USER-INTEL
+package is available.
+
+Here is a quick overview of how to use the USER-INTEL package
+for CPU acceleration:
+
+specify these CCFLAGS in your Makefile.machine: -fopenmp, -DLAMMPS_MEMALIGN=64, -restrict, and -xHost
+specify -fopenmp with LINKFLAGS in your Makefile.machine
+include the USER-INTEL package and (optionally) USER-OMP package and build LAMMPS
+if using the USER-OMP package, specify how many threads per MPI task to use
+use USER-INTEL styles in your input script :ul
+
+Using the USER-INTEL package to offload work to the Intel(R)
+Xeon Phi(TM) coprocessor is the same except for these additional
+steps:
+
+add the flag -DLMP_INTEL_OFFLOAD to CCFLAGS in your Makefile.machine
+add the flag -offload to LINKFLAGS in your Makefile.machine
+specify how many threads per coprocessor to use :ul
+
+The latter two steps in the first case and the last step in the
+coprocessor case can be done using the "-pk omp" and "-sf intel" and
+"-pk intel" "command-line switches"_Section_start.html#start_7
+respectively.  Or the effect of the "-pk" or "-sf" switches can be
+duplicated by adding the "package omp"_package.html or "suffix
+intel"_suffix.html or "package intel"_package.html commands
+respectively to your input script.
+
+[Required hardware/software:]
+
+To use the offload option, you must have one or more Intel(R) Xeon
+Phi(TM) coprocessors.
+
+Optimizations for vectorization have only been tested with the
+Intel(R) compiler.  Use of other compilers may not result in
+vectorization, or may give poor performance.
+
+Use of an Intel C++ compiler is recommended, but not required.  The
+compiler must support the OpenMP interface.
+
+[Building LAMMPS with the USER-INTEL package:]
+
+Include the package(s) and build LAMMPS:  
+
+cd lammps/src
+make yes-user-intel
+make yes-user-omp (if desired)
+make machine :pre
+
+If the USER-OMP package is also installed, you can use styles from
+both packages, as described below.
+
+The low-level src/MAKE/Makefile.machine needs a flag for OpenMP support
+in both the CCFLAGS and LINKFLAGS variables, which is {-openmp} for
+Intel compilers.  You also need to add -DLAMMPS_MEMALIGN=64 and
+-restrict to CCFLAGS.
+
+If you are compiling on the same architecture that will be used for
+the runs, adding the flag {-xHost} to CCFLAGS will enable
+vectorization with the Intel(R) compiler.
+
+In order to build with support for an Intel(R) coprocessor, the flag
+{-offload} should be added to the LINKFLAGS line and the flag
+-DLMP_INTEL_OFFLOAD should be added to the CCFLAGS line.
+
+Note that the machine makefiles Makefile.intel and
+Makefile.intel_offload are included in the src/MAKE directory with
+options that perform well with the Intel(R) compiler. The latter file
+has support for offload to coprocessors; the former does not.
+
+If using an Intel compiler, it is recommended that Intel(R) Compiler
+2013 SP1 update 1 be used.  Newer versions have some performance
+issues that are being addressed. If using Intel(R) MPI, version 5 or
+higher is recommended.
+
+[Running with the USER-INTEL package from the command line:]
+
+The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+
+If LAMMPS was also built with the USER-OMP package, you need to choose
+how many OpenMP threads per MPI task will be used by the USER-OMP
+package.  Note that the product of MPI tasks * OpenMP threads/task
+should not exceed the physical number of cores (on a node), otherwise
+performance will suffer.
+
+If LAMMPS was built with coprocessor support for the USER-INTEL
+package, you need to specify the number of coprocessors/node and the
+number of threads to use on the coprocessor per MPI task.  Note that
+coprocessor threads (which run on the coprocessor) are totally
+independent from OpenMP threads (which run on the CPU).  The product
+of MPI tasks * coprocessor threads/task should not exceed the maximum
+number of threads the coprocessor is designed to run, otherwise
+performance will suffer.  This value is 240 for current generation
+Xeon Phi(TM) chips, which is 60 physical cores * 4 threads/core.  The
+threads/core value can be set to a smaller value if desired by an
+option on the "package intel"_package.html command, in which case the
+maximum number of threads is also reduced.
+
+Use the "-sf intel" "command-line switch"_Section_start.html#start_7,
+which will automatically append "intel" to styles that support it.  If
+a style does not support it, an "omp" suffix is tried next.  Use the
+"-pk omp Nt" "command-line switch"_Section_start.html#start_7, to set
+Nt = # of OpenMP threads per MPI task to use, if LAMMPS was built with
+the USER-OMP package.  Use the "-pk intel Nphi" "command-line
+switch"_Section_start.html#start_7 to set Nphi = # of Xeon Phi(TM)
+coprocessors/node, if LAMMPS was built with coprocessor support.
+
+CPU-only without USER-OMP (but using Intel vectorization on CPU):
+lmp_machine -sf intel -in in.script                 # 1 MPI task
+mpirun -np 32 lmp_machine -sf intel -in in.script   # 32 MPI tasks on as many nodes as needed (e.g. 2 16-core nodes) :pre
+
+CPU-only with USER-OMP (and Intel vectorization on CPU):
+lmp_machine -sf intel -pk intel 16 0 -in in.script                # 1 MPI task on a 16-core node
+mpirun -np 4 lmp_machine -sf intel -pk intel 4 0 -in in.script    # 4 MPI tasks each with 4 threads on a single 16-core node
+mpirun -np 32 lmp_machine -sf intel -pk intel 4 0 -in in.script   # ditto on 8 16-core nodes :pre
+
+CPUs + Xeon Phi(TM) coprocessors with USER-OMP:
+lmp_machine -sf intel -pk intel 16 1 -in in.script                                  # 1 MPI task, 240 threads on 1 coprocessor
+mpirun -np 4 lmp_machine -sf intel -pk intel 4 1 tptask 60 -in in.script            # 4 MPI tasks each with 4 OpenMP threads on a single 16-core node, 
+                                                                                    # each MPI task uses 60 threads on 1 coprocessor
+mpirun -np 32 -ppn 4 lmp_machine -sf intel -pk intel 4 2 tptask 120 -in in.script   # ditto on 8 16-core nodes for MPI tasks and OpenMP threads, 
+                                                                                    # each MPI task uses 120 threads on one of 2 coprocessors :pre
+
+Note that if the "-sf intel" switch is used, it also issues two
+default commands: "package omp 0"_package.html and "package intel
+1"_package.html command.  These set the number of OpenMP threads per
+MPI task via the OMP_NUM_THREADS environment variable, and the number
+of Xeon Phi(TM) coprocessors/node to 1.  The former is ignored if
+LAMMPS was not built with the USER-OMP package.  The latter is ignored
+if LAMMPS was not built with coprocessor support, except for its
+optional precision setting.
+
+Using the "-pk omp" switch explicitly allows for direct setting of the
+number of OpenMP threads per MPI task, and additional options.  Using
+the "-pk intel" switch explicitly allows for direct setting of the
+number of coprocessors/node, and additional options.  The syntax for
+these two switches is the same as the "package omp"_package.html and
+"package intel"_package.html commands.  See the "package"_package.html
+command doc page for details, including the default values used for
+all its options if these switches are not specified, and how to set
+the number of OpenMP threads via the OMP_NUM_THREADS environment
+variable if desired.
+
+[Or run with the USER-INTEL package by editing an input script:]
+
+The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+OpenMP threads per MPI task, and coprocessor threads per MPI task is
+the same.
+
+Use the "suffix intel"_suffix.html command, or you can explicitly add an
+"intel" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/intel 2.5 :pre
+
+You must also use the "package omp"_package.html command to enable the
+USER-OMP package (assuming LAMMPS was built with USER-OMP) unless the "-sf
+intel" or "-pk omp" "command-line switches"_Section_start.html#start_7
+were used.  It specifies how many OpenMP threads per MPI task to use,
+as well as other options.  Its doc page explains how to set the number
+of threads via an environment variable if desired.
+
+You must also use the "package intel"_package.html command to enable
+coprocessor support within the USER-INTEL package (assuming LAMMPS was
+built with coprocessor support) unless the "-sf intel" or "-pk intel"
+"command-line switches"_Section_start.html#start_7 were used.  It
+specifies how many coprocessors/node to use, as well as other
+coprocessor options.
+
+[Speed-ups to expect:]
+
+If LAMMPS was not built with coprocessor support when including the
+USER-INTEL package, then accelerated styles will run on the CPU using
+vectorization optimizations and the specified precision.  This may
+give a substantial speed-up for a pair style, particularly if mixed or
+single precision is used.
+
+If LAMMPS was built with coprocessor support, the pair styles will run
+on one or more Intel(R) Xeon Phi(TM) coprocessors (per node).  The
+performance of a Xeon Phi versus a multi-core CPU is a function of
+your hardware, which pair style is used, the number of
+atoms/coprocessor, and the precision used on the coprocessor (double,
+single, mixed).
+
+See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
+LAMMPS web site for performance of the USER-INTEL package on different
+hardware.
+
+[Guidelines for best performance on an Intel(R) Xeon Phi(TM)
+coprocessor:]
+
+The default for the "package intel"_package.html command is to have
+all the MPI tasks on a given compute node use a single Xeon Phi(TM)
+coprocessor.  In general, running with a large number of MPI tasks on
+each node will perform best with offload.  Each MPI task will
+automatically get affinity to a subset of the hardware threads
+available on the coprocessor.  For example, if your card has 61 cores,
+with 60 cores available for offload and 4 hardware threads per core
+(240 total threads), running with 24 MPI tasks per node will cause
+each MPI task to use a subset of 10 threads on the coprocessor.  Fine
+tuning of the number of threads to use per MPI task or the number of
+threads to use per core can be accomplished with keyword settings of
+the "package intel"_package.html command. :ulb,l
+
+If desired, only a fraction of the pair style computation can be
+offloaded to the coprocessors.  This is accomplished by using the
+{balance} keyword in the "package intel"_package.html command.  A
+balance of 0 runs all calculations on the CPU.  A balance of 1 runs
+all calculations on the coprocessor.  A balance of 0.5 runs half of
+the calculations on the coprocessor.  Setting the balance to -1 (the
+default) will enable dynamic load balancing that continuously adjusts
+the fraction of offloaded work throughout the simulation.  This option
+typically produces results within 5 to 10 percent of the optimal fixed
+balance. :l
+
+When using offload with CPU hyperthreading disabled, it may help
+performance to use fewer MPI tasks and OpenMP threads than available
+cores.  This is due to the fact that additional threads are generated
+internally to handle the asynchronous offload tasks. :l
+
+If running short benchmark runs with dynamic load balancing, adding a
+short warm-up run (10-20 steps) will allow the load-balancer to find a
+near-optimal setting that will carry over to additional runs. :l
+
+If pair computations are being offloaded to an Intel(R) Xeon Phi(TM)
+coprocessor, a diagnostic line is printed to the screen (not to the
+log file), during the setup phase of a run, indicating that offload
+mode is being used and indicating the number of coprocessor threads
+per MPI task.  Additionally, an offload timing summary is printed at
+the end of each run.  When offloading, the frequency for "atom
+sorting"_atom_modify.html is changed to 1 so that the per-atom data is
+effectively sorted at every rebuild of the neighbor lists. :l
+
+For simulations with long-range electrostatics or bond, angle,
+dihedral, improper calculations, computation and data transfer to the
+coprocessor will run concurrently with computations and MPI
+communications for these calculations on the host CPU.  The USER-INTEL
+package has two modes for deciding which atoms will be handled by the
+coprocessor.  This choice is controlled with the {ghost} keyword of
+the "package intel"_package.html command.  When set to 0, ghost atoms
+(atoms at the borders between MPI tasks) are not offloaded to the
+card.  This allows for overlap of MPI communication of forces with
+computation on the coprocessor when the "newton"_newton.html setting
+is "on".  The default is dependent on the style being used, however,
+better performance may be achieved by setting this option
+explicitly. :l,ule
+
+[Restrictions:]
+
+When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
+that require skip lists for neighbor builds cannot be offloaded.
+Using "hybrid/overlay"_pair_hybrid.html is allowed.  Only one intel
+accelerated style may be used with hybrid styles.
+"Special_bonds"_special_bonds.html exclusion lists are not currently
+supported with offload, however, the same effect can often be
+accomplished by setting cutoffs for excluded atom types to 0.  None of
+the pair styles in the USER-INTEL package currently support the
+"inner", "middle", "outer" options for rRESPA integration via the
+"run_style respa"_run_style.html command; only the "pair" option is
+supported.
diff --git a/doc/accelerate_kokkos.html b/doc/accelerate_kokkos.html
new file mode 100644
index 0000000000..4192df77c6
--- /dev/null
+++ b/doc/accelerate_kokkos.html
@@ -0,0 +1,438 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.4 KOKKOS package 
+</H4>
+<P>The KOKKOS package was developed primarily by Christian Trott
+(Sandia) with contributions of various styles by others, including
+Sikandar Mashayak (UIUC).  The underlying Kokkos library was written
+primarily by Carter Edwards, Christian Trott, and Dan Sunderland (all
+Sandia).
+</P>
+<P>The KOKKOS package contains versions of pair, fix, and atom styles
+that use data structures and macros provided by the Kokkos library,
+which is included with LAMMPS in lib/kokkos.
+</P>
+<P>The Kokkos library is part of
+<A HREF = "http://trilinos.sandia.gov/packages/kokkos">Trilinos</A> and is a
+templated C++ library that provides two key abstractions for an
+application like LAMMPS.  First, it allows a single implementation of
+an application kernel (e.g. a pair style) to run efficiently on
+different kinds of hardware, such as a GPU, Intel Phi, or many-core
+chip.
+</P>
+<P>The Kokkos library also provides data abstractions to adjust (at
+compile time) the memory layout of basic data structures like 2d and
+3d arrays and allow the transparent utilization of special hardware
+load and store operations.  Such data structures are used in LAMMPS to
+store atom coordinates or forces or neighbor lists.  The layout is
+chosen to optimize performance on different platforms.  Again this
+functionality is hidden from the developer, and does not affect how
+the kernel is coded.
+</P>
+<P>These abstractions are set at build time, when LAMMPS is compiled with
+the KOKKOS package installed.  This is done by selecting a "host" and
+"device" to build for, compatible with the compute nodes in your
+machine (one on a desktop machine or 1000s on a supercomputer).
+</P>
+<P>All Kokkos operations occur within the context of an individual MPI
+task running on a single node of the machine.  The total number of MPI
+tasks used by LAMMPS (one or multiple per compute node) is set in the
+usual manner via the mpirun or mpiexec commands, and is independent of
+Kokkos.
+</P>
+<P>Kokkos provides support for two different modes of execution per MPI
+task.  This means that computational tasks (pairwise interactions,
+neighbor list builds, time integration, etc) can be parallelized for
+one or the other of the two modes.  The first mode is called the
+"host" and is one or more threads running on one or more physical CPUs
+(within the node).  Currently, both multi-core CPUs and an Intel Phi
+processor (running in native mode, not offload mode like the
+USER-INTEL package) are supported.  The second mode is called the
+"device" and is an accelerator chip of some kind.  Currently only an
+NVIDIA GPU is supported.  If your compute node does not have a GPU,
+then there is only one mode of execution, i.e. the host and device are
+the same.
+</P>
+<P>Here is a quick overview of how to use the KOKKOS package
+for GPU acceleration:
+</P>
+<UL><LI>specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support
+<LI>include the KOKKOS package and build LAMMPS
+<LI>enable the KOKKOS package and its hardware options via the "-k on" command-line switch
+<LI>use KOKKOS styles in your input script 
+</UL>
+<P>The latter two steps can be done using the "-k on", "-pk kokkos" and
+"-sf kk" <A HREF = "Section_start.html#start_7">command-line switches</A>
+respectively.  Or the effect of the "-pk" or "-sf" switches can be
+duplicated by adding the <A HREF = "package.html">package kokkos</A> or <A HREF = "suffix.html">suffix
+kk</A> commands respectively to your input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>The KOKKOS package can be used to build and run LAMMPS on the
+following kinds of hardware:
+</P>
+<UL><LI>CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles)
+<LI>CPU-only: one or a few MPI tasks per node with additional threading via OpenMP
+<LI>Phi: on one or more Intel Phi coprocessors (per node)
+<LI>GPU: on the GPUs of a node with additional OpenMP threading on the CPUs 
+</UL>
+<P>Note that Intel Xeon Phi coprocessors are supported in "native" mode,
+not "offload" mode like the USER-INTEL package supports.
+</P>
+<P>Only NVIDIA GPUs are currently supported.
+</P>
+<P>IMPORTANT NOTE: For good performance of the KOKKOS package on GPUs,
+you must have Kepler generation GPUs (or later).  The Kokkos library
+exploits texture cache options not supported by Tesla generation GPUs
+(or older).
+</P>
+<P>To build the KOKKOS package for GPUs, NVIDIA Cuda software must be
+installed on your system.  See the discussion above for the USER-CUDA
+and GPU packages for details of how to check and do this.
+</P>
+<P><B>Building LAMMPS with the KOKKOS package:</B>
+</P>
+<P>Unlike other acceleration packages discussed in this section, the
+Kokkos library in lib/kokkos does not have to be pre-built before
+building LAMMPS itself.  Instead, options for the Kokkos library are
+specified at compile time, when LAMMPS itself is built.  This can be
+done in one of two ways, as discussed below.
+</P>
+<P>Here are examples of how to build LAMMPS for the different compute-node
+configurations listed above.
+</P>
+<P>CPU-only (run all-MPI or with OpenMP threading):
+</P>
+<PRE>cd lammps/src
+make yes-kokkos
+make g++ OMP=yes 
+</PRE>
+<P>Intel Xeon Phi:
+</P>
+<PRE>cd lammps/src
+make yes-kokkos
+make g++ OMP=yes MIC=yes 
+</PRE>
+<P>CPUs and GPUs:
+</P>
+<PRE>cd lammps/src
+make yes-kokkos
+make cuda CUDA=yes 
+</PRE>
+<P>These examples set the KOKKOS-specific OMP, MIC, CUDA variables on the
+make command line which requires a GNU-compatible make command.  Try
+"gmake" if your system's standard make complains.  
+</P>
+<P>IMPORTANT NOTE: If you build using make line variables and re-build
+LAMMPS twice with different KOKKOS options and the *same* target,
+e.g. g++ in the first two examples above, then you *must* perform a
+"make clean-all" or "make clean-machine" before each build.  This is
+to force all the KOKKOS-dependent files to be re-compiled with the new
+options.
+</P>
+<P>You can also hardwire these make variables in the specified machine
+makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above,
+with a line like:
+</P>
+<PRE>MIC = yes 
+</PRE>
+<P>Note that if you build LAMMPS multiple times in this manner, using
+different KOKKOS options (defined in different machine makefiles), you
+do not have to worry about doing a "clean" in between.  This is
+because the targets will be different.
+</P>
+<P>IMPORTANT NOTE: The 3rd example above for a GPU, uses a different
+machine makefile, in this case src/MAKE/Makefile.cuda, which is
+included in the LAMMPS distribution.  To build the KOKKOS package for
+a GPU, this makefile must use the NVIDIA "nvcc" compiler.  And it must
+have a CCFLAGS -arch setting that is appropriate for your NVIDIA
+hardware and installed software.  Typical values for -arch are given
+in <A HREF = "Section_start.html#start_3_4">Section 2.3.4</A> of the manual, as well
+as other settings that must be included in the machine makefile, if
+you create your own.
+</P>
+<P>There are other allowed options when building with the KOKKOS package.
+As above, they can be set either as variables on the make command line
+or in the machine makefile in the src/MAKE directory.  See <A HREF = "Section_start.html#start_3_4">Section
+2.3.4</A> of the manual for details.
+</P>
+<P>IMPORTANT NOTE: Currently, there are no precision options with the
+KOKKOS package.  All compilation and computation is performed in
+double precision.
+</P>
+<P><B>Run with the KOKKOS package from the command line:</B>
+</P>
+<P>The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+</P>
+<P>When using KOKKOS built with host=OMP, you need to choose how many
+OpenMP threads per MPI task will be used (via the "-k" command-line
+switch discussed below).  Note that the product of MPI tasks * OpenMP
+threads/task should not exceed the physical number of cores (on a
+node), otherwise performance will suffer.
+</P>
+<P>When using the KOKKOS package built with device=CUDA, you must use
+exactly one MPI task per physical GPU.
+</P>
+<P>When using the KOKKOS package built with host=MIC for Intel Xeon Phi
+coprocessor support you need to ensure there are one or more MPI tasks
+per coprocessor, and choose the number of coprocessor threads to use
+per MPI task (via the "-k" command-line switch discussed below).  The
+product of MPI tasks * coprocessor threads/task should not exceed the
+maximum number of threads the coprocessor is designed to run,
+otherwise performance will suffer.  This value is 240 for current
+generation Xeon Phi(TM) chips, which is 60 physical cores * 4
+threads/core.  Note that with the KOKKOS package you do not need to
+specify how many Phi coprocessors there are per node; each
+coprocessor is simply treated as running some number of MPI tasks.
+</P>
+<P>You must use the "-k on" <A HREF = "Section_start.html#start_7">command-line
+switch</A> to enable the KOKKOS package.  It
+takes additional arguments for hardware settings appropriate to your
+system.  Those arguments are <A HREF = "Section_start.html#start_7">documented
+here</A>.  The two most commonly used
+options are:
+</P>
+<PRE>-k on t Nt g Ng 
+</PRE>
+<P>The "t Nt" option applies to host=OMP (even if device=CUDA) and
+host=MIC.  For host=OMP, it specifies how many OpenMP threads per MPI
+task to use within a node.  For host=MIC, it specifies how many Xeon Phi
+threads per MPI task to use within a node.  The default is Nt = 1.
+Note that for host=OMP this is effectively MPI-only mode which may be
+fine.  But for host=MIC you will typically end up using far less than
+all the 240 available threads, which could give very poor performance.
+</P>
+<P>The "g Ng" option applies to device=CUDA.  It specifies how many GPUs
+per compute node to use.  The default is 1, so this only needs to be
+specified if you have 2 or more GPUs per compute node.
+</P>
+<P>The "-k on" switch also issues a "package kokkos" command (with no
+additional arguments) which sets various KOKKOS options to default
+values, as discussed on the <A HREF = "package.html">package</A> command doc page.
+</P>
+<P>Use the "-sf kk" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "kk" to styles that support it.  Use
+the "-pk kokkos" <A HREF = "Section_start.html#start_7">command-line switch</A> if
+you wish to change any of the default <A HREF = "package.html">package kokkos</A>
+options set by the "-k on" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.
+</P>
+<PRE>host=OMP, dual hex-core nodes (12 threads/node):
+mpirun -np 12 lmp_g++ -in in.lj                           # MPI-only mode with no Kokkos
+mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj              # MPI-only mode with Kokkos
+mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj          # one MPI task, 12 threads
+mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj           # two MPI tasks, 6 threads/task 
+mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj   # ditto on 16 nodes 
+</PRE>
+<PRE>host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):
+mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj           # 1 MPI task on 1 Phi, 1*240 = 240
+mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj            # 30 MPI tasks on 1 Phi, 30*8 = 240
+mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj           # 12 MPI tasks on 1 Phi, 12*20 = 240
+mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj   # ditto on 8 Phis
+</PRE>
+<PRE>host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:
+mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj          # one MPI task, 6 threads on CPU
+mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj   # ditto on 4 nodes 
+</PRE>
+<PRE>host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs:
+mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj           # two MPI tasks, 8 threads per CPU
+mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # ditto on 16 nodes 
+</PRE>
+<P>Note that the default for the <A HREF = "package.html">package kokkos</A> command is
+to use "full" neighbor lists and set the Newton flag to "off" for both
+pairwise and bonded interactions.  This typically gives fastest
+performance.  If the <A HREF = "newton.html">newton</A> command is used in the input
+script, it can override the Newton flag defaults.
+</P>
+<P>However, when running in MPI-only mode with 1 thread per MPI task, it
+will typically be faster to use "half" neighbor lists and set the
+Newton flag to "on", just as is the case for non-accelerated pair
+styles.  You can do this with the "-pk" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.
+</P>
+<P><B>Or run with the KOKKOS package by editing an input script:</B>
+</P>
+<P>The discussion above for the mpirun/mpiexec command and setting
+appropriate thread and GPU values for host=OMP or host=MIC or
+device=CUDA are the same.
+</P>
+<P>You must still use the "-k on" <A HREF = "Section_start.html#start_7">command-line
+switch</A> to enable the KOKKOS package, and
+specify its additional arguments for hardware options appropriate to
+your system, as documented above.
+</P>
+<P>Use the <A HREF = "suffix.html">suffix kk</A> command, or you can explicitly add a
+"kk" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/kk 2.5 
+</PRE>
+<P>You only need to use the <A HREF = "package.html">package kokkos</A> command if you
+wish to change any of its option defaults, as set by the "-k on"
+<A HREF = "Section_start.html#start_7">command-line switch</A>.
+</P>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>The performance of KOKKOS running in different modes is a function of
+your hardware, which KOKKOS-enabled styles are used, and the problem
+size.
+</P>
+<P>Generally speaking, the following rules of thumb apply:
+</P>
+<UL><LI>When running on CPUs only, with a single thread per MPI task,
+performance of a KOKKOS style is somewhere between the standard
+(un-accelerated) styles (MPI-only mode), and those provided by the
+USER-OMP package.  However the difference between all 3 is small (less
+than 20%). 
+
+<LI>When running on CPUs only, with multiple threads per MPI task,
+performance of a KOKKOS style is a bit slower than the USER-OMP
+package. 
+
+<LI>When running on GPUs, KOKKOS is typically faster than the USER-CUDA
+and GPU packages. 
+
+<LI>When running on Intel Xeon Phi, KOKKOS is not as fast as
+the USER-INTEL package, which is optimized for that hardware. 
+</UL>
+<P>See the <A HREF = "http://lammps.sandia.gov/bench.html">Benchmark page</A> of the
+LAMMPS web site for performance of the KOKKOS package on different
+hardware.
+</P>
+<P><B>Guidelines for best performance:</B>
+</P>
+<P>Here are guidelines for using the KOKKOS package on the different
+hardware configurations listed above.
+</P>
+<P>Many of the guidelines use the <A HREF = "package.html">package kokkos</A> command.
+See its doc page for details and default settings.  Experimenting with
+its options can provide a speed-up for specific calculations.
+</P>
+<P><B>Running on a multi-core CPU:</B>
+</P>
+<P>If N is the number of physical cores/node, then the number of MPI
+tasks/node * number of threads/task should not exceed N, and should
+typically equal N.  Note that the default threads/task is 1, as set by
+the "t" keyword of the "-k" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.  If you do not change this, no
+additional parallelism (beyond MPI) will be invoked on the host
+CPU(s).
+</P>
+<P>You can compare the performance running in different modes:
+</P>
+<UL><LI>run with 1 MPI task/node and N threads/task
+<LI>run with N MPI tasks/node and 1 thread/task
+<LI>run with settings in between these extremes 
+</UL>
+<P>Examples of mpirun commands in these modes are shown above.
+</P>
+<P>When using KOKKOS to perform multi-threading, it is important for
+performance to bind both MPI tasks to physical cores, and threads to
+physical cores, so they do not migrate during a simulation.
+</P>
+<P>If you are not certain MPI tasks are being bound (check the defaults
+for your MPI installation), binding can be forced with these flags:
+</P>
+<PRE>OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
+Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... 
+</PRE>
+<P>For binding threads with the KOKKOS OMP option, use thread affinity
+environment variables to force binding.  With OpenMP 3.1 (gcc 4.7 or
+later, intel 12 or later) setting the environment variable
+OMP_PROC_BIND=true should be sufficient.  For binding threads with the
+KOKKOS pthreads option, compile LAMMPS with the KOKKOS HWLOC=yes option, as
+discussed in <A HREF = "Section_start.html#start_3_4">Section 2.3.4</A> of the
+manual.
+</P>
+<P><B>Running on GPUs:</B>
+</P>
+<P>Insure the -arch setting in the machine makefile you are using,
+e.g. src/MAKE/Makefile.cuda, is correct for your GPU hardware/software
+(see <A HREF = "Section_start.html#start_3_4">this section</A> of the manual for
+details).
+</P>
+<P>The -np setting of the mpirun command should set the number of MPI
+tasks/node to be equal to the # of physical GPUs on the node. 
+</P>
+<P>Use the "-k" <A HREF = "Section_start.html#start_7">command-line switch</A> to
+specify the number of GPUs per node, and the number of threads per MPI
+task.  As above for multi-core CPUs (and no GPU), if N is the number
+of physical cores/node, then the number of MPI tasks/node * number of
+threads/task should not exceed N.  With one GPU (and one MPI task) it
+may be faster to use less than all the available cores, by setting
+threads/task to a smaller value.  This is because using all the cores
+on a dual-socket node will incur extra cost to copy memory from the
+2nd socket to the GPU.
+</P>
+<P>Examples of mpirun commands that follow these rules are shown above.
+</P>
+<P>IMPORTANT NOTE: When using a GPU, you will achieve the best
+performance if your input script does not use any fix or compute
+styles which are not yet Kokkos-enabled.  This allows data to stay on
+the GPU for multiple timesteps, without being copied back to the host
+CPU.  Invoking a non-Kokkos fix or compute, or performing I/O for
+<A HREF = "thermo_style.html">thermo</A> or <A HREF = "dump.html">dump</A> output will cause data
+to be copied back to the CPU.
+</P>
+<P>You cannot yet assign multiple MPI tasks to the same GPU with the
+KOKKOS package.  We plan to support this in the future, similar to the
+GPU package in LAMMPS.
+</P>
+<P>You cannot yet use both the host (multi-threaded) and device (GPU)
+together to compute pairwise interactions with the KOKKOS package.  We
+hope to support this in the future, similar to the GPU package in
+LAMMPS.
+</P>
+<P><B>Running on an Intel Phi:</B>
+</P>
+<P>Kokkos only uses Intel Phi processors in their "native" mode, i.e.
+not hosted by a CPU.
+</P>
+<P>As illustrated above, build LAMMPS with OMP=yes (the default) and
+MIC=yes.  The latter insures code is correctly compiled for the Intel
+Phi.  The OMP setting means OpenMP will be used for parallelization on
+the Phi, which is currently the best option within Kokkos.  In the
+future, other options may be added.
+</P>
+<P>Current-generation Intel Phi chips have either 61 or 57 cores.  One
+core should be excluded for running the OS, leaving 60 or 56 cores.
+Each core is hyperthreaded, so there are effectively N = 240 (4*60) or
+N = 224 (4*56) cores to run on.
+</P>
+<P>The -np setting of the mpirun command sets the number of MPI
+tasks/node.  The "-k on t Nt" command-line switch sets the number of
+threads/task as Nt.  The product of these 2 values should be N, i.e.
+240 or 224.  Also, the number of threads/task should be a multiple of
+4 so that logical threads from more than one MPI task do not run on
+the same physical core.
+</P>
+<P>Examples of mpirun commands that follow these rules are shown above.
+</P>
+<P><B>Restrictions:</B>
+</P>
+<P>As noted above, if using GPUs, the number of MPI tasks per compute
+node should equal the number of GPUs per compute node.  In the
+future Kokkos will support assigning multiple MPI tasks to a single
+GPU.
+</P>
+<P>Currently Kokkos does not support AMD GPUs due to limits in the
+available backend programming models.  Specifically, Kokkos requires
+extensive C++ support from the Kernel language.  This is expected to
+change in the future.
+</P>
+</HTML>
diff --git a/doc/accelerate_kokkos.txt b/doc/accelerate_kokkos.txt
new file mode 100644
index 0000000000..b8dbcd0e0d
--- /dev/null
+++ b/doc/accelerate_kokkos.txt
@@ -0,0 +1,433 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.4 KOKKOS package :h4
+
+The KOKKOS package was developed primarily by Christian Trott
+(Sandia) with contributions of various styles by others, including
+Sikandar Mashayak (UIUC).  The underlying Kokkos library was written
+primarily by Carter Edwards, Christian Trott, and Dan Sunderland (all
+Sandia).
+
+The KOKKOS package contains versions of pair, fix, and atom styles
+that use data structures and macros provided by the Kokkos library,
+which is included with LAMMPS in lib/kokkos.
+
+The Kokkos library is part of
+"Trilinos"_http://trilinos.sandia.gov/packages/kokkos and is a
+templated C++ library that provides two key abstractions for an
+application like LAMMPS.  First, it allows a single implementation of
+an application kernel (e.g. a pair style) to run efficiently on
+different kinds of hardware, such as a GPU, Intel Phi, or many-core
+chip.
+
+The Kokkos library also provides data abstractions to adjust (at
+compile time) the memory layout of basic data structures like 2d and
+3d arrays and allow the transparent utilization of special hardware
+load and store operations.  Such data structures are used in LAMMPS to
+store atom coordinates or forces or neighbor lists.  The layout is
+chosen to optimize performance on different platforms.  Again this
+functionality is hidden from the developer, and does not affect how
+the kernel is coded.
+
+These abstractions are set at build time, when LAMMPS is compiled with
+the KOKKOS package installed.  This is done by selecting a "host" and
+"device" to build for, compatible with the compute nodes in your
+machine (one on a desktop machine or 1000s on a supercomputer).
+
+All Kokkos operations occur within the context of an individual MPI
+task running on a single node of the machine.  The total number of MPI
+tasks used by LAMMPS (one or multiple per compute node) is set in the
+usual manner via the mpirun or mpiexec commands, and is independent of
+Kokkos.
+
+Kokkos provides support for two different modes of execution per MPI
+task.  This means that computational tasks (pairwise interactions,
+neighbor list builds, time integration, etc) can be parallelized for
+one or the other of the two modes.  The first mode is called the
+"host" and is one or more threads running on one or more physical CPUs
+(within the node).  Currently, both multi-core CPUs and an Intel Phi
+processor (running in native mode, not offload mode like the
+USER-INTEL package) are supported.  The second mode is called the
+"device" and is an accelerator chip of some kind.  Currently only an
+NVIDIA GPU is supported.  If your compute node does not have a GPU,
+then there is only one mode of execution, i.e. the host and device are
+the same.
+
+Here is a quick overview of how to use the KOKKOS package
+for GPU acceleration:
+
+specify variables and settings in your Makefile.machine that enable GPU, Phi, or OpenMP support
+include the KOKKOS package and build LAMMPS
+enable the KOKKOS package and its hardware options via the "-k on" command-line switch
+use KOKKOS styles in your input script :ul
+
+The latter two steps can be done using the "-k on", "-pk kokkos" and
+"-sf kk" "command-line switches"_Section_start.html#start_7
+respectively.  Or the effect of the "-pk" or "-sf" switches can be
+duplicated by adding the "package kokkos"_package.html or "suffix
+kk"_suffix.html commands respectively to your input script.
+
+[Required hardware/software:]
+
+The KOKKOS package can be used to build and run LAMMPS on the
+following kinds of hardware:
+
+CPU-only: one MPI task per CPU core (MPI-only, but using KOKKOS styles)
+CPU-only: one or a few MPI tasks per node with additional threading via OpenMP
+Phi: on one or more Intel Phi coprocessors (per node)
+GPU: on the GPUs of a node with additional OpenMP threading on the CPUs :ul
+
+Note that Intel Xeon Phi coprocessors are supported in "native" mode,
+not "offload" mode like the USER-INTEL package supports.
+
+Only NVIDIA GPUs are currently supported.
+
+IMPORTANT NOTE: For good performance of the KOKKOS package on GPUs,
+you must have Kepler generation GPUs (or later).  The Kokkos library
+exploits texture cache options not supported by Tesla generation GPUs
+(or older).
+
+To build the KOKKOS package for GPUs, NVIDIA Cuda software must be
+installed on your system.  See the discussion above for the USER-CUDA
+and GPU packages for details of how to check and do this.
+
+[Building LAMMPS with the KOKKOS package:]
+
+Unlike other acceleration packages discussed in this section, the
+Kokkos library in lib/kokkos does not have to be pre-built before
+building LAMMPS itself.  Instead, options for the Kokkos library are
+specified at compile time, when LAMMPS itself is built.  This can be
+done in one of two ways, as discussed below.
+
+Here are examples of how to build LAMMPS for the different compute-node
+configurations listed above.
+
+CPU-only (run all-MPI or with OpenMP threading):
+
+cd lammps/src
+make yes-kokkos
+make g++ OMP=yes :pre
+
+Intel Xeon Phi:
+
+cd lammps/src
+make yes-kokkos
+make g++ OMP=yes MIC=yes :pre
+
+CPUs and GPUs:
+
+cd lammps/src
+make yes-kokkos
+make cuda CUDA=yes :pre
+
+These examples set the KOKKOS-specific OMP, MIC, CUDA variables on the
+make command line which requires a GNU-compatible make command.  Try
+"gmake" if your system's standard make complains.  
+
+IMPORTANT NOTE: If you build using make line variables and re-build
+LAMMPS twice with different KOKKOS options and the *same* target,
+e.g. g++ in the first two examples above, then you *must* perform a
+"make clean-all" or "make clean-machine" before each build.  This is
+to force all the KOKKOS-dependent files to be re-compiled with the new
+options.
+
+You can also hardwire these make variables in the specified machine
+makefile, e.g. src/MAKE/Makefile.g++ in the first two examples above,
+with a line like:
+
+MIC = yes :pre
+
+Note that if you build LAMMPS multiple times in this manner, using
+different KOKKOS options (defined in different machine makefiles), you
+do not have to worry about doing a "clean" in between.  This is
+because the targets will be different.
+
+IMPORTANT NOTE: The 3rd example above for a GPU, uses a different
+machine makefile, in this case src/MAKE/Makefile.cuda, which is
+included in the LAMMPS distribution.  To build the KOKKOS package for
+a GPU, this makefile must use the NVIDIA "nvcc" compiler.  And it must
+have a CCFLAGS -arch setting that is appropriate for your NVIDIA
+hardware and installed software.  Typical values for -arch are given
+in "Section 2.3.4"_Section_start.html#start_3_4 of the manual, as well
+as other settings that must be included in the machine makefile, if
+you create your own.
+
+There are other allowed options when building with the KOKKOS package.
+As above, they can be set either as variables on the make command line
+or in the machine makefile in the src/MAKE directory.  See "Section
+2.3.4"_Section_start.html#start_3_4 of the manual for details.
+
+IMPORTANT NOTE: Currently, there are no precision options with the
+KOKKOS package.  All compilation and computation is performed in
+double precision.
+
+[Run with the KOKKOS package from the command line:]
+
+The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+
+When using KOKKOS built with host=OMP, you need to choose how many
+OpenMP threads per MPI task will be used (via the "-k" command-line
+switch discussed below).  Note that the product of MPI tasks * OpenMP
+threads/task should not exceed the physical number of cores (on a
+node), otherwise performance will suffer.
+
+When using the KOKKOS package built with device=CUDA, you must use
+exactly one MPI task per physical GPU.
+
+When using the KOKKOS package built with host=MIC for Intel Xeon Phi
+coprocessor support you need to insure there are one or more MPI tasks
+per coprocessor, and choose the number of coprocessor threads to use
+per MPI task (via the "-k" command-line switch discussed below).  The
+product of MPI tasks * coprocessor threads/task should not exceed the
+maximum number of threads the coprocessor is designed to run,
+otherwise performance will suffer.  This value is 240 for current
+generation Xeon Phi(TM) chips, which is 60 physical cores * 4
+threads/core.  Note that with the KOKKOS package you do not need to
+specify how many Phi coprocessors there are per node; each
+coprocessor is simply treated as running some number of MPI tasks.
+
+You must use the "-k on" "command-line
+switch"_Section_start.html#start_7 to enable the KOKKOS package.  It
+takes additional arguments for hardware settings appropriate to your
+system.  Those arguments are "documented
+here"_Section_start.html#start_7.  The two most commonly used
+options are:
+
+-k on t Nt g Ng :pre
+
+The "t Nt" option applies to host=OMP (even if device=CUDA) and
+host=MIC.  For host=OMP, it specifies how many OpenMP threads per MPI
+task to use within a node.  For host=MIC, it specifies how many Xeon Phi
+threads per MPI task to use within a node.  The default is Nt = 1.
+Note that for host=OMP this is effectively MPI-only mode which may be
+fine.  But for host=MIC you will typically end up using far less than
+all the 240 available threads, which could give very poor performance.
+
+The "g Ng" option applies to device=CUDA.  It specifies how many GPUs
+per compute node to use.  The default is 1, so this only needs to be
+specified if you have 2 or more GPUs per compute node.
+
+The "-k on" switch also issues a "package kokkos" command (with no
+additional arguments) which sets various KOKKOS options to default
+values, as discussed on the "package"_package.html command doc page.
+
+Use the "-sf kk" "command-line switch"_Section_start.html#start_7,
+which will automatically append "kk" to styles that support it.  Use
+the "-pk kokkos" "command-line switch"_Section_start.html#start_7 if
+you wish to change any of the default "package kokkos"_package.html
+options set by the "-k on" "command-line
+switch"_Section_start.html#start_7.
+
+host=OMP, dual hex-core nodes (12 threads/node):
+mpirun -np 12 lmp_g++ -in in.lj                           # MPI-only mode with no Kokkos
+mpirun -np 12 lmp_g++ -k on -sf kk -in in.lj              # MPI-only mode with Kokkos
+mpirun -np 1 lmp_g++ -k on t 12 -sf kk -in in.lj          # one MPI task, 12 threads
+mpirun -np 2 lmp_g++ -k on t 6 -sf kk -in in.lj           # two MPI tasks, 6 threads/task 
+mpirun -np 32 -ppn 2 lmp_g++ -k on t 6 -sf kk -in in.lj   # ditto on 16 nodes :pre
+
+host=MIC, Intel Phi with 61 cores (240 threads/phi via 4x hardware threading):
+mpirun -np 1 lmp_g++ -k on t 240 -sf kk -in in.lj           # 1 MPI task on 1 Phi, 1*240 = 240
+mpirun -np 30 lmp_g++ -k on t 8 -sf kk -in in.lj            # 30 MPI tasks on 1 Phi, 30*8 = 240
+mpirun -np 12 lmp_g++ -k on t 20 -sf kk -in in.lj           # 12 MPI tasks on 1 Phi, 12*20 = 240
+mpirun -np 96 -ppn 12 lmp_g++ -k on t 20 -sf kk -in in.lj   # ditto on 8 Phis :pre
+
+host=OMP, device=CUDA, node = dual hex-core CPUs and a single GPU:
+mpirun -np 1 lmp_cuda -k on t 6 -sf kk -in in.lj          # one MPI task, 6 threads on CPU
+mpirun -np 4 -ppn 1 lmp_cuda -k on t 6 -sf kk -in in.lj   # ditto on 4 nodes :pre
+
+host=OMP, device=CUDA, node = dual 8-core CPUs and 2 GPUs:
+mpirun -np 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj           # two MPI tasks, 8 threads per CPU
+mpirun -np 32 -ppn 2 lmp_cuda -k on t 8 g 2 -sf kk -in in.lj   # ditto on 16 nodes :pre
+
+Note that the default for the "package kokkos"_package.html command is
+to use "full" neighbor lists and set the Newton flag to "off" for both
+pairwise and bonded interactions.  This typically gives fastest
+performance.  If the "newton"_newton.html command is used in the input
+script, it can override the Newton flag defaults.
+
+However, when running in MPI-only mode with 1 thread per MPI task, it
+will typically be faster to use "half" neighbor lists and set the
+Newton flag to "on", just as is the case for non-accelerated pair
+styles.  You can do this with the "-pk" "command-line
+switch"_Section_start.html#start_7.
+
+[Or run with the KOKKOS package by editing an input script:]
+
+The discussion above for the mpirun/mpiexec command and setting
+appropriate thread and GPU values for host=OMP or host=MIC or
+device=CUDA are the same.
+
+You must still use the "-k on" "command-line
+switch"_Section_start.html#start_7 to enable the KOKKOS package, and
+specify its additional arguments for hardware options appropriate to
+your system, as documented above.
+
+Use the "suffix kk"_suffix.html command, or you can explicitly add a
+"kk" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/kk 2.5 :pre
+
+You only need to use the "package kokkos"_package.html command if you
+wish to change any of its option defaults, as set by the "-k on"
+"command-line switch"_Section_start.html#start_7.
+
+[Speed-ups to expect:]
+
+The performance of KOKKOS running in different modes is a function of
+your hardware, which KOKKOS-enabled styles are used, and the problem
+size.
+
+Generally speaking, the following rules of thumb apply:
+
+When running on CPUs only, with a single thread per MPI task,
+performance of a KOKKOS style is somewhere between the standard
+(un-accelerated) styles (MPI-only mode), and those provided by the
+USER-OMP package.  However the difference between all 3 is small (less
+than 20%). :ulb,l
+
+When running on CPUs only, with multiple threads per MPI task,
+performance of a KOKKOS style is a bit slower than the USER-OMP
+package. :l
+
+When running on GPUs, KOKKOS is typically faster than the USER-CUDA
+and GPU packages. :l
+
+When running on Intel Xeon Phi, KOKKOS is not as fast as
+the USER-INTEL package, which is optimized for that hardware. :l,ule
+
+See the "Benchmark page"_http://lammps.sandia.gov/bench.html of the
+LAMMPS web site for performance of the KOKKOS package on different
+hardware.
+
+[Guidelines for best performance:]
+
+Here are guidelines for using the KOKKOS package on the different
+hardware configurations listed above.
+
+Many of the guidelines use the "package kokkos"_package.html command.
+See its doc page for details and default settings.  Experimenting with
+its options can provide a speed-up for specific calculations.
+
+[Running on a multi-core CPU:]
+
+If N is the number of physical cores/node, then the number of MPI
+tasks/node * number of threads/task should not exceed N, and should
+typically equal N.  Note that the default threads/task is 1, as set by
+the "t" keyword of the "-k" "command-line
+switch"_Section_start.html#start_7.  If you do not change this, no
+additional parallelism (beyond MPI) will be invoked on the host
+CPU(s).
+
+You can compare the performance running in different modes:
+  
+run with 1 MPI task/node and N threads/task
+run with N MPI tasks/node and 1 thread/task
+run with settings in between these extremes :ul
+
+Examples of mpirun commands in these modes are shown above.
+
+When using KOKKOS to perform multi-threading, it is important for
+performance to bind both MPI tasks to physical cores, and threads to
+physical cores, so they do not migrate during a simulation.
+
+If you are not certain MPI tasks are being bound (check the defaults
+for your MPI installation), binding can be forced with these flags:
+
+OpenMPI 1.8: mpirun -np 2 -bind-to socket -map-by socket ./lmp_openmpi ...
+Mvapich2 2.0: mpiexec -np 2 -bind-to socket -map-by socket ./lmp_mvapich ... :pre
+
+For binding threads with the KOKKOS OMP option, use thread affinity
+environment variables to force binding.  With OpenMP 3.1 (gcc 4.7 or
+later, intel 12 or later) setting the environment variable
+OMP_PROC_BIND=true should be sufficient.  For binding threads with the
+KOKKOS pthreads option, compile LAMMPS with the KOKKOS HWLOC=yes option, as
+discussed in "Section 2.3.4"_Section_start.html#start_3_4 of the
+manual.
+
+[Running on GPUs:]
+
+Insure the -arch setting in the machine makefile you are using,
+e.g. src/MAKE/Makefile.cuda, is correct for your GPU hardware/software
+(see "this section"_Section_start.html#start_3_4 of the manual for
+details).
+
+The -np setting of the mpirun command should set the number of MPI
+tasks/node to be equal to the # of physical GPUs on the node. 
+
+Use the "-k" "command-line switch"_Section_start.html#start_7 to
+specify the number of GPUs per node, and the number of threads per MPI
+task.  As above for multi-core CPUs (and no GPU), if N is the number
+of physical cores/node, then the number of MPI tasks/node * number of
+threads/task should not exceed N.  With one GPU (and one MPI task) it
+may be faster to use less than all the available cores, by setting
+threads/task to a smaller value.  This is because using all the cores
+on a dual-socket node will incur extra cost to copy memory from the
+2nd socket to the GPU.
+
+Examples of mpirun commands that follow these rules are shown above.
+
+IMPORTANT NOTE: When using a GPU, you will achieve the best
+performance if your input script does not use any fix or compute
+styles which are not yet Kokkos-enabled.  This allows data to stay on
+the GPU for multiple timesteps, without being copied back to the host
+CPU.  Invoking a non-Kokkos fix or compute, or performing I/O for
+"thermo"_thermo_style.html or "dump"_dump.html output will cause data
+to be copied back to the CPU.
+
+You cannot yet assign multiple MPI tasks to the same GPU with the
+KOKKOS package.  We plan to support this in the future, similar to the
+GPU package in LAMMPS.
+
+You cannot yet use both the host (multi-threaded) and device (GPU)
+together to compute pairwise interactions with the KOKKOS package.  We
+hope to support this in the future, similar to the GPU package in
+LAMMPS.
+
+[Running on an Intel Phi:]
+
+Kokkos only uses Intel Phi processors in their "native" mode, i.e.
+not hosted by a CPU.
+
+As illustrated above, build LAMMPS with OMP=yes (the default) and
+MIC=yes.  The latter insures code is correctly compiled for the Intel
+Phi.  The OMP setting means OpenMP will be used for parallelization on
+the Phi, which is currently the best option within Kokkos.  In the
+future, other options may be added.
+
+Current-generation Intel Phi chips have either 61 or 57 cores.  One
+core should be excluded for running the OS, leaving 60 or 56 cores.
+Each core is hyperthreaded, so there are effectively N = 240 (4*60) or
+N = 224 (4*56) cores to run on.
+
+The -np setting of the mpirun command sets the number of MPI
+tasks/node.  The "-k on t Nt" command-line switch sets the number of
+threads/task as Nt.  The product of these 2 values should be N, i.e.
+240 or 224.  Also, the number of threads/task should be a multiple of
+4 so that logical threads from more than one MPI task do not run on
+the same physical core.
+
+Examples of mpirun commands that follow these rules are shown above.
+
+[Restrictions:]
+
+As noted above, if using GPUs, the number of MPI tasks per compute
+node should equal the number of GPUs per compute node.  In the
+future Kokkos will support assigning multiple MPI tasks to a single
+GPU.
+
+Currently Kokkos does not support AMD GPUs due to limits in the
+available backend programming models.  Specifically, Kokkos requires
+extensive C++ support from the Kernel language.  This is expected to
+change in the future.
diff --git a/doc/accelerate_omp.html b/doc/accelerate_omp.html
new file mode 100644
index 0000000000..bc0b24c37c
--- /dev/null
+++ b/doc/accelerate_omp.html
@@ -0,0 +1,197 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.5 USER-OMP package 
+</H4>
+<P>The USER-OMP package was developed by Axel Kohlmeyer at Temple
+University.  It provides multi-threaded versions of most pair styles,
+nearly all bonded styles (bond, angle, dihedral, improper), several
+Kspace styles, and a few fix styles.  The package currently
+uses the OpenMP interface for multi-threading.
+</P>
+<P>Here is a quick overview of how to use the USER-OMP package:
+</P>
+<UL><LI>use the -fopenmp flag for compiling and linking in your Makefile.machine
+<LI>include the USER-OMP package and build LAMMPS
+<LI>use the mpirun command to set the number of MPI tasks/node
+<LI>specify how many threads per MPI task to use
+<LI>use USER-OMP styles in your input script 
+</UL>
+<P>The latter two steps can be done using the "-pk omp" and "-sf omp"
+<A HREF = "Section_start.html#start_7">command-line switches</A> respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the <A HREF = "package.html">package omp</A> or <A HREF = "suffix.html">suffix omp</A> commands
+respectively to your input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>Your compiler must support the OpenMP interface.  You should have one
+or more multi-core CPUs so that multiple threads can be launched by an
+MPI task running on a CPU.
+</P>
+<P><B>Building LAMMPS with the USER-OMP package:</B>
+</P>
+<P>Include the package and build LAMMPS:
+</P>
+<PRE>cd lammps/src
+make yes-user-omp
+make machine 
+</PRE>
+<P>Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both
+the CCFLAGS and LINKFLAGS variables.  For GNU and Intel compilers,
+this flag is "-fopenmp".  Without this flag the USER-OMP styles will
+still be compiled and work, but will not support multi-threading.
+</P>
+<P><B>Run with the USER-OMP package from the command line:</B>
+</P>
+<P>The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+</P>
+<P>You need to choose how many threads per MPI task will be used by the
+USER-OMP package.  Note that the product of MPI tasks * threads/task
+should not exceed the physical number of cores (on a node), otherwise
+performance will suffer.
+</P>
+<P>Use the "-sf omp" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "omp" to styles that support it.  Use
+the "-pk omp Nt" <A HREF = "Section_start.html#start_7">command-line switch</A>, to
+set Nt = # of OpenMP threads per MPI task to use.
+</P>
+<PRE>lmp_machine -sf omp -pk omp 16 -in in.script                       # 1 MPI task on a 16-core node
+mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script           # 4 MPI tasks each with 4 threads on a single 16-core node
+mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script   # ditto on 8 16-core nodes 
+</PRE>
+<P>Note that if the "-sf omp" switch is used, it also issues a default
+<A HREF = "package.html">package omp 0</A> command, which sets the number of threads
+per MPI task via the OMP_NUM_THREADS environment variable.
+</P>
+<P>Using the "-pk" switch explicitly allows for direct setting of the
+number of threads and additional options.  Its syntax is the same as
+the "package omp" command.  See the <A HREF = "package.html">package</A> command doc
+page for details, including the default values used for all its
+options if it is not specified, and how to set the number of threads
+via the OMP_NUM_THREADS environment variable if desired.
+</P>
+<P><B>Or run with the USER-OMP package by editing an input script:</B>
+</P>
+<P>The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+and threads/MPI task is the same.
+</P>
+<P>Use the <A HREF = "suffix.html">suffix omp</A> command, or you can explicitly add an
+"omp" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/omp 2.5 
+</PRE>
+<P>You must also use the <A HREF = "package.html">package omp</A> command to enable the
+USER-OMP package, unless the "-sf omp" or "-pk omp" <A HREF = "Section_start.html#start_7">command-line
+switches</A> were used.  It specifies how many
+threads per MPI task to use, as well as other options.  Its doc page
+explains how to set the number of threads via an environment variable
+if desired.
+</P>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>Depending on which styles are accelerated, you should look for a
+reduction in the "Pair time", "Bond time", "KSpace time", and "Loop
+time" values printed at the end of a run.  
+</P>
+<P>You may see a small performance advantage (5 to 20%) when running a
+USER-OMP style (in serial or parallel) with a single thread per MPI
+task, versus running standard LAMMPS with its standard
+(un-accelerated) styles (in serial or all-MPI parallelization with 1
+task/core).  This is because many of the USER-OMP styles contain
+similar optimizations to those used in the OPT package, as described
+on the OPT package doc page (Section 5.3.6).
+</P>
+<P>With multiple threads/task, the optimal choice of MPI tasks/node and
+OpenMP threads/task can vary a lot and should always be tested via
+benchmark runs for a specific simulation running on a specific
+machine, paying attention to guidelines discussed in the next
+sub-section.
+</P>
+<P>A description of the multi-threading strategy used in the USER-OMP
+package and some performance examples are <A HREF = "http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1">presented
+here</A>
+</P>
+<P><B>Guidelines for best performance:</B>
+</P>
+<P>For many problems on current generation CPUs, running the USER-OMP
+package with a single thread/task is faster than running with multiple
+threads/task.  This is because the MPI parallelization in LAMMPS is
+often more efficient than multi-threading as implemented in the
+USER-OMP package.  The parallel efficiency (in a threaded sense) also
+varies for different USER-OMP styles.
+</P>
+<P>Using multiple threads/task can be more effective under the following
+circumstances:
+</P>
+<UL><LI>Individual compute nodes have a significant number of CPU cores but
+the CPU itself has limited memory bandwidth, e.g. for Intel Xeon 53xx
+(Clovertown) and 54xx (Harpertown) quad core processors. Running one
+MPI task per CPU core will result in significant performance
+degradation, so that running with 4 or even only 2 MPI tasks per node
+is faster.  Running in hybrid MPI+OpenMP mode will reduce the
+inter-node communication bandwidth contention in the same way, but
+offers an additional speedup by utilizing the otherwise idle CPU
+cores. 
+
+<LI>The interconnect used for MPI communication does not provide
+sufficient bandwidth for a large number of MPI tasks per node.  For
+example, this applies to running over gigabit ethernet or on Cray XT4
+or XT5 series supercomputers.  As in the aforementioned case, this
+effect worsens when using an increasing number of nodes. 
+
+<LI>The system has a spatially inhomogeneous particle density which does
+not map well to the <A HREF = "processors.html">domain decomposition scheme</A> or
+<A HREF = "balance.html">load-balancing</A> options that LAMMPS provides.  This is
+because multi-threading achieves parallelism over the number of
+particles, not via their distribution in space. 
+
+<LI>A machine is being used in "capability mode", i.e. near the point
+where MPI parallelism is maxed out.  For example, this can happen when
+using the <A HREF = "kspace_style.html">PPPM solver</A> for long-range
+electrostatics on large numbers of nodes.  The scaling of the KSpace
+calculation (see the <A HREF = "kspace_style.html">kspace_style</A> command) becomes
+the performance-limiting factor.  Using multi-threading allows fewer
+MPI tasks to be invoked and can speed up the long-range solver, while
+increasing overall performance by parallelizing the pairwise and
+bonded calculations via OpenMP.  Likewise additional speedup can
+sometimes be achieved by increasing the length of the Coulombic cutoff
+and thus reducing the work done by the long-range solver.  Using the
+<A HREF = "run_style.html">run_style verlet/split</A> command, which is compatible
+with the USER-OMP package, is an alternative way to reduce the number
+of MPI tasks assigned to the KSpace calculation. 
+</UL>
+<P>Additional performance tips are as follows:
+</P>
+<UL><LI>The best parallel efficiency from <I>omp</I> styles is typically achieved
+when there is at least one MPI task per physical processor,
+i.e. socket or die. 
+
+<LI>It is usually most efficient to restrict threading to a single
+socket, i.e. use one or more MPI tasks per socket. 
+
+<LI>Several current MPI implementations by default use a processor affinity
+setting that restricts each MPI task to a single CPU core.  Using
+multi-threading in this mode will force the threads to share that core
+and thus is likely to be counterproductive.  Instead, binding MPI
+tasks to a (multi-core) socket should solve this issue. 
+</UL>
+<P><B>Restrictions:</B>
+</P>
+<P>None.
+</P>
+</HTML>
diff --git a/doc/accelerate_omp.txt b/doc/accelerate_omp.txt
new file mode 100644
index 0000000000..e2ededf5e7
--- /dev/null
+++ b/doc/accelerate_omp.txt
@@ -0,0 +1,192 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.5 USER-OMP package :h4
+
+The USER-OMP package was developed by Axel Kohlmeyer at Temple
+University.  It provides multi-threaded versions of most pair styles,
+nearly all bonded styles (bond, angle, dihedral, improper), several
+Kspace styles, and a few fix styles.  The package currently
+uses the OpenMP interface for multi-threading.
+
+Here is a quick overview of how to use the USER-OMP package:
+
+use the -fopenmp flag for compiling and linking in your Makefile.machine
+include the USER-OMP package and build LAMMPS
+use the mpirun command to set the number of MPI tasks/node
+specify how many threads per MPI task to use
+use USER-OMP styles in your input script :ul
+
+The latter two steps can be done using the "-pk omp" and "-sf omp"
+"command-line switches"_Section_start.html#start_7 respectively.  Or
+the effect of the "-pk" or "-sf" switches can be duplicated by adding
+the "package omp"_package.html or "suffix omp"_suffix.html commands
+respectively to your input script.
+
+[Required hardware/software:]
+
+Your compiler must support the OpenMP interface.  You should have one
+or more multi-core CPUs so that multiple threads can be launched by an
+MPI task running on a CPU.
+
+[Building LAMMPS with the USER-OMP package:]
+
+Include the package and build LAMMPS:
+
+cd lammps/src
+make yes-user-omp
+make machine :pre
+
+Your src/MAKE/Makefile.machine needs a flag for OpenMP support in both
+the CCFLAGS and LINKFLAGS variables.  For GNU and Intel compilers,
+this flag is "-fopenmp".  Without this flag the USER-OMP styles will
+still be compiled and work, but will not support multi-threading.
+
+[Run with the USER-OMP package from the command line:]
+
+The mpirun or mpiexec command sets the total number of MPI tasks used
+by LAMMPS (one or multiple per compute node) and the number of MPI
+tasks used per node.  E.g. the mpirun command does this via its -np
+and -ppn switches.
+
+You need to choose how many threads per MPI task will be used by the
+USER-OMP package.  Note that the product of MPI tasks * threads/task
+should not exceed the physical number of cores (on a node), otherwise
+performance will suffer.
+
+Use the "-sf omp" "command-line switch"_Section_start.html#start_7,
+which will automatically append "omp" to styles that support it.  Use
+the "-pk omp Nt" "command-line switch"_Section_start.html#start_7, to
+set Nt = # of OpenMP threads per MPI task to use.
+
+lmp_machine -sf omp -pk omp 16 -in in.script                       # 1 MPI task on a 16-core node
+mpirun -np 4 lmp_machine -sf omp -pk omp 4 -in in.script           # 4 MPI tasks each with 4 threads on a single 16-core node
+mpirun -np 32 -ppn 4 lmp_machine -sf omp -pk omp 4 -in in.script   # ditto on 8 16-core nodes :pre
+
+Note that if the "-sf omp" switch is used, it also issues a default
+"package omp 0"_package.html command, which sets the number of threads
+per MPI task via the OMP_NUM_THREADS environment variable.
+
+Using the "-pk" switch explicitly allows for direct setting of the
+number of threads and additional options.  Its syntax is the same as
+the "package omp" command.  See the "package"_package.html command doc
+page for details, including the default values used for all its
+options if it is not specified, and how to set the number of threads
+via the OMP_NUM_THREADS environment variable if desired.
+
+[Or run with the USER-OMP package by editing an input script:]
+
+The discussion above for the mpirun/mpiexec command, MPI tasks/node,
+and threads/MPI task is the same.
+
+Use the "suffix omp"_suffix.html command, or you can explicitly add an
+"omp" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/omp 2.5 :pre
+
+You must also use the "package omp"_package.html command to enable the
+USER-OMP package, unless the "-sf omp" or "-pk omp" "command-line
+switches"_Section_start.html#start_7 were used.  It specifies how many
+threads per MPI task to use, as well as other options.  Its doc page
+explains how to set the number of threads via an environment variable
+if desired.
+
+[Speed-ups to expect:]
+
+Depending on which styles are accelerated, you should look for a
+reduction in the "Pair time", "Bond time", "KSpace time", and "Loop
+time" values printed at the end of a run.  
+
+You may see a small performance advantage (5 to 20%) when running a
+USER-OMP style (in serial or parallel) with a single thread per MPI
+task, versus running standard LAMMPS with its standard
+(un-accelerated) styles (in serial or all-MPI parallelization with 1
+task/core).  This is because many of the USER-OMP styles contain
+similar optimizations to those used in the OPT package, as described
+on the OPT package doc page (Section 5.3.6).
+
+With multiple threads/task, the optimal choice of MPI tasks/node and
+OpenMP threads/task can vary a lot and should always be tested via
+benchmark runs for a specific simulation running on a specific
+machine, paying attention to guidelines discussed in the next
+sub-section.
+
+A description of the multi-threading strategy used in the USER-OMP
+package and some performance examples are "presented
+here"_http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&d=1
+
+[Guidelines for best performance:]
+
+For many problems on current generation CPUs, running the USER-OMP
+package with a single thread/task is faster than running with multiple
+threads/task.  This is because the MPI parallelization in LAMMPS is
+often more efficient than multi-threading as implemented in the
+USER-OMP package.  The parallel efficiency (in a threaded sense) also
+varies for different USER-OMP styles.
+
+Using multiple threads/task can be more effective under the following
+circumstances:
+
+Individual compute nodes have a significant number of CPU cores but
+the CPU itself has limited memory bandwidth, e.g. for Intel Xeon 53xx
+(Clovertown) and 54xx (Harpertown) quad core processors. Running one
+MPI task per CPU core will result in significant performance
+degradation, so that running with 4 or even only 2 MPI tasks per node
+is faster.  Running in hybrid MPI+OpenMP mode will reduce the
+inter-node communication bandwidth contention in the same way, but
+offers an additional speedup by utilizing the otherwise idle CPU
+cores. :ulb,l
+
+The interconnect used for MPI communication does not provide
+sufficient bandwidth for a large number of MPI tasks per node.  For
+example, this applies to running over gigabit ethernet or on Cray XT4
+or XT5 series supercomputers.  As in the aforementioned case, this
+effect worsens when using an increasing number of nodes. :l
+
+The system has a spatially inhomogeneous particle density which does
+not map well to the "domain decomposition scheme"_processors.html or
+"load-balancing"_balance.html options that LAMMPS provides.  This is
+because multi-threading achieves parallelism over the number of
+particles, not via their distribution in space. :l
+
+A machine is being used in "capability mode", i.e. near the point
+where MPI parallelism is maxed out.  For example, this can happen when
+using the "PPPM solver"_kspace_style.html for long-range
+electrostatics on large numbers of nodes.  The scaling of the KSpace
+calculation (see the "kspace_style"_kspace_style.html command) becomes
+the performance-limiting factor.  Using multi-threading allows fewer
+MPI tasks to be invoked and can speed up the long-range solver, while
+increasing overall performance by parallelizing the pairwise and
+bonded calculations via OpenMP.  Likewise additional speedup can
+sometimes be achieved by increasing the length of the Coulombic cutoff
+and thus reducing the work done by the long-range solver.  Using the
+"run_style verlet/split"_run_style.html command, which is compatible
+with the USER-OMP package, is an alternative way to reduce the number
+of MPI tasks assigned to the KSpace calculation. :l,ule
+
+Additional performance tips are as follows:
+
+The best parallel efficiency from {omp} styles is typically achieved
+when there is at least one MPI task per physical processor,
+i.e. socket or die. :ulb,l
+
+It is usually most efficient to restrict threading to a single
+socket, i.e. use one or more MPI tasks per socket. :l
+
+Several current MPI implementations by default use a processor affinity
+setting that restricts each MPI task to a single CPU core.  Using
+multi-threading in this mode will force the threads to share that core
+and thus is likely to be counterproductive.  Instead, binding MPI
+tasks to a (multi-core) socket should solve this issue. :l,ule
+
+[Restrictions:]
+
+None.
diff --git a/doc/accelerate_opt.html b/doc/accelerate_opt.html
new file mode 100644
index 0000000000..1293c2f637
--- /dev/null
+++ b/doc/accelerate_opt.html
@@ -0,0 +1,77 @@
+<HTML>
+<CENTER><A HREF = "Section_packages.html">Previous Section</A> - <A HREF = "http://lammps.sandia.gov">LAMMPS WWW Site</A> -
+<A HREF = "Manual.html">LAMMPS Documentation</A> - <A HREF = "Section_commands.html#comm">LAMMPS Commands</A> 
+</CENTER>
+
+
+
+
+
+
+<HR>
+
+<P><A HREF = "Section_accelerate.html">Return to Section accelerate overview</A>
+</P>
+<H4>5.3.6 OPT package 
+</H4>
+<P>The OPT package was developed by James Fischer (High Performance
+Technologies), David Richie, and Vincent Natoli (Stone Ridge
+Technologies).  It contains a handful of pair styles whose compute()
+methods were rewritten in C++ templated form to reduce the overhead
+due to if tests and other conditional code.
+</P>
+<P>Here is a quick overview of how to use the OPT package:
+</P>
+<UL><LI>include the OPT package and build LAMMPS
+<LI>use OPT pair styles in your input script 
+</UL>
+<P>The last step can be done using the "-sf opt" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.  Or the effect of the "-sf" switch
+can be duplicated by adding a <A HREF = "suffix.html">suffix opt</A> command to your
+input script.
+</P>
+<P><B>Required hardware/software:</B>
+</P>
+<P>None.
+</P>
+<P><B>Building LAMMPS with the OPT package:</B>
+</P>
+<P>Include the package and build LAMMPS:
+</P>
+<PRE>cd lammps/src
+make yes-opt
+make machine 
+</PRE>
+<P>No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+</P>
+<P><B>Run with the OPT package from the command line:</B>
+</P>
+<P>Use the "-sf opt" <A HREF = "Section_start.html#start_7">command-line switch</A>,
+which will automatically append "opt" to styles that support it.
+</P>
+<PRE>lmp_machine -sf opt -in in.script
+mpirun -np 4 lmp_machine -sf opt -in in.script 
+</PRE>
+<P><B>Or run with the OPT package by editing an input script:</B>
+</P>
+<P>Use the <A HREF = "suffix.html">suffix opt</A> command, or you can explicitly add an
+"opt" suffix to individual styles in your input script, e.g.
+</P>
+<PRE>pair_style lj/cut/opt 2.5 
+</PRE>
+<P><B>Speed-ups to expect:</B>
+</P>
+<P>You should see a reduction in the "Pair time" value printed at the end
+of a run.  On most machines for reasonable problem sizes, it will be a
+5 to 20% savings.
+</P>
+<P><B>Guidelines for best performance:</B>
+</P>
+<P>None.  Just try out an OPT pair style to see how it performs.
+</P>
+<P><B>Restrictions:</B>
+</P>
+<P>None.
+</P>
+</HTML>
diff --git a/doc/accelerate_opt.txt b/doc/accelerate_opt.txt
new file mode 100644
index 0000000000..d7e9225720
--- /dev/null
+++ b/doc/accelerate_opt.txt
@@ -0,0 +1,72 @@
+"Previous Section"_Section_packages.html - "LAMMPS WWW Site"_lws -
+"LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+"Return to Section accelerate overview"_Section_accelerate.html
+
+5.3.6 OPT package :h4
+
+The OPT package was developed by James Fischer (High Performance
+Technologies), David Richie, and Vincent Natoli (Stone Ridge
+Technologies).  It contains a handful of pair styles whose compute()
+methods were rewritten in C++ templated form to reduce the overhead
+due to if tests and other conditional code.
+
+Here is a quick overview of how to use the OPT package:
+
+include the OPT package and build LAMMPS
+use OPT pair styles in your input script :ul
+
+The last step can be done using the "-sf opt" "command-line
+switch"_Section_start.html#start_7.  Or the effect of the "-sf" switch
+can be duplicated by adding a "suffix opt"_suffix.html command to your
+input script.
+
+[Required hardware/software:]
+
+None.
+
+[Building LAMMPS with the OPT package:]
+
+Include the package and build LAMMPS:
+
+cd lammps/src
+make yes-opt
+make machine :pre
+
+No additional compile/link flags are needed in your Makefile.machine
+in src/MAKE.
+
+[Run with the OPT package from the command line:]
+
+Use the "-sf opt" "command-line switch"_Section_start.html#start_7,
+which will automatically append "opt" to styles that support it.
+
+lmp_machine -sf opt -in in.script
+mpirun -np 4 lmp_machine -sf opt -in in.script :pre
+
+[Or run with the OPT package by editing an input script:]
+
+Use the "suffix opt"_suffix.html command, or you can explicitly add an
+"opt" suffix to individual styles in your input script, e.g.
+
+pair_style lj/cut/opt 2.5 :pre
+
+[Speed-ups to expect:]
+
+You should see a reduction in the "Pair time" value printed at the end
+of a run.  On most machines for reasonable problem sizes, it will be a
+5 to 20% savings.
+
+[Guidelines for best performance:]
+
+None.  Just try out an OPT pair style to see how it performs.
+
+[Restrictions:]
+
+None.
diff --git a/doc/package.html b/doc/package.html
index 42fb14396b..7097c66588 100644
--- a/doc/package.html
+++ b/doc/package.html
@@ -22,7 +22,10 @@
 <PRE>  <I>cuda</I> args = Ngpu keyword value ...
     Ngpu = # of GPUs per node
     zero or more keyword/value pairs may be appended
-    keywords = <I>gpuID</I> or <I>timing</I> or <I>test</I> or <I>thread</I>
+    keywords = <I>newton</I> or <I>gpuID</I> or <I>timing</I> or <I>test</I> or <I>thread</I>
+      <I>newton</I> = <I>off</I> or <I>on</I>
+        off = set Newton pairwise and bonded flags off (default)
+        on = set Newton pairwise and bonded flags on
       <I>gpuID</I> values = gpu1 .. gpuN
         gpu1 .. gpuN = IDs of the Ngpu GPUs to use
       <I>timing</I> values = none
@@ -39,6 +42,9 @@
       <I>neigh</I> value = <I>yes</I> or <I>no</I>
         yes = neighbor list build on GPU (default)
         no = neighbor list build on CPU
+      <I>newton</I> = <I>off</I> or <I>on</I>
+        off = set Newton pairwise flag off (default and required)
+        on = set Newton pairwise flag on (currently not allowed)
       <I>split</I> = fraction
         fraction = fraction of atoms assigned to GPU (default = 1.0)
       <I>gpuID</I> values = first last
@@ -68,11 +74,24 @@
      <I>tptask</I> value = Ntptask
        Ntptask = max number of threads to use on coprocessor for each MPI task
   <I>kokkos</I> args = keyword value ...
-    one or more keyword/value pairs may be appended
-    keywords = <I>neigh</I> or <I>comm/exchange</I> or <I>comm/forward</I>
+    zero or more keyword/value pairs may be appended
+    keywords = <I>neigh</I> or <I>comm</I> or <I>comm/exchange</I> or <I>comm/forward</I>
       <I>neigh</I> value = <I>full</I> or <I>half/thread</I> or <I>half</I> or <I>n2</I> or <I>full/cluster</I>
+        full = full neighbor list
+        half/thread = half neighbor list built in thread-safe manner
+        half = half neighbor list, not thread-safe, only use when 1 thread/MPI task
+        n2 = non-binning neighbor list build, O(N^2) algorithm
+        full/cluster = full neighbor list with clustered groups of atoms
+      <I>newton</I> = <I>off</I> or <I>on</I>
+        off = set Newton pairwise and bonded flags off (default)
+        on = set Newton pairwise and bonded flags on
+      <I>comm</I> value = <I>no</I> or <I>host</I> or <I>device</I>
+        use value for both comm/exchange and comm/forward
       <I>comm/exchange</I> value = <I>no</I> or <I>host</I> or <I>device</I>
       <I>comm/forward</I> value = <I>no</I> or <I>host</I> or <I>device</I>
+        no = perform communication pack/unpack in non-KOKKOS mode
+        host = perform pack/unpack on host (e.g. with OpenMP threading)
+        device = perform pack/unpack on device (e.g. on GPU)
   <I>omp</I> args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended 
@@ -88,39 +107,59 @@
 <PRE>package gpu 1
 package gpu 1 split 0.75
 package gpu 2 split -1.0
-package cuda gpu/node/special 2 0 2
-package cuda test 3948
-package kokkos neigh half/thread comm/forward device
-package omp 0 neigh yes
+package cuda 2 gpuID 0 2
+package cuda 1 test 3948
+package kokkos neigh half/thread comm device
+package omp 0 neigh no
 package omp 4
 package intel * mixed balance -1 
 </PRE>
 <P><B>Description:</B>
 </P>
-<P>This command invokes package-specific settings.  Currently the
-following packages use it: USER-CUDA, GPU, USER-INTEL, KOKKOS, and
-USER-OMP.
+<P>This command invokes package-specific settings for the various
+accelerator packages available in LAMMPS.  Currently the following
+packages use settings from this command: USER-CUDA, GPU, USER-INTEL,
+KOKKOS, and USER-OMP.
 </P>
-<P>Talk about command line switches
+<P>If this command is specified in an input script, it must be near the
+top of the script, before the simulation box has been defined.  This
+is because it specifies settings that the accelerator packages use in
+their initialization, before a simulation is defined.
 </P>
-<P>When does it have to be invoked
+<P>This command can also be specified from the command-line when
+launching LAMMPS, using the "-pk" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.  The syntax is exactly the same as
+when used in an input script.
 </P>
-<P>To use the accelerated GPU and USER-OMP styles, the use of the package
-command is required.  However, as described in the "Defaults" section
-below, if you use the "-sf gpu" or "-sf omp" <A HREF = "Section_start.html#start_7">command-line
-options</A> to enable use of these styles,
-then default package settings are enabled.  In that case you only need
-to use the package command if you want to change the defaults.
+<P>Note that all of the accelerator packages require the package command
+to be specified (except the OPT package), if the package is to be used
+in a simulation (LAMMPS can be built with an accelerator package
+without using it in a particular simulation).  However, in all cases,
+a default version of the command is typically invoked by other
+accelerator settings.
 </P>
-<P>To use the accelerated USER-CUDA and KOKKOS styles, the package
-command is not required as defaults are assigned internally.  You only
-need to use the package command if you want to change the defaults.
+<P>The USER-CUDA and KOKKOS packages require a "-c on" or "-k on"
+<A HREF = "Section_start.html#start_7">command-line switch</A> respectively, which
+invokes a "package cuda" or "package kokkos" command with default
+settings.
 </P>
-<P>See <A HREF = "Section_accelerate.html">Section_accelerate</A> of the manual for
-more details about using these various packages for accelerating
-LAMMPS calculations.
+<P>For the GPU, USER-INTEL, and USER-OMP packages, if a "-sf gpu" or "-sf
+intel" or "-sf omp" <A HREF = "Section_start.html#start_7">command-line switch</A>
+is used to auto-append accelerator suffixes to various styles in the
+input script, then those switches also invoke a "package gpu",
+"package intel", or "package omp" command with default settings.
 </P>
-<P>Package GPU always sets newton pair off.  Not so for USER-CUDA>
+<P>IMPORTANT NOTE: A package command for a particular style can be
+invoked multiple times when a simulation is set up, e.g. by the "-c
+on", "-k on", "-sf", and "-pk" <A HREF = "Section_start.html#start_7">command-line
+switches</A>, and by using this command in an
+input script.  Each time it is used all of the style options are set,
+either to default values or to specified settings.  I.e. settings from
+previous invocations do not persist across multiple invocations.
+</P>
+<P>See the <A HREF = "Section_accelerate.html">Section Accelerate</A> section of the
+manual for more details about using the various accelerator packages
+for speeding up LAMMPS simulations.
 </P>
 <HR>
 
@@ -133,6 +172,12 @@ exactly one MPI task per GPU, as set by the mpirun or mpiexec command.
 <P>Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 </P>
+<P>The <I>newton</I> keyword sets the Newton flags for pairwise and bonded
+interactions to <I>off</I> or <I>on</I>, the same as the <A HREF = "newton.html">newton</A>
+command allows.  The default is <I>off</I> because this will almost always
+give better performance for the USER-CUDA package.  This means
+more computation is done, but less communication.
+</P>
 <P>The <I>gpuID</I> keyword allows selection of which GPUs on each node will
 be used for a simulation.  GPU IDs range from 0 to N-1 where N is the
 physical number of GPUs/node.  An ID is specified for each of the
@@ -197,6 +242,16 @@ enabled command requires a neighbor list, it will also be built on the
 CPU.  In these cases, it will typically be more efficient to only use
 CPU neighbor list builds.
 </P>
+<P>The <I>newton</I> keyword sets the Newton flags for pairwise (not bonded)
+interactions to <I>off</I> or <I>on</I>, the same as the <A HREF = "newton.html">newton</A>
+command allows.  Currently, only an <I>off</I> value is allowed, since all
+the GPU package pair styles require this setting.  This means more
+computation is done, but less communication.  In the future a value of
+<I>on</I> may be allowed, so the <I>newton</I> keyword is included as an option
+for compatibility with the package command for other accelerator
+styles.  Note that the newton setting for bonded interactions is not
+affected by this keyword.
+</P>
 <P>The <I>split</I> keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < <I>split</I> <
 1.0, a fixed fraction of particles is offloaded to the GPU while force
@@ -327,32 +382,55 @@ generation Xeon Phi chip.
 <P>The <I>kokkos</I> style invokes settings associated with the use of the
 KOKKOS package.
 </P>
-<P>The <I>neigh</I> keyword determines what kinds of neighbor lists are built.
-A value of <I>half</I> uses half-neighbor lists, the same as used by most
-pair styles in LAMMPS.  A value of <I>half/thread</I> uses a threadsafe
-variant of the half-neighbor list.  It should be used instead of
-<I>half</I> when running with threads on a CPU.  A value of <I>full</I> uses a
-full-neighborlist, i.e. f_ij and f_ji are both calculated.  This
-performs twice as much computation as the <I>half</I> option, however that
-can be a win because it is threadsafe and doesn't require atomic
-operations.  A value of <I>full/cluster</I> is an experimental neighbor
-style, where particles interact with all particles within a small
-cluster, if at least one of the clusters particles is within the
-neighbor cutoff range.  This potentially allows for better
-vectorization on architectures such as the Intel Phi.  If also reduces
-the size of the neighbor list by roughly a factor of the cluster size,
-thus reducing the total memory footprint considerably.
+<P>All of the settings are optional keyword/value pairs.  Each has a
+default value as listed below.
 </P>
-<P>The <I>comm/exchange</I> and <I>comm/forward</I> keywords determine whether the
-host or device performs the packing and unpacking of data when
-communicating information between processors.  "Exchange"
+<P>The <I>neigh</I> keyword determines how neighbor lists are built.  A value
+of <I>half</I> uses half-neighbor lists, the same as used by most pair
+styles in LAMMPS.  A value of <I>half/thread</I> uses a thread-safe variant
+of the half-neighbor list.  It should be used instead of <I>half</I> when
+running with more than 1 thread per MPI task on a CPU.  A value of
+<I>n2</I> uses an O(N^2) algorithm to build the neighbor list without
+binning, where N = # of atoms on a processor.  It is typically slower
+than the other methods, which use binning.
+</P>
+<P>A value of <I>full</I> uses a full neighbor list and is the default.  This
+performs twice as much computation as the <I>half</I> option, however that
+is often a win because it is thread-safe and doesn't require atomic
+operations in the calculation of pair forces.  For that reason, <I>full</I>
+is the default setting.  However, when running in MPI-only mode with 1
+thread per MPI task, <I>half</I> neighbor lists will typically be faster,
+just as it is for non-accelerated pair styles.
+</P>
+<P>A value of <I>full/cluster</I> is an experimental neighbor style, where
+particles interact with all particles within a small cluster, if at
+least one of the cluster's particles is within the neighbor cutoff
+range.  This potentially allows for better vectorization on
+architectures such as the Intel Phi.  It also reduces the size of the
+neighbor list by roughly a factor of the cluster size, thus reducing
+the total memory footprint considerably.
+</P>
+<P>The <I>newton</I> keyword sets the Newton flags for pairwise and bonded
+interactions to <I>off</I> or <I>on</I>, the same as the <A HREF = "newton.html">newton</A>
+command allows.  The default is <I>off</I> because this will almost always
+give better performance for the KOKKOS package.  This means more
+computation is done, but less communication.  However, when running in
+MPI-only mode with 1 thread per MPI task, a value of <I>on</I> will
+typically be faster, just as it is for non-accelerated pair styles.
+</P>
+<P>The <I>comm</I> and <I>comm/exchange</I> and <I>comm/forward</I> keywords determine
+whether the host or device performs the packing and unpacking of data
+when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
 "Forward" communication happens every timestep.  The data is for atom
 coordinates and any other atom properties that needs to be updated for
 ghost atoms owned by each processor.
 </P>
-<P>The value options for these keywords are <I>no</I> or <I>host</I> or <I>device</I>.
+<P>The <I>comm</I> keyword is simply a short-cut to set the same value
+for both the <I>comm/exchange</I> and <I>comm/forward</I> keywords.
+</P>
+<P>The value options for all 3 keywords are <I>no</I> or <I>host</I> or <I>device</I>.
 A value of <I>no</I> means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of <I>host</I> means
 to use the host, typically a multi-core CPU, and perform the
@@ -361,10 +439,12 @@ to use the device, typically a GPU, to perform the packing/unpacking
 operation.
 </P>
 <P>The optimal choice for these keywords depends on the input script and
-the hardware used.  The <I>no</I> value is useful for verifying that Kokkos
-code is working correctly.  It may also be the fastest choice when
-using Kokkos styles in MPI-only mode (i.e. with a thread count of 1).
-When running on CPUs or Xeon Phi, the <I>host</I> and <I>device</I> values work
+the hardware used.  The <I>no</I> value is useful for verifying that the
+Kokkos-based <I>host</I> and <I>device</I> values are working correctly.  It may
+also be the fastest choice when using Kokkos styles in MPI-only mode
+(i.e. with a thread count of 1).
+</P>
+<P>When running on CPUs or Xeon Phi, the <I>host</I> and <I>device</I> values work
 identically.  When using GPUs, the <I>device</I> value will typically be
 optimal if all of your styles used in your input script are supported
 by the KOKKOS package.  In this case data can stay on the GPU for many
@@ -468,39 +548,39 @@ setting</A>
 </P>
 <P><B>Default:</B>
 </P>
-<P>To use the USER-CUDA package, the package cuda command must be invoked
-explicitly in your input script or via the "-pk cuda" <A HREF = "Section_start.html#start_7">command-line
-switch</A>.  This will set the # of GPUs/node.
-The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled,
-test = not enabled, and thread = auto.
+<P>For the USER-CUDA package, the default is Ngpu = 1 and the option
+defaults are newton = off, gpuID = 0 to Ngpu-1, timing = not enabled,
+test = not enabled, and thread = auto.  These settings are made
+automatically by the required "-c on" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.  You can change them by using the
+package cuda command in your input script or via the "-pk cuda"
+<A HREF = "Section_start.html#start_7">command-line switch</A>.
 </P>
 <P>For the GPU package, the default is Ngpu = 1 and the option defaults
-are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize =
-pair cutoff + neighbor skin, device = not used.  These settings are
-made automatically if the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line
+are neigh = yes, newton = off, split = 1.0, gpuID = 0 to Ngpu-1, tpa =
+1, binsize = pair cutoff + neighbor skin, device = not used.  These
+settings are made automatically if the "-sf gpu" <A HREF = "Section_start.html#start_7">command-line
 switch</A> is used.  If it is not used, you
 must invoke the package gpu command in your input script or via the
 "-pk gpu" <A HREF = "Section_start.html#start_7">command-line switch</A>.
 </P>
 <P>For the USER-INTEL package, the default is Nphi = 1 and the option
-defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240.  The
-default ghost option is determined by the pair style being used.  This
-value used is output to the screen in the offload report at the end of
-each run.  These settings are made automatically if the "-sf intel"
-<A HREF = "Section_start.html#start_7">command-line switch</A> is used.  If it is
-not used, you must invoke the package intel command in your input
-script or or via the "-pk intel" <A HREF = "Section_start.html#start_7">command-line
-switch</A>.
+defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240.  Note
+that all of these settings, except "prec", are ignored if LAMMPS was
+not built with Xeon Phi coprocessor support.  The default ghost option
+is determined by the pair style being used.  This value is output to
+the screen in the offload report at the end of each run.  These
+settings are made automatically if the "-sf intel" <A HREF = "Section_start.html#start_7">command-line
+switch</A> is used.  If it is not used, you
+must invoke the package intel command in your input script or via
+the "-pk intel" <A HREF = "Section_start.html#start_7">command-line switch</A>.
 </P>
-<P>The default settings for the KOKKOS package are "package kokkos neigh
-full comm/exchange host comm/forward host".  This is the case whether
-the "-sf kk" <A HREF = "Section_start.html#start_7">command-line switch</A> is used
-or not.
-To use the KOKKOS package, the package kokkos command must be invoked
-explicitly in your input script or via the "-pk kokkos" <A HREF = "Section_start.html#start_7">command-line
-switch</A>.  This will set the # of GPUs/node.
-The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled,
-test = not enabled, and thread = auto.
+<P>For the KOKKOS package, the option defaults are neigh = full, newton =
+off, and comm = host.  These settings are made automatically by the
+required "-k on" <A HREF = "Section_start.html#start_7">command-line switch</A>.
+You can change them by using the package kokkos command in your input
+script or via the "-pk kokkos" <A HREF = "Section_start.html#start_7">command-line
+switch</A>.
 </P>
 <P>For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
diff --git a/doc/package.txt b/doc/package.txt
index 9c565f35b2..5a3ad7b506 100644
--- a/doc/package.txt
+++ b/doc/package.txt
@@ -17,7 +17,10 @@ args = arguments specific to the style :l
   {cuda} args = Ngpu keyword value ...
     Ngpu = # of GPUs per node
     zero or more keyword/value pairs may be appended
-    keywords = {gpuID} or {timing} or {test} or {thread}
+    keywords = {newton} or {gpuID} or {timing} or {test} or {thread}
+      {newton} = {off} or {on}
+        off = set Newton pairwise and bonded flags off (default)
+        on = set Newton pairwise and bonded flags on
       {gpuID} values = gpu1 .. gpuN
         gpu1 .. gpuN = IDs of the Ngpu GPUs to use
       {timing} values = none
@@ -34,6 +37,9 @@ args = arguments specific to the style :l
       {neigh} value = {yes} or {no}
         yes = neighbor list build on GPU (default)
         no = neighbor list build on CPU
+      {newton} = {off} or {on}
+        off = set Newton pairwise flag off (default and required)
+        on = set Newton pairwise flag on (currently not allowed)
       {split} = fraction
         fraction = fraction of atoms assigned to GPU (default = 1.0)
       {gpuID} values = first last
@@ -63,11 +69,24 @@ args = arguments specific to the style :l
      {tptask} value = Ntptask
        Ntptask = max number of threads to use on coprocessor for each MPI task
   {kokkos} args = keyword value ...
-    one or more keyword/value pairs may be appended
-    keywords = {neigh} or {comm/exchange} or {comm/forward}
+    zero or more keyword/value pairs may be appended
+    keywords = {neigh} or {newton} or {comm} or {comm/exchange} or {comm/forward}
       {neigh} value = {full} or {half/thread} or {half} or {n2} or {full/cluster}
+        full = full neighbor list
+        half/thread = half neighbor list built in thread-safe manner
+        half = half neighbor list, not thread-safe, only use when 1 thread/MPI task
+        n2 = non-binning neighbor list build, O(N^2) algorithm
+        full/cluster = full neighbor list with clustered groups of atoms
+      {newton} = {off} or {on}
+        off = set Newton pairwise and bonded flags off (default)
+        on = set Newton pairwise and bonded flags on
+      {comm} value = {no} or {host} or {device}
+        use value for both comm/exchange and comm/forward
       {comm/exchange} value = {no} or {host} or {device}
       {comm/forward} value = {no} or {host} or {device}
+        no = perform communication pack/unpack in non-KOKKOS mode
+        host = perform pack/unpack on host (e.g. with OpenMP threading)
+        device = perform pack/unpack on device (e.g. on GPU)
   {omp} args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended 
@@ -82,39 +101,59 @@ args = arguments specific to the style :l
 package gpu 1
 package gpu 1 split 0.75
 package gpu 2 split -1.0
-package cuda gpu/node/special 2 0 2
-package cuda test 3948
-package kokkos neigh half/thread comm/forward device
-package omp 0 neigh yes
+package cuda 2 gpuID 0 2
+package cuda 1 test 3948
+package kokkos neigh half/thread comm device
+package omp 0 neigh no
 package omp 4
 package intel * mixed balance -1 :pre
 
 [Description:]
 
-This command invokes package-specific settings.  Currently the
-following packages use it: USER-CUDA, GPU, USER-INTEL, KOKKOS, and
-USER-OMP.
+This command invokes package-specific settings for the various
+accelerator packages available in LAMMPS.  Currently the following
+packages use settings from this command: USER-CUDA, GPU, USER-INTEL,
+KOKKOS, and USER-OMP.
 
-Talk about command line switches
+If this command is specified in an input script, it must be near the
+top of the script, before the simulation box has been defined.  This
+is because it specifies settings that the accelerator packages use in
+their initialization, before a simulation is defined.
 
-When does it have to be invoked
+This command can also be specified from the command-line when
+launching LAMMPS, using the "-pk" "command-line
+switch"_Section_start.html#start_7.  The syntax is exactly the same as
+when used in an input script.
 
-To use the accelerated GPU and USER-OMP styles, the use of the package
-command is required.  However, as described in the "Defaults" section
-below, if you use the "-sf gpu" or "-sf omp" "command-line
-options"_Section_start.html#start_7 to enable use of these styles,
-then default package settings are enabled.  In that case you only need
-to use the package command if you want to change the defaults.
+Note that all of the accelerator packages require the package command
+to be specified (except the OPT package), if the package is to be used
+in a simulation (LAMMPS can be built with an accelerator package
+without using it in a particular simulation).  However, in all cases,
+a default version of the command is typically invoked by other
+accelerator settings.
 
-To use the accelerated USER-CUDA and KOKKOS styles, the package
-command is not required as defaults are assigned internally.  You only
-need to use the package command if you want to change the defaults.
+The USER-CUDA and KOKKOS packages require a "-c on" or "-k on"
+"command-line switch"_Section_start.html#start_7 respectively, which
+invokes a "package cuda" or "package kokkos" command with default
+settings.
 
-See "Section_accelerate"_Section_accelerate.html of the manual for
-more details about using these various packages for accelerating
-LAMMPS calculations.
+For the GPU, USER-INTEL, and USER-OMP packages, if a "-sf gpu" or "-sf
+intel" or "-sf omp" "command-line switch"_Section_start.html#start_7
+is used to auto-append accelerator suffixes to various styles in the
+input script, then those switches also invoke a "package gpu",
+"package intel", or "package omp" command with default settings.
 
-Package GPU always sets newton pair off.  Not so for USER-CUDA>
+IMPORTANT NOTE: A package command for a particular style can be
+invoked multiple times when a simulation is set up, e.g. by the "-c
+on", "-k on", "-sf", and "-pk" "command-line
+switches"_Section_start.html#start_7, and by using this command in an
+input script.  Each time it is used all of the style options are set,
+either to default values or to specified settings.  I.e. settings from
+previous invocations do not persist across multiple invocations.
+
+See the "Section Accelerate"_Section_accelerate.html section of the
+manual for more details about using the various accelerator packages
+for speeding up LAMMPS simulations.
 
 :line
 
@@ -127,6 +166,12 @@ exactly one MPI task per GPU, as set by the mpirun or mpiexec command.
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
+The {newton} keyword sets the Newton flags for pairwise and bonded
+interactions to {off} or {on}, the same as the "newton"_newton.html
+command allows.  The default is {off} because this will almost always
+give better performance for the USER-CUDA package.  This means
+more computation is done, but less communication.
+
 The {gpuID} keyword allows selection of which GPUs on each node will
 be used for a simulation.  GPU IDs range from 0 to N-1 where N is the
 physical number of GPUs/node.  An ID is specified for each of the
@@ -191,6 +236,16 @@ enabled command requires a neighbor list, it will also be built on the
 CPU.  In these cases, it will typically be more efficient to only use
 CPU neighbor list builds.
 
+The {newton} keyword sets the Newton flags for pairwise (not bonded)
+interactions to {off} or {on}, the same as the "newton"_newton.html
+command allows.  Currently, only an {off} value is allowed, since all
+the GPU package pair styles require this setting.  This means more
+computation is done, but less communication.  In the future a value of
+{on} may be allowed, so the {newton} keyword is included as an option
+for compatibility with the package command for other accelerator
+styles.  Note that the newton setting for bonded interactions is not
+affected by this keyword.
+
 The {split} keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} <
 1.0, a fixed fraction of particles is offloaded to the GPU while force
@@ -321,32 +376,55 @@ generation Xeon Phi chip.
 The {kokkos} style invokes settings associated with the use of the
 KOKKOS package.
 
-The {neigh} keyword determines what kinds of neighbor lists are built.
-A value of {half} uses half-neighbor lists, the same as used by most
-pair styles in LAMMPS.  A value of {half/thread} uses a threadsafe
-variant of the half-neighbor list.  It should be used instead of
-{half} when running with threads on a CPU.  A value of {full} uses a
-full-neighborlist, i.e. f_ij and f_ji are both calculated.  This
-performs twice as much computation as the {half} option, however that
-can be a win because it is threadsafe and doesn't require atomic
-operations.  A value of {full/cluster} is an experimental neighbor
-style, where particles interact with all particles within a small
-cluster, if at least one of the clusters particles is within the
-neighbor cutoff range.  This potentially allows for better
-vectorization on architectures such as the Intel Phi.  If also reduces
-the size of the neighbor list by roughly a factor of the cluster size,
-thus reducing the total memory footprint considerably.
+All of the settings are optional keyword/value pairs.  Each has a
+default value as listed below.
 
-The {comm/exchange} and {comm/forward} keywords determine whether the
-host or device performs the packing and unpacking of data when
-communicating information between processors.  "Exchange"
+The {neigh} keyword determines how neighbor lists are built.  A value
+of {half} uses half-neighbor lists, the same as used by most pair
+styles in LAMMPS.  A value of {half/thread} uses a thread-safe variant
+of the half-neighbor list.  It should be used instead of {half} when
+running with more than 1 thread per MPI task on a CPU.  A value of
+{n2} uses an O(N^2) algorithm to build the neighbor list without
+binning, where N = # of atoms on a processor.  It is typically slower
+than the other methods, which use binning.
+
+A value of {full} uses a full neighbor list and is the default.  This
+performs twice as much computation as the {half} option, however that
+is often a win because it is thread-safe and doesn't require atomic
+operations in the calculation of pair forces.  For that reason, {full}
+is the default setting.  However, when running in MPI-only mode with 1
+thread per MPI task, {half} neighbor lists will typically be faster,
+just as it is for non-accelerated pair styles.
+
+A value of {full/cluster} is an experimental neighbor style, where
+particles interact with all particles within a small cluster, if at
+least one of the cluster's particles is within the neighbor cutoff
+range.  This potentially allows for better vectorization on
+architectures such as the Intel Phi.  It also reduces the size of the
+neighbor list by roughly a factor of the cluster size, thus reducing
+the total memory footprint considerably.
+
+The {newton} keyword sets the Newton flags for pairwise and bonded
+interactions to {off} or {on}, the same as the "newton"_newton.html
+command allows.  The default is {off} because this will almost always
+give better performance for the KOKKOS package.  This means more
+computation is done, but less communication.  However, when running in
+MPI-only mode with 1 thread per MPI task, a value of {on} will
+typically be faster, just as it is for non-accelerated pair styles.
+
+The {comm} and {comm/exchange} and {comm/forward} keywords determine
+whether the host or device performs the packing and unpacking of data
+when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
 "Forward" communication happens every timestep.  The data is for atom
 coordinates and any other atom properties that needs to be updated for
 ghost atoms owned by each processor.
 
-The value options for these keywords are {no} or {host} or {device}.
+The {comm} keyword is simply a short-cut to set the same value
+for both the {comm/exchange} and {comm/forward} keywords.
+
+The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of {host} means
 to use the host, typically a multi-core CPU, and perform the
@@ -355,9 +433,11 @@ to use the device, typically a GPU, to perform the packing/unpacking
 operation.
 
 The optimal choice for these keywords depends on the input script and
-the hardware used.  The {no} value is useful for verifying that Kokkos
-code is working correctly.  It may also be the fastest choice when
-using Kokkos styles in MPI-only mode (i.e. with a thread count of 1).
+the hardware used.  The {no} value is useful for verifying that the
+Kokkos-based {host} and {device} values are working correctly.  It may
+also be the fastest choice when using Kokkos styles in MPI-only mode
+(i.e. with a thread count of 1).
+
 When running on CPUs or Xeon Phi, the {host} and {device} values work
 identically.  When using GPUs, the {device} value will typically be
 optimal if all of your styles used in your input script are supported
@@ -462,39 +542,39 @@ setting"_Section_start.html#start_7
 
 [Default:]
 
-To use the USER-CUDA package, the package cuda command must be invoked
-explicitly in your input script or via the "-pk cuda" "command-line
-switch"_Section_start.html#start_7.  This will set the # of GPUs/node.
-The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled,
-test = not enabled, and thread = auto.
+For the USER-CUDA package, the default is Ngpu = 1 and the option
+defaults are newton = off, gpuID = 0 to Ngpu-1, timing = not enabled,
+test = not enabled, and thread = auto.  These settings are made
+automatically by the required "-c on" "command-line
+switch"_Section_start.html#start_7.  You can change them by using the
+package cuda command in your input script or via the "-pk cuda"
+"command-line switch"_Section_start.html#start_7.
 
 For the GPU package, the default is Ngpu = 1 and the option defaults
-are neigh = yes, split = 1.0, gpuID = 0 to Ngpu-1, tpa = 1, binsize =
-pair cutoff + neighbor skin, device = not used.  These settings are
-made automatically if the "-sf gpu" "command-line
+are neigh = yes, newton = off, split = 1.0, gpuID = 0 to Ngpu-1, tpa =
+1, binsize = pair cutoff + neighbor skin, device = not used.  These
+settings are made automatically if the "-sf gpu" "command-line
 switch"_Section_start.html#start_7 is used.  If it is not used, you
 must invoke the package gpu command in your input script or via the
 "-pk gpu" "command-line switch"_Section_start.html#start_7.
 
 For the USER-INTEL package, the default is Nphi = 1 and the option
-defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240.  The
-default ghost option is determined by the pair style being used.  This
-value used is output to the screen in the offload report at the end of
-each run.  These settings are made automatically if the "-sf intel"
-"command-line switch"_Section_start.html#start_7 is used.  If it is
-not used, you must invoke the package intel command in your input
-script or or via the "-pk intel" "command-line
-switch"_Section_start.html#start_7.
+defaults are prec = mixed, balance = -1, tpc = 4, tptask = 240.  Note
+that all of these settings, except "prec", are ignored if LAMMPS was
+not built with Xeon Phi coprocessor support.  The default ghost option
+is determined by the pair style being used.  This value is output to
+the screen in the offload report at the end of each run.  These
+settings are made automatically if the "-sf intel" "command-line
+switch"_Section_start.html#start_7 is used.  If it is not used, you
+must invoke the package intel command in your input script or via
+the "-pk intel" "command-line switch"_Section_start.html#start_7.
 
-The default settings for the KOKKOS package are "package kokkos neigh
-full comm/exchange host comm/forward host".  This is the case whether
-the "-sf kk" "command-line switch"_Section_start.html#start_7 is used
-or not.
-To use the KOKKOS package, the package kokkos command must be invoked
-explicitly in your input script or via the "-pk kokkos" "command-line
-switch"_Section_start.html#start_7.  This will set the # of GPUs/node.
-The options defaults are gpuID = 0 to Ngpu-1, timing = not enabled,
-test = not enabled, and thread = auto.
+For the KOKKOS package, the option defaults are neigh = full, newton =
+off, and comm = host.  These settings are made automatically by the
+required "-k on" "command-line switch"_Section_start.html#start_7.
+You can change them by using the package kokkos command in your input
+script or via the "-pk kokkos" "command-line
+switch"_Section_start.html#start_7.
 
 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
@@ -502,4 +582,3 @@ the "-sf omp" "command-line switch"_Section_start.html#start_7 is
 used.  If it is not used, you must invoke the package omp command in
 your input script or via the "-pk omp" "command-line
 switch"_Section_start.html#start_7.
-
diff --git a/examples/gpu/in.gpu.melt.2.5 b/examples/gpu/in.gpu.melt.2.5
index fb58e26f24..be59cc1099 100644
--- a/examples/gpu/in.gpu.melt.2.5
+++ b/examples/gpu/in.gpu.melt.2.5
@@ -1,7 +1,6 @@
 # 3d Lennard-Jones melt
 
-newton          off
-package 	gpu force/neigh 0 1 1
+package 	gpu 1
 
 variable	x index 2
 variable	y index 2
diff --git a/examples/gpu/in.gpu.melt.5.0 b/examples/gpu/in.gpu.melt.5.0
index cafef97fa6..00a65a8374 100644
--- a/examples/gpu/in.gpu.melt.5.0
+++ b/examples/gpu/in.gpu.melt.5.0
@@ -1,7 +1,6 @@
 # 3d Lennard-Jones melt
 
-newton          off
-package 	gpu force/neigh 0 1 1 threads_per_atom 8
+package 	gpu 1 tpa 8
 
 variable	x index 2
 variable	y index 2
diff --git a/examples/gpu/in.gpu.phosphate b/examples/gpu/in.gpu.phosphate
index 49108f8b3e..a9b9679ec0 100644
--- a/examples/gpu/in.gpu.phosphate
+++ b/examples/gpu/in.gpu.phosphate
@@ -1,8 +1,7 @@
 # GI-System
 
 units metal
-newton off
-package		gpu force/neigh 0 1 1
+package		gpu 1
 
 atom_style      charge 
 read_data 	data.phosphate
diff --git a/examples/gpu/in.gpu.rhodo b/examples/gpu/in.gpu.rhodo
index 3c14baf0e2..d8eaf46b39 100644
--- a/examples/gpu/in.gpu.rhodo
+++ b/examples/gpu/in.gpu.rhodo
@@ -1,7 +1,6 @@
 # Rhodopsin model
 
-newton off
-package 	gpu force/neigh 0 1 1
+package 	gpu 1
 
 variable	x index 2
 variable	y index 2
diff --git a/examples/intel/in.intel.lc b/examples/intel/in.intel.lc
index 7f0d131dfa..56bb9406b7 100644
--- a/examples/intel/in.intel.lc
+++ b/examples/intel/in.intel.lc
@@ -4,8 +4,8 @@
 # cutoff 4.0 with skin 0.8
 # NPT, T=2.4, P=8.0
 
-package intel * mixed balance $b
-package omp *
+package intel 1 prec mixed balance $b
+package omp 0
 suffix $s
 processors * * * grid numa
 
diff --git a/examples/intel/in.intel.rhodo b/examples/intel/in.intel.rhodo
index 5e65ebca81..0bf753f72d 100644
--- a/examples/intel/in.intel.rhodo
+++ b/examples/intel/in.intel.rhodo
@@ -1,7 +1,7 @@
 # Rhodopsin model
 
-package intel * mixed balance $b
-package omp *
+package intel 1 prec mixed balance $b
+package omp 0
 suffix $s
 
 variable	x index 4
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 6414b58a0e..d170a5343c 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -94,6 +94,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
   _gpu_mode = GPU_NEIGH;
   _particle_split = 1.0;
   int nthreads = 1;
+  int newtonflag = 0;
   int threads_per_atom = -1;
   double binsize = -1;
   char *opencl_flags = NULL;
@@ -107,6 +108,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
       else if (strcmp(arg[iarg]+1,"hybrid") == 0) _gpu_mode = GPU_HYB_NEIGH;
       else error->all(FLERR,"Illegal package gpu command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"newton") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
+      if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
+      else error->all(FLERR,"Illegal package gpu command");
     } else if (strcmp(arg[iarg],"split") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command");
       _particle_split = force->numeric(FLERR,arg[iarg+1]);
@@ -150,9 +156,12 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) :
     error->all(FLERR,"No OpenMP support compiled in");
   #endif
 
-  // set newton_pair = 0 since required by all GPU pair styles
+  // set newton pair flag
+  // require newtonflag = 0 since currently required by all GPU pair styles
 
-  force->newton_pair = 0;
+  if (newtonflag == 1) error->all(FLERR,"Illegal package gpu command");
+
+  force->newton_pair = newtonflag;
   if (force->newton_pair || force->newton_bond) force->newton = 1;
   else force->newton = 0;
 
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 5ddd1bac60..ee1d231894 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -17,6 +17,7 @@
 #include "ctype.h"
 #include "kokkos.h"
 #include "lammps.h"
+#include "force.h"
 #include "neighbor_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "error.h"
@@ -124,19 +125,49 @@ KokkosLMP::~KokkosLMP()
 
 void KokkosLMP::accelerator(int narg, char **arg)
 {
+  // defaults
+
+  neighflag = FULL;
+  int newtonflag = 0;
+  double binsize = 0.0;
+  exchange_comm_classic = forward_comm_classic = 0;
+  exchange_comm_on_host = forward_comm_on_host = 1;
+
   int iarg = 0;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"neigh") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package command");
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"full") == 0) neighflag = FULL;
       else if (strcmp(arg[iarg+1],"half/thread") == 0) neighflag = HALFTHREAD;
       else if (strcmp(arg[iarg+1],"half") == 0) neighflag = HALF;
       else if (strcmp(arg[iarg+1],"n2") == 0) neighflag = N2;
       else if (strcmp(arg[iarg+1],"full/cluster") == 0) neighflag = FULLCLUSTER;
-      else error->all(FLERR,"Illegal package command");
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"binsize") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      binsize = force->numeric(FLERR,arg[iarg+1]);
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"newton") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"comm") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) {
+        exchange_comm_classic = forward_comm_classic = 1;
+      } else if (strcmp(arg[iarg+1],"host") == 0) {
+        exchange_comm_classic = forward_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = 1;
+      } else if (strcmp(arg[iarg+1],"device") == 0) {
+        exchange_comm_classic = forward_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = 0;
+      } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/exchange") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package command");
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) exchange_comm_classic = 1;
       else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_classic = 0;
@@ -144,10 +174,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
       } else if (strcmp(arg[iarg+1],"device") == 0) {
         exchange_comm_classic = 0;
         exchange_comm_on_host = 0;
-      } else error->all(FLERR,"Illegal package command");
+      } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/forward") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package command");
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) forward_comm_classic = 1;
       else if (strcmp(arg[iarg+1],"host") == 0) {
         forward_comm_classic = 0;
@@ -155,10 +185,19 @@ void KokkosLMP::accelerator(int narg, char **arg)
       } else if (strcmp(arg[iarg+1],"device") == 0) {
         forward_comm_classic = 0;
         forward_comm_on_host = 0;
-      } else error->all(FLERR,"Illegal package command");
+      } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
-    } else error->all(FLERR,"Illegal package command");
+    } else error->all(FLERR,"Illegal package kokkos command");
   }
+
+  // set newton flags
+  // set neighbor binsize, same as neigh_modify command
+
+  force->newton = force->newton_pair = force->newton_bond = newtonflag;
+
+  neighbor->binsize_user = binsize;
+  if (binsize <= 0.0) neighbor->binsizeflag = 0;
+  else neighbor->binsizeflag = 1;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
index 271c377397..4afb212853 100644
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@@ -47,6 +47,8 @@
 
 using namespace LAMMPS_NS;
 
+/* ---------------------------------------------------------------------- */
+
 Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
 {
   cuda_exists = true;
@@ -55,7 +57,9 @@ Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
   if (universe->me == 0) printf("# Using LAMMPS_CUDA \n");
 
   shared_data.me = universe->me;
+
   device_set = false;
+  devicelist = NULL;
 
   Cuda_Cuda_GetCompileSettings(&shared_data);
 
@@ -148,12 +152,16 @@ Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
   //cCudaData<double, float, yx >
 }
 
+/* ---------------------------------------------------------------------- */
+
 Cuda::~Cuda()
 {
   print_timings();
 
   if (universe->me == 0) printf("# CUDA: Free memory...\n");
 
+  delete [] devicelist;
+
   delete cu_q;
   delete cu_x;
   delete cu_v;
@@ -197,21 +205,38 @@ Cuda::~Cuda()
   }
 }
 
-void Cuda::accelerator(int narg, char** arg)
-{
-  if (device_set) return;
-  if (universe->me == 0) printf("# CUDA: Activate GPU \n");
+/* ----------------------------------------------------------------------
+   package cuda command
+   can be invoked multiple times: -c on, -pk, package command
+   can only init GPUs once in activate(), so just store params here
+------------------------------------------------------------------------- */
 
-  int pppn = force->inumeric(FLERR,arg[0]);
+void Cuda::accelerator(int narg, char **arg)
+{
+  // this error should not happen
+
+  if (device_set) error->all(FLERR,"USER-CUDA device is already activated");
+
+  // pppn = # of GPUs/node
+
+  pppn = force->inumeric(FLERR,arg[0]);
   if (pppn <= 0) error->all(FLERR,"Illegal package cuda command");
 
   // optional args
 
-  int* devicelist = NULL;
+  delete [] devicelist;
+  devicelist = NULL;
+  int newtonflag = 0;
 
   int iarg = 1;
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"gpuID") == 0) {
+    if (strcmp(arg[iarg],"newton") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
+      if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
+      else error->all(FLERR,"Illegal package cuda command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"gpuID") == 0) {
       if (iarg+pppn+1 > narg) error->all(FLERR,"Illegal package cuda command");
       devicelist = new int[pppn];
       for (int k = 0; k < pppn; k++)
@@ -255,6 +279,23 @@ void Cuda::accelerator(int narg, char** arg)
     } else error->all(FLERR,"Illegal package cuda command");
   }
 
+  // set newton flags
+
+  force->newton = force->newton_pair = force->newton_bond = newtonflag;
+}
+
+/* ----------------------------------------------------------------------
+   activate the GPUs
+   only done once with whatever settings used by the last package command
+------------------------------------------------------------------------- */
+
+void Cuda::activate()
+{
+  if (device_set) return;
+  device_set = true;
+
+  if (universe->me == 0) printf("# CUDA: Activate GPU \n");
+
   CudaWrapper_Init(0, (char**)0, universe->me, pppn, devicelist);
   //if(shared_data.overlap_comm)
   CudaWrapper_AddStreams(3);
@@ -288,11 +329,11 @@ void Cuda::accelerator(int narg, char** arg)
 
   cu_binned_id  = 0;
   cu_binned_idnew = 0;
-  device_set = true;
   allocate();
-  delete devicelist;
 }
 
+/* ---------------------------------------------------------------------- */
+
 void Cuda::setSharedDataZero()
 {
   MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
@@ -338,7 +379,6 @@ void Cuda::setSharedDataZero()
 
 void Cuda::allocate()
 {
-  accelerator(0, NULL);
   MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
 
   if(not cu_virial) {
@@ -413,7 +453,6 @@ void Cuda::setDomainParams()
 void Cuda::checkResize()
 {
   MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
-  accelerator(0, NULL);
   cuda_shared_atom* cu_atom = & shared_data.atom;
   cuda_shared_pair* cu_pair = & shared_data.pair;
   cu_atom->q_flag      = atom->q_flag;
diff --git a/src/USER-CUDA/cuda.h b/src/USER-CUDA/cuda.h
index 80a0aab4ca..1299bc5a35 100644
--- a/src/USER-CUDA/cuda.h
+++ b/src/USER-CUDA/cuda.h
@@ -47,6 +47,7 @@ class Cuda : protected Pointers
     void allocate();
 
     void accelerator(int, char**);
+    void activate();
 
     void setSharedDataZero();
     void setSystemParams();
@@ -148,6 +149,9 @@ class Cuda : protected Pointers
     int copy_buffersize;
 
   private:
+    int pppn;                  // number of GPUs/node
+    int *devicelist;           // IDs of GPUs
+
     std::map<class NeighList*, class CudaNeighList*> neigh_lists;
 };
 }
diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp
index 11dc7f0878..d250b5e9d3 100644
--- a/src/USER-CUDA/domain_cuda.cpp
+++ b/src/USER-CUDA/domain_cuda.cpp
@@ -61,7 +61,6 @@ DomainCuda::DomainCuda(LAMMPS* lmp) : Domain(lmp)
 
 void DomainCuda::init()
 {
-  cuda->accelerator(0, NULL);
   Domain::init();
 
   if(not cuda->finished_run) {
@@ -77,6 +76,12 @@ void DomainCuda::init()
 
 void DomainCuda::set_global_box()
 {
+  // one-time activation of CUDA
+  // do it here, b/c is now too late for further package commands
+  // activation must occur before any USER-CUDA class communicates with GPUs
+
+  cuda->activate();
+
   Domain::set_global_box();
 
   if(not cuda->finished_run) {
diff --git a/src/USER-CUDA/fix_shake_cuda.cpp b/src/USER-CUDA/fix_shake_cuda.cpp
index 28b7192e0d..a8747d7b13 100644
--- a/src/USER-CUDA/fix_shake_cuda.cpp
+++ b/src/USER-CUDA/fix_shake_cuda.cpp
@@ -57,7 +57,6 @@ FixShakeCuda::FixShakeCuda(LAMMPS* lmp, int narg, char** arg) :
   if(atom->map_style != 1)
     error->all(FLERR, "Fix shake/cuda needs atom map style array. In particular it does not currently work with hash-tables.");
 
-  cuda->accelerator(0, NULL);
   MPI_Comm_rank(world, &me);
   MPI_Comm_size(world, &nprocs);
   neighbor_step = true;
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
index 83dd79d7d2..ccc7a5897a 100644
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
@@ -69,7 +69,6 @@ void PairLJCharmmCoulCharmmCuda::allocate()
         if(! allocated) PairLJCharmmCoulCharmm::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 cuda->shared_data.pair.coeff1  = lj1;
                 cuda->shared_data.pair.coeff2  = lj2;
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
index 340116959c..9c4e1634eb 100644
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
@@ -69,7 +69,6 @@ void PairLJCharmmCoulCharmmImplicitCuda::allocate()
         if(! allocated) PairLJCharmmCoulCharmmImplicit::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 cuda->shared_data.pair.coeff1  = lj1;
                 cuda->shared_data.pair.coeff2  = lj2;
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
index 315f7ff80e..8da8fdb677 100644
--- a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
@@ -76,7 +76,6 @@ void PairLJCharmmCoulLongCuda::allocate()
         if(! allocated) PairLJCharmmCoulLong::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 //cuda->shared_data.pair.cut     = cut_lj;
                 cuda->shared_data.pair.coeff1  = lj1;
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
index 44ead2c2fc..13b53fb0b4 100644
--- a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
@@ -69,7 +69,6 @@ void PairLJGromacsCoulGromacsCuda::allocate()
         if(! allocated) PairLJGromacsCoulGromacs::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 cuda->shared_data.pair.coeff1  = lj1;
                 cuda->shared_data.pair.coeff2  = lj2;
diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
index c8a09aaf7f..f50fccf290 100644
--- a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
@@ -69,7 +69,6 @@ void PairLJGromacsCuda::allocate()
         if(! allocated) PairLJGromacs::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 cuda->shared_data.pair.cut = cut;
                 cuda->shared_data.pair.cut_inner = cut_inner;
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.cpp b/src/USER-CUDA/pair_lj_smooth_cuda.cpp
index 62a7a7378d..ef4750b389 100644
--- a/src/USER-CUDA/pair_lj_smooth_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_smooth_cuda.cpp
@@ -69,7 +69,6 @@ void PairLJSmoothCuda::allocate()
         if(! allocated) PairLJSmooth::allocate();
         if(! allocated2)
         {
-                cuda->accelerator(0,NULL);
                 allocated2 = true;
                 cuda->shared_data.pair.cut = cut;
                 cuda->shared_data.pair.cut_inner = cut_inner;
diff --git a/src/input.cpp b/src/input.cpp
index 9ca5383c36..af1bca7f86 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -644,7 +644,7 @@ void Input::clear()
   if (narg > 0) error->all(FLERR,"Illegal clear command");
   lmp->destroy();
   lmp->create();
-  lmp->post_create();
+  lmp->post_create(0,NULL,NULL,NULL);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/lammps.cpp b/src/lammps.cpp
index c1fc855e53..6de1a5644c 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -13,6 +13,7 @@
 
 #include "mpi.h"
 #include "string.h"
+#include "ctype.h"
 #include "lammps.h"
 #include "style_angle.h"
 #include "style_atom.h"
@@ -98,58 +99,9 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
   int wdfirst,wdlast;
   int kkfirst,kklast;
 
-#if 0
-  // read .lammpsrc in home and current directory for overriding defaults
-
-  const char *rcpath;
-  char *homepath = NULL;
-#ifdef _WIN32
-  const char rcname[] = "lammps.rc";
-#else
-  const char rcname[] = ".lammpsrc";
-  const char *homedir = getenv("HOME");
-  if (homedir) {
-    int len = strlen(homedir) + strlen(rcname);
-    homepath = new char[len+2];
-    strcpy(homepath,homedir);
-    strcat(homepath,"/");
-    strcat(homepath,rcname);
-  }
-#endif
-  FILE *fd;
-  do {
-    if (homepath) rcpath = homepath;
-    else rcpath = rcname;
-    fd = fopen(rcpath,"r");
-    if (fd) {
-       char linebuf[1024];
-       char *key, *value;
-
-       // loop through file
-       while(1) {
-         fgets(linebuf,1024,fd);
-         if (feof(fd) || ferror(fd)) break;
-
-         // truncate line at comment character, if present
-         if ((key = strstr(linebuf,"#"))) *key = '\0';
-
-         key = strtok(linebuf," \t\n\r\f");
-         value = strtok(NULL," \t\n\r\f");
-
-         // skip empty lines
-         if (key == NULL) continue;
-
-       }
-       fclose(fd);
-    }
-    if (homepath) {
-      delete[] homepath;
-      homepath = NULL;
-    }
-  } while(rcpath != rcname);
-#endif
-
-  // parsing command line flags
+ int npack = 0;
+  int *pfirst = NULL;
+  int *plast = NULL;
 
   int iarg = 1;
   while (iarg < narg) {
@@ -224,6 +176,22 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
       kkfirst = iarg;
       while (iarg < narg && arg[iarg][0] != '-') iarg++;
       kklast = iarg;
+    } else if (strcmp(arg[iarg],"-package") == 0 ||
+               strcmp(arg[iarg],"-pk") == 0) {
+      if (iarg+2 > narg)
+        error->universe_all(FLERR,"Invalid command-line argument");
+      memory->grow(pfirst,npack+1,"lammps:pfirst");
+      memory->grow(plast,npack+1,"lammps:plast");
+      // delimit args for package command invocation
+      // any package arg with leading "-" will be followed by numeric digit
+      iarg++;
+      pfirst[npack] = iarg;
+      while (iarg < narg) {
+        if (arg[iarg][0] != '-') iarg++;
+        else if (isdigit((unsigned char) arg[iarg][1])) iarg++;
+        else break;
+      }
+      plast[npack++] = iarg;
     } else if (strcmp(arg[iarg],"-suffix") == 0 ||
                strcmp(arg[iarg],"-sf") == 0) {
       if (iarg+2 > narg)
@@ -539,7 +507,9 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
   // allocate top-level classes
 
   create();
-  post_create();
+  post_create(npack,pfirst,plast,arg);
+  memory->destroy(pfirst);
+  memory->destroy(plast);
 
   // if helpflag set, print help and quit
 
@@ -662,15 +632,18 @@ void LAMMPS::create()
      so that package-specific core classes have been instantiated
 ------------------------------------------------------------------------- */
 
-void LAMMPS::post_create()
+void LAMMPS::post_create(int npack, int *pfirst, int *plast, char **arg)
 {
-  if (!suffix_enable) return;
+  // default package commands triggered by "-c on" and "-k on"
+
+  if (cuda && cuda->cuda_exists) input->one("package cuda 1");
+  if (kokkos && kokkos->kokkos_exists) input->one("package kokkos");
 
   // suffix will always be set if suffix_enable = 1
-  // USER-CUDA and KOKKOS have package classes instantiated if enabled
-  //   via "-c on" and "-k on"
-  // GPU, INTEL, USER-OMP provide their own fixes which will have
-  //   been compiled with LAMMPS if those packages were installed
+  // check that USER-CUDA and KOKKOS package classes were instantiated
+  // check that GPU, INTEL, USER-OMP fixes were compiled with LAMMPS
+
+  if (!suffix_enable) return;
 
   if (strcmp(suffix,"cuda") == 0 && (cuda == NULL || cuda->cuda_exists == 0))
     error->all(FLERR,"Using suffix cuda without USER-CUDA package enabled");
@@ -700,6 +673,22 @@ void LAMMPS::post_create()
   if (suffix2) {
     if (strcmp(suffix,"omp") == 0) input->one("package omp 0");
   }
+
+  // invoke any command-line package commands
+
+  if (npack) {
+    char str[128];
+    for (int i = 0; i < npack; i++) {
+      strcpy(str,"package");
+      for (int j = pfirst[i]; j < plast[i]; j++) {
+        if (strlen(str) + strlen(arg[j]) + 2 > 128)
+          error->all(FLERR,"Too many -pk arguments in command line");
+        strcat(str," ");
+        strcat(str,arg[j]);
+      }
+      input->one(str);
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -709,9 +698,6 @@ void LAMMPS::post_create()
 
 void LAMMPS::init()
 {
-  if (cuda) cuda->accelerator(0,NULL);
-  if (kokkos) kokkos->accelerator(0,NULL);
-
   update->init();
   force->init();         // pair must come after update due to minimizer
   domain->init();
@@ -795,6 +781,7 @@ void help_message(FILE *fp)
         " -kokkos on/off ...           : turn KOKKOS mode on or off     (-k)\n"
         " -log none/<filename>         : where to send log output       (-l)\n"
         " -nocite                      : disable writing log.cite file  (-nc)\n"
+        " -package style ...           : invoke package command (-pk)\n"
         " -partition <partition size>  : assign partition sizes         (-p)\n"
         " -plog <basename>             : basename for partition logs    (-pl)\n"
         " -pscreen <basename>          : basename for partition screens (-ps)\n"
diff --git a/src/lammps.h b/src/lammps.h
index 778cdea871..0af9721708 100644
--- a/src/lammps.h
+++ b/src/lammps.h
@@ -54,7 +54,7 @@ class LAMMPS {
   LAMMPS(int, char **, MPI_Comm);
   ~LAMMPS();
   void create();
-  void post_create();
+  void post_create(int, int *, int *, char **);
   void init();
   void destroy();
 
diff --git a/src/modify.cpp b/src/modify.cpp
index 47bf41cd40..3abb15dd44 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -661,7 +661,7 @@ void Modify::add_fix(int narg, char **arg, int trysuffix)
   // MUST change NEXCEPT above when add new fix to this list
 
   const char *exceptions[NEXCEPT] = 
-    {"GPU","OMP","Intel","property/atom","cmap"};
+    {"GPU","OMP","INTEL","property/atom","cmap"};
 
   if (domain->box_exist == 0) {
     int m;
diff --git a/src/neighbor.h b/src/neighbor.h
index 05a8622d04..a0271a5e01 100644
--- a/src/neighbor.h
+++ b/src/neighbor.h
@@ -38,6 +38,9 @@ class Neighbor : protected Pointers {
   double cutneighmax;              // max neighbor cutoff for all type pairs
   double *cuttype;                 // for each type, max neigh cut w/ others
 
+  int binsizeflag;                 // user-chosen bin size
+  double binsize_user;             // set externally by some accelerator pkgs
+
   bigint ncalls;                   // # of times build has been called
   bigint ndanger;                  // # of dangerous builds
   bigint lastcall;                 // timestep of last neighbor::build() call
@@ -121,9 +124,6 @@ class Neighbor : protected Pointers {
   int mbinx,mbiny,mbinz;
   int mbinxlo,mbinylo,mbinzlo;
 
-  int binsizeflag;                 // user-chosen bin size
-  double binsize_user;
-
   double binsizex,binsizey,binsizez;  // actual bin sizes and inverse sizes
   double bininvx,bininvy,bininvz;
 

USER-CUDA	for NVIDIA GPUs
GPU	for NVIDIA GPUs as well as OpenCL support
USER-INTEL	for Intel CPUs and Intel Xeon Phi
KOKKOS	for GPUs, Intel Xeon Phi, and OpenMP threading
USER-OMP	for OpenMP threading
OPT	generic CPU optimizations
USER-CUDA	for NVIDIA GPUs
GPU	for NVIDIA GPUs as well as OpenCL support
USER-INTEL	for Intel CPUs and Intel Xeon Phi
KOKKOS	for GPUs, Intel Xeon Phi, and OpenMP threading
USER-OMP	for OpenMP threading
OPT	generic CPU optimizations
build the accelerator library	only for USER-CUDA and GPU packages
install the accelerator package	make yes-opt, make yes-user-intel, etc
add compile/link flags to Makefile.machine	in src/MAKE, only for USER-INTEL, KOKKOS, USER-OMP packages
re-build LAMMPS	make machine
run a LAMMPS simulation	lmp_machine < in.script
enable the accelerator package	via "-c on" and "-k on" command-line switches, only for USER-CUDA and KOKKOS packages
set any needed options for the package	via "-pk" command-line switch or package command, only if defaults need to be changed
use accelerated styles in your input script	via "-sf" command-line switch or suffix command