From 143aa917f201565ffb597083966dd5517f46c486 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Fri, 2 Dec 2011 15:47:30 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7273
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 doc/Section_errors.html |  17 ++++++
 doc/Section_errors.txt  |  17 ++++++
 doc/Section_start.html  |  22 +++++---
 doc/Section_start.txt   |  22 +++++---
 doc/package.html        | 113 ++++++++++++++++++++++++++++++++--------
 doc/package.txt         | 113 ++++++++++++++++++++++++++++++++--------
 doc/pair_coul.html      |   9 ++--
 doc/pair_coul.txt       |   3 +-
 8 files changed, 255 insertions(+), 61 deletions(-)
diff --git a/doc/Section_errors.html b/doc/Section_errors.html
index 4330576f5c..58ea10e3ff 100644
--- a/doc/Section_errors.html
+++ b/doc/Section_errors.html
@@ -181,6 +181,12 @@ the bond topologies you have defined.
 
 <DD>GPU acceleration requires fix gpu in the input script. 
 
+<DT><I>Accelerator sharing is not currently supported on system.</I> 
+
+<DD>You cannot use more MPI processes than accelerators on the
+system as currently configured. For NVIDIA GPUs, the compute
+mode must be changed using nvidia-smi to support sharing. 
+
 <DT><I>All angle coeffs are not set</I> 
 
 <DD>All angle coefficients must be set in the data file or by the
@@ -1205,6 +1211,11 @@ in LAMMPS.
 
 <DD>No atoms in system have a non-zero charge. 
 
+<DT><I>Cannot use neigh_modify exclude with GPU neighbor builds</I> 
+
+<DD>This is a current limitation of the GPU implementation
+in LAMMPS. 
+
 <DT><I>Cannot use neighbor bins - box size << cutoff</I> 
 
 <DD>Too many neighbor bins will be created.  This typically happens when
@@ -5799,6 +5810,12 @@ length in that dimension.  E.g. the xy tilt must be between -half and
 
 <DD>Self-explanatory. 
 
+<DT><I>Unable to initialize accelerator for use</I> 
+
+<DD>One or more specified accelerator(s) cannot currently be used by LAMMPS.
+This can happen if the accelerator is already in use by another 
+process. 
+
 <DT><I>Unbalanced quotes in input line</I> 
 
 <DD>No matching end double quote was found following a leading double
diff --git a/doc/Section_errors.txt b/doc/Section_errors.txt
index dab3e05cf6..2bf511d925 100644
--- a/doc/Section_errors.txt
+++ b/doc/Section_errors.txt
@@ -178,6 +178,12 @@ the bond topologies you have defined. :dd
 
 GPU acceleration requires fix gpu in the input script. :dd
 
+{Accelerator sharing is not currently supported on system.} :dt
+
+You cannot use more MPI processes than accelerators on the
+system as currently configured. For NVIDIA GPUs, the compute
+mode must be changed using nvidia-smi to support sharing. :dd
+
 {All angle coeffs are not set} :dt
 
 All angle coefficients must be set in the data file or by the
@@ -1202,6 +1208,11 @@ in LAMMPS. :dd
 
 No atoms in system have a non-zero charge. :dd
 
+{Cannot use neigh_modify exclude with GPU neighbor builds} :dt
+
+This is a current limitation of the GPU implementation
+in LAMMPS. :dd
+
 {Cannot use neighbor bins - box size << cutoff} :dt
 
 Too many neighbor bins will be created.  This typically happens when
@@ -5796,6 +5807,12 @@ Self-explanatory. :dd
 
 Self-explanatory. :dd
 
+{Unable to initialize accelerator for use} :dt
+
+One or more specified accelerator(s) cannot currently be used by LAMMPS.
+This can happen if the accelerator is already in use by another 
+process. :dd
+
 {Unbalanced quotes in input line} :dt
 
 No matching end double quote was found following a leading double
diff --git a/doc/Section_start.html b/doc/Section_start.html
index bfcfc00a79..0d871eb977 100644
--- a/doc/Section_start.html
+++ b/doc/Section_start.html
@@ -998,12 +998,22 @@ lj/cut</A> variant, with style names lj/cut/opt, lj/cut/omp,
 lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
 explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
 -suffix switch is used, you do not need to modify your input script.
-The specified suffix (opt,omp,gpu,cuda) is automatically appended whenever
-your input script command creates a new <A HREF = "atom_style.html">atom</A>,
-<A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>, <A HREF = "compute.html">compute</A>, or
-<A HREF = "run_style.html">run</A> style.  atom, pair, fix, compute, or integrate
-style.  If the variant version does not exist, the standard version is
-created.
+The specified suffix (opt,omp,gpu,cuda) is automatically appended
+whenever your input script command creates a new
+<A HREF = "atom_style.html">atom</A>, <A HREF = "pair_style.html">pair</A>, <A HREF = "fix.html">fix</A>,
+<A HREF = "compute.html">compute</A>, or <A HREF = "run_style.html">run</A> style.  If the variant
+version does not exist, the standard version is created.
+</P>
+<P>For the GPU package, using this command-line switch also invokes the
+default GPU settings, as if the command "package gpu force/neigh 0 0
+1" were used at the top of your input script.  These settings can be
+changed by using the <A HREF = "package.html">package gpu</A> command in your script
+if desired.
+</P>
+<P>For the OMP package, using this command-line switch also invokes the
+default OMP settings, as if the command "package omp *" were used at
+the top of your input script.  These settings can be changed by using
+the <A HREF = "package.html">package omp</A> command in your script if desired.
 </P>
 <P>The <A HREF = "suffix.html">suffix</A> command can also set a suffix and it can also
 turn off/on any suffix setting made via the command line.
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index 12492f09f3..f2cb42e1b4 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -990,12 +990,22 @@ lj/cut"_pair_lj.html variant, with style names lj/cut/opt, lj/cut/omp,
 lj/cut/gpu, or lj/cut/cuda.  A variant styles can be specified
 explicitly in your input script, e.g. pair_style lj/cut/gpu.  If the
 -suffix switch is used, you do not need to modify your input script.
-The specified suffix (opt,omp,gpu,cuda) is automatically appended whenever
-your input script command creates a new "atom"_atom_style.html,
-"pair"_pair_style.html, "fix"_fix.html, "compute"_compute.html, or
-"run"_run_style.html style.  atom, pair, fix, compute, or integrate
-style.  If the variant version does not exist, the standard version is
-created.
+The specified suffix (opt,omp,gpu,cuda) is automatically appended
+whenever your input script command creates a new
+"atom"_atom_style.html, "pair"_pair_style.html, "fix"_fix.html,
+"compute"_compute.html, or "run"_run_style.html style.  If the variant
+version does not exist, the standard version is created.
+
+For the GPU package, using this command-line switch also invokes the
+default GPU settings, as if the command "package gpu force/neigh 0 0
+1" were used at the top of your input script.  These settings can be
+changed by using the "package gpu"_package.html command in your script
+if desired.
+
+For the OMP package, using this command-line switch also invokes the
+default OMP settings, as if the command "package omp *" were used at
+the top of your input script.  These settings can be changed by using
+the "package omp"_package.html command in your script if desired.
 
 The "suffix"_suffix.html command can also set a suffix and it can also
 turn off/on any suffix setting made via the command line.
diff --git a/doc/package.html b/doc/package.html
index b900dd4b7a..9b750c49af 100644
--- a/doc/package.html
+++ b/doc/package.html
@@ -24,7 +24,18 @@
     first = ID of first GPU to be used on each node
     last = ID of last GPU to be used on each node
     split = fraction of particles assigned to the GPU
-  <I>cuda</I> args = to be determined
+  <I>cuda</I> args = one or more keyword/value pairs may be appended
+    keywords = <I>gpu/node</I> or <I>gpu/node/special</I> or <I>timing</I> or <I>test</I> or <I>override/bpa</I>
+    gpu/node values = N
+      N = number of GPUs to be used per node
+    gpu/node/special values = N gpu1 .. gpuN
+      N = number of GPUs to be used per node
+      gpu1 .. gpuN = N IDs of the GPUs to use
+    timing values = none
+    test values = id
+      id = atom-ID of a test particle
+    override/bpa values = flag
+      flag = 0 for TpA algorithm, 1 for BpA algorithm 
   <I>omp</I> args = Nthreads mode
     Nthreads = # of OpenMP threads to associate with each MPI process
     mode = force or force/neigh (optional) 
@@ -45,17 +56,25 @@ package omp 4 force
 <P>This command invokes package-specific settings.  Currently the
 following packages use it: GPU, USER-CUDA, and USER-OMP.
 </P>
+<P>To use the accelerated GPU and USER-OMP styles, the use of the package
+command is required.  However, as described in the "Defaults" section
+below, if you use the "-sf gpu" or "-sf omp" <A HREF = "Section_start.html#start_6">command-line
+options</A> to enable use of these styles,
+then default package settings are enabled.  In that case you only need
+to use the package command if you want to change the defaults.
+</P>
+<P>To use the accelerated USER-CUDA styles, the package command is not
+required as defaults are assigned internally.  You only need to use
+the package command if you want to change the defaults.
+</P>
 <P>See <A HREF = "Section_accelerate.html">this section</A> of the manual for more
-details about using these various packages for accelerating
-a LAMMPS calculation.
+details about using these various packages for accelerating LAMMPS
+calculations.
 </P>
 <HR>
 
 <P>The <I>gpu</I> style invokes options associated with the use of the GPU
-package.  It allows you to select and initialize GPUs to be used for
-acceleration via this package and configure how the GPU acceleration
-is performed.  These settings are required in order to use any style
-with GPU acceleration.
+package. 
 </P>
 <P>The <I>mode</I> setting specifies where neighbor list calculations will be
 performed.  If <I>mode</I> is force, neighbor list calculation is performed
@@ -102,7 +121,54 @@ the other particles.
 <HR>
 
 <P>The <I>cuda</I> style invokes options associated with the use of the
-USER-CUDA package.  These still need to be documented.
+USER-CUDA package.  
+</P>
+<P>The <I>gpu/node</I> setting specifies the number <I>N</I> of GPUs to be used on
+each node.  An MPI process with rank <I>K</I> will use the GPU (K mod N).
+This implies that processes should be assigned with successive ranks
+on each node, which is the default with most (or even all) MPI
+implementations. The default value for <I>N</I> is 2.
+</P>
+<P>The <I>gpu/node/special</I> setting also specifies the number (N) of GPUs
+to be used on each node, but allows more control over their
+specification.  An MPI process with rank <I>K</I> will use the GPU <I>gpuI</I>
+with I = (K mod N) + 1. This implies that processes should be assigned
+with successive ranks on each node, which is the default with most (or
+even all) MPI implementations.  For example if you have three GPUs on
+a machine, one of which is used for the X-Server (the GPU with the ID
+1) while the others (with IDs 0 and 2) are used for computations you
+would specify:
+</P>
+<PRE>package cuda gpu/node/special 2 0 2 
+</PRE>
+<P>A main purpose of the <I>gpu/node/special</I> option is to allow two (or
+more) simulations to be run on one workstation.  In that case one
+would set the first simulation to use GPU 0 and the second to use GPU
+1. This is not necessary though, if the GPUs are in what is called
+<I>compute exclusive</I> mode.  Using that setting, every process will get
+its own GPU automatically.  This <I>compute exclusive</I> mode can be set
+as root using the <I>nvidia-smi</I> tool which is part of the CUDA
+installation.
+</P>
+<P>Note that if the <I>gpu/node/special</I> keyword is not used, the USER-CUDA
+package sorts existing GPUs on each node according to their number of
+multiprocessors.  This way, compute GPUs will be prioritized over
+X-Server GPUs.
+</P>
+<P>Use of the <I>timing</I> keyword will output detailed timing information
+for various subroutines.
+</P>
+<P>The <I>test</I> keyword will output info for the specified atom at
+several points during each time step.  This is mainly useful for
+debugging purposes.  Note that the simulation will be severely slowed
+down if this option is used.
+</P>
+<P>The <I>override/bpa</I> keyword can be used to specify which mode is used
+for pair-force evaluation.  TpA = one thread per atom; BpA = one block
+per atom.  If this keyword is not used, a short test at the beginning of
+each run will determine which method is more effective (the result of
+this test is part of the LAMMPS output).  Therefore it is usually not
+necessary to use this setting.
 </P>
 <HR>
 
@@ -153,22 +219,15 @@ its own pages).
 </P>
 <P>The cuda style of this command can only be invoked if LAMMPS was built
 with the USER-CUDA package.  See the <A HREF = "Section_start.html#start_3">Making
-LAMMPS</A> section for more info.  When using
-styles in the USER-CUDA package, use of the "package cuda" command in
-your input script is not required.
+LAMMPS</A> section for more info.
 </P>
 <P>The gpu style of this command can only be invoked if LAMMPS was built
 with the GPU package.  See the <A HREF = "Section_start.html#start_3">Making
-LAMMPS</A> section for more info.  When using
-styles in the GPU package, use of the "package gpu" command in your
-input script is currently required.
+LAMMPS</A> section for more info.
 </P>
 <P>The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the <A HREF = "Section_start.html#start_3">Making
-LAMMPS</A> section for more info.  When using
-styles in the USER-OMP package, use of the "package omp" command in
-your input script is not required.  See the information on default
-settings below.
+LAMMPS</A> section for more info.
 </P>
 <P><B>Related commands:</B>
 </P>
@@ -176,10 +235,20 @@ settings below.
 </P>
 <P><B>Default:</B>
 </P>
-<P>If the "-sf omp" <A HREF = "Section_start.html#start_6">command-line switch</A> is
-used then "package omp *" is also auto-invoked to specify default OMP
-settings.
+<P>If the "-sf gpu" <A HREF = "Section_start.html#start_6">command-line switch</A> is
+used then it is as if the command "package gpu force/neigh 0 0 1" were
+invoked, to specify default settings for the GPU package.  If the
+command-line switch is not used, then no defaults are set, and you
+must specify the appropriate package command in your input script.
 </P>
-<P>The other styles have no defaults.
+<P>The default settings for the USER-CUDA package are "package cuda gpu/node
+2".  This is the case whether the "-sf cuda" <A HREF = "Section_start.html#start_6">command-line
+switch</A> is used or not.
+</P>
+<P>If the "-sf omp" <A HREF = "Section_start.html#start_6">command-line switch</A> is
+used then it is as if the command "package omp *" were invoked, to
+specify default settings for the USER-OMP package.  If the
+command-line switch is not used, then no defaults are set, and you
+must specify the appropriate package command in your input script.
 </P>
 </HTML>
diff --git a/doc/package.txt b/doc/package.txt
index 63e3f2bf08..61bc477f2f 100644
--- a/doc/package.txt
+++ b/doc/package.txt
@@ -19,7 +19,18 @@ args = arguments specific to the style :l
     first = ID of first GPU to be used on each node
     last = ID of last GPU to be used on each node
     split = fraction of particles assigned to the GPU
-  {cuda} args = to be determined
+  {cuda} args = one or more keyword/value pairs may be appended
+    keywords = {gpu/node} or {gpu/node/special} or {timing} or {test} or {override/bpa}
+    gpu/node values = N
+      N = number of GPUs to be used per node
+    gpu/node/special values = N gpu1 .. gpuN
+      N = number of GPUs to be used per node
+      gpu1 .. gpuN = N IDs of the GPUs to use
+    timing values = none
+    test values = id
+      id = atom-ID of a test particle
+    override/bpa values = flag
+      flag = 0 for TpA algorithm, 1 for BpA algorithm 
   {omp} args = Nthreads mode
     Nthreads = # of OpenMP threads to associate with each MPI process
     mode = force or force/neigh (optional) :pre
@@ -39,17 +50,25 @@ package omp 4 force :pre
 This command invokes package-specific settings.  Currently the
 following packages use it: GPU, USER-CUDA, and USER-OMP.
 
+To use the accelerated GPU and USER-OMP styles, the use of the package
+command is required.  However, as described in the "Defaults" section
+below, if you use the "-sf gpu" or "-sf omp" "command-line
+options"_Section_start.html#start_6 to enable use of these styles,
+then default package settings are enabled.  In that case you only need
+to use the package command if you want to change the defaults.
+
+To use the accelerated USER-CUDA styles, the package command is not
+required as defaults are assigned internally.  You only need to use
+the package command if you want to change the defaults.
+
 See "this section"_Section_accelerate.html of the manual for more
-details about using these various packages for accelerating
-a LAMMPS calculation.
+details about using these various packages for accelerating LAMMPS
+calculations.
 
 :line
 
 The {gpu} style invokes options associated with the use of the GPU
-package.  It allows you to select and initialize GPUs to be used for
-acceleration via this package and configure how the GPU acceleration
-is performed.  These settings are required in order to use any style
-with GPU acceleration.
+package. 
 
 The {mode} setting specifies where neighbor list calculations will be
 performed.  If {mode} is force, neighbor list calculation is performed
@@ -96,7 +115,54 @@ the other particles.
 :line
 
 The {cuda} style invokes options associated with the use of the
-USER-CUDA package.  These still need to be documented.
+USER-CUDA package.  
+
+The {gpu/node} setting specifies the number {N} of GPUs to be used on
+each node.  An MPI process with rank {K} will use the GPU (K mod N).
+This implies that processes should be assigned with successive ranks
+on each node, which is the default with most (or even all) MPI
+implementations. The default value for {N} is 2.
+
+The {gpu/node/special} setting also specifies the number (N) of GPUs
+to be used on each node, but allows more control over their
+specification.  An MPI process with rank {K} will use the GPU {gpuI}
+with I = (K mod N) + 1. This implies that processes should be assigned
+with successive ranks on each node, which is the default with most (or
+even all) MPI implementations.  For example if you have three GPUs on
+a machine, one of which is used for the X-Server (the GPU with the ID
+1) while the others (with IDs 0 and 2) are used for computations you
+would specify:
+
+package cuda gpu/node/special 2 0 2 :pre
+
+A main purpose of the {gpu/node/special} option is to allow two (or
+more) simulations to be run on one workstation.  In that case one
+would set the first simulation to use GPU 0 and the second to use GPU
+1. This is not necessary though, if the GPUs are in what is called
+{compute exclusive} mode.  Using that setting, every process will get
+its own GPU automatically.  This {compute exclusive} mode can be set
+as root using the {nvidia-smi} tool which is part of the CUDA
+installation.
+
+Note that if the {gpu/node/special} keyword is not used, the USER-CUDA
+package sorts existing GPUs on each node according to their number of
+multiprocessors.  This way, compute GPUs will be prioritized over
+X-Server GPUs.
+ 
+Use of the {timing} keyword will output detailed timing information
+for various subroutines.
+
+The {test} keyword will output info for the specified atom at
+several points during each time step.  This is mainly useful for
+debugging purposes.  Note that the simulation will be severely slowed
+down if this option is used.
+
+The {override/bpa} keyword can be used to specify which mode is used
+for pair-force evaluation.  TpA = one thread per atom; BpA = one block
+per atom.  If this keyword is not used, a short test at the beginning of
+each run will determine which method is more effective (the result of
+this test is part of the LAMMPS output).  Therefore it is usually not
+necessary to use this setting.
 
 :line
 
@@ -147,22 +213,15 @@ This command cannot be used after the simulation box is defined by a
 
 The cuda style of this command can only be invoked if LAMMPS was built
 with the USER-CUDA package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.  When using
-styles in the USER-CUDA package, use of the "package cuda" command in
-your input script is not required.
+LAMMPS"_Section_start.html#start_3 section for more info.
 
 The gpu style of this command can only be invoked if LAMMPS was built
 with the GPU package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.  When using
-styles in the GPU package, use of the "package gpu" command in your
-input script is currently required.
+LAMMPS"_Section_start.html#start_3 section for more info.
 
 The omp style of this command can only be invoked if LAMMPS was built
 with the USER-OMP package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.  When using
-styles in the USER-OMP package, use of the "package omp" command in
-your input script is not required.  See the information on default
-settings below.
+LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
@@ -170,9 +229,19 @@ settings below.
 
 [Default:]
 
+If the "-sf gpu" "command-line switch"_Section_start.html#start_6 is
+used then it is as if the command "package gpu force/neigh 0 0 1" were
+invoked, to specify default settings for the GPU package.  If the
+command-line switch is not used, then no defaults are set, and you
+must specify the appropriate package command in your input script.
+
+The default settings for the USER-CUDA package are "package cuda gpu/node
+2".  This is the case whether the "-sf cuda" "command-line
+switch"_Section_start.html#start_6 is used or not.
+
 If the "-sf omp" "command-line switch"_Section_start.html#start_6 is
-used then "package omp *" is also auto-invoked to specify default OMP
-settings.
-
-The other styles have no defaults.
+used then it is as if the command "package omp *" were invoked, to
+specify default settings for the USER-OMP package.  If the
+command-line switch is not used, then no defaults are set, and you
+must specify the appropriate package command in your input script.
 
diff --git a/doc/pair_coul.html b/doc/pair_coul.html
index 6b6af23458..9e09518801 100644
--- a/doc/pair_coul.html
+++ b/doc/pair_coul.html
@@ -19,16 +19,17 @@
 </H3>
 <H3>pair_style coul/long command 
 </H3>
-<H3>pair_style coul/long/gpu command 
-</H3>
 <H3>pair_style coul/long/omp command 
 </H3>
+<H3>pair_style coul/long/gpu command 
+</H3>
 <P><B>Syntax:</B>
 </P>
-<P>pair_style coul/cut cutoff
+<PRE>pair_style coul/cut cutoff
 pair_style coul/debye kappa cutoff
 pair_style coul/long cutoff
-</P>
+pair_style coul/long/gpu cutoff 
+</PRE>
 <UL><LI>cutoff = global cutoff for Coulombic interactions
 <LI>kappa = Debye length (inverse distance units) 
 </UL>
diff --git a/doc/pair_coul.txt b/doc/pair_coul.txt
index 6e5e7a54ab..44168cd1c5 100644
--- a/doc/pair_coul.txt
+++ b/doc/pair_coul.txt
@@ -11,14 +11,15 @@ pair_style coul/cut/omp command :h3
 pair_style coul/debye command :h3
 pair_style coul/debye/omp command :h3
 pair_style coul/long command :h3
-pair_style coul/long/gpu command :h3
 pair_style coul/long/omp command :h3
+pair_style coul/long/gpu command :h3
 
 [Syntax:]
 
 pair_style coul/cut cutoff
 pair_style coul/debye kappa cutoff
 pair_style coul/long cutoff
+pair_style coul/long/gpu cutoff :pre
 
 cutoff = global cutoff for Coulombic interactions
 kappa = Debye length (inverse distance units) :ul