diff --git a/bench/FERMI/README b/bench/FERMI/README
index 5244aa8916..247cbf9f43 100644
--- a/bench/FERMI/README
+++ b/bench/FERMI/README
@@ -16,38 +16,45 @@
 lmp_linux_double
 The precision (single, mixed, double) refers to the GPU and USER-CUDA
 pacakge precision.  See the README files in the lib/gpu and lib/cuda
 directories for instructions on how to build the packages with
-different precisions.  The doc/Section_accelerate.html file also has a
-summary description.
+different precisions.  The GPU and USER-CUDA sub-sections of the
+doc/Section_accelerate.html file also describe this process.
 
 ------------------------------------------------------------------------
 
-If the script has "cpu" in its name, it is meant to be run in CPU-only
-mode (without using the GPU or USER-CUDA styles).  For example:
+To run on just CPUs (without using the GPU or USER-CUDA styles),
+do something like the following:
 
-mpirun -np 1 ../lmp_linux_double -v x 8 -v y 8 -v z 8 -v t 100 < in.lj.cpu
-mpirun -np 12 ../lmp_linux_double -v x 16 -v y 16 -v z 16 -v t 100 < in.lj.cpu
+mpirun -np 1 lmp_linux_double -v x 8 -v y 8 -v z 8 -v t 100 < in.lj
+mpirun -np 12 lmp_linux_double -v x 16 -v y 16 -v z 16 -v t 100 < in.lj
 
 The "xyz" settings determine the problem size.  The "t" setting
 determines the number of timesteps.
 
+These mpirun commands run on a single node.  To run on multiple
+nodes, scale up the "-np" setting.
+
 ------------------------------------------------------------------------
 
-If the script has "gpu" in its name, it is meant to be run using
-the GPU package.  For example:
+To run with the GPU package, do something like the following:
 
-mpirun -np 12 ../lmp_linux_single -sf gpu -v g 1 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj.gpu
-
-mpirun -np 8 ../lmp_linux_mixed -sf gpu -v g 2 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj.gpu
+mpirun -np 12 lmp_linux_single -sf gpu -pk gpu 1 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj
+mpirun -np 8 lmp_linux_mixed -sf gpu -pk gpu 2 -v x 32 -v y 32 -v z 64 -v t 100 < in.lj
 
 The "xyz" settings determine the problem size.  The "t" setting
 determines the number of timesteps.  The "np" setting determines how
-many MPI tasks per compute node the problem will run on, and the "g"
-setting determines how many GPUs per compute node the problem will run
-on, i.e. 1 or 2 in this case.  Note that you can use more MPI tasks
-than GPUs (both per compute node) with the GPU package.
+many MPI tasks (per node) the problem will run on.  The numeric
+argument to the "-pk" setting is the number of GPUs (per node).  Note
+that you can use more MPI tasks than GPUs (per node) with the GPU
+package.
+
+These mpirun commands run on a single node.  To run on multiple
+nodes, scale up the "-np" setting, and control the number of
+MPI tasks per node via a "-ppn" setting.
 
 ------------------------------------------------------------------------
 
+To run with the USER-CUDA package, do something like the following:
+
 If the script has "cuda" in its name, it is meant to be run using
 the USER-CUDA package.  For example:
 
@@ -62,7 +69,10 @@
 setting determines how many GPUs per compute node the problem will run
 on, i.e. 1 or 2 in this case.  For the USER-CUDA package, the number of
 MPI tasks and GPUs (both per compute node) must be equal.
 
+These mpirun commands run on a single node.  To run on multiple
+nodes, scale up the "-np" setting.
+
 ------------------------------------------------------------------------
-If the script has "titan" in its name, it was run on the Titan supercomputer
-at ORNL.
+If the script has "titan" in its name, it was run on the Titan +supercomputer at ORNL. diff --git a/bench/FERMI/in.lj.cpu b/bench/FERMI/in.lj.cpu deleted file mode 100644 index ab6988e286..0000000000 --- a/bench/FERMI/in.lj.cpu +++ /dev/null @@ -1,22 +0,0 @@ -# 3d Lennard-Jones melt - -units lj -atom_style atomic - -lattice fcc 0.8442 -region box block 0 $x 0 $y 0 $z -create_box 1 box -create_atoms 1 box -mass 1 1.0 - -velocity all create 1.44 87287 loop geom - -pair_style lj/cut 2.5 -pair_coeff 1 1 1.0 1.0 2.5 - -neighbor 0.3 bin -neigh_modify delay 0 every 20 check no - -fix 1 all nve - -run $t diff --git a/bench/FERMI/in.lj.cuda b/bench/FERMI/in.lj.cuda deleted file mode 100644 index d6aa533ee0..0000000000 --- a/bench/FERMI/in.lj.cuda +++ /dev/null @@ -1,27 +0,0 @@ -# 3d Lennard-Jones melt - -# set variable g = 1/2 for 1/2 GPUs - -if "$g == 1" then "package cuda gpu/node 1" -if "$g == 2" then "package cuda gpu/node 2" - -units lj -atom_style atomic - -lattice fcc 0.8442 -region box block 0 $x 0 $y 0 $z -create_box 1 box -create_atoms 1 box -mass 1 1.0 - -velocity all create 1.44 87287 loop geom - -pair_style lj/cut 2.5 -pair_coeff 1 1 1.0 1.0 2.5 - -neighbor 0.3 bin -neigh_modify delay 0 every 20 check no - -fix 1 all nve - -run $t diff --git a/bench/FERMI/in.lj.gpu b/bench/FERMI/in.lj.gpu deleted file mode 100644 index 813ea36f75..0000000000 --- a/bench/FERMI/in.lj.gpu +++ /dev/null @@ -1,29 +0,0 @@ -# 3d Lennard-Jones melt - -# newton off is required for GPU package -# set variable g = 1/2 for 1/2 GPUs - -newton off -if "$g == 1" then "package gpu force/neigh 0 0 1" -if "$g == 2" then "package gpu force/neigh 0 1 1" - -units lj -atom_style atomic - -lattice fcc 0.8442 -region box block 0 $x 0 $y 0 $z -create_box 1 box -create_atoms 1 box -mass 1 1.0 - -velocity all create 1.44 87287 loop geom - -pair_style lj/cut 2.5 -pair_coeff 1 1 1.0 1.0 2.5 - -neighbor 0.3 bin -neigh_modify delay 0 every 20 check no - -fix 1 all nve - -run $t