diff --git a/doc/src/package.rst b/doc/src/package.rst
index aea4ba657f..842fc8bc1c 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -32,10 +32,12 @@ Syntax
         size = bin size for neighbor list construction (distance units)
       *split* = fraction
         fraction = fraction of atoms assigned to GPU (default = 1.0)
-      *tpa* value = Nthreads
-        Nthreads = # of GPU vector lanes used per atom
+      *tpa* value = Nlanes
+        Nlanes = # of GPU vector lanes (CUDA threads) used per atom
       *blocksize* value = size
         size = thread block size for pair force computation
+      *omp* value = Nthreads
+        Nthreads = number of OpenMP threads to use on CPU (default = 0)
       *platform* value = id
         id = For OpenCL, platform ID for the GPU or accelerator
       *gpuID* values = id
@@ -101,7 +103,7 @@ Syntax
          off = use device acceleration (e.g. GPU) for all available styles in the KOKKOS package (default)
          on = use device acceleration only for pair styles (and host acceleration for others)
   *omp* args = Nthreads keyword value ...
-    Nthread = # of OpenMP threads to associate with each MPI process
+    Nthreads = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
     keywords = *neigh*
       *neigh* value = *yes* or *no*
@@ -116,7 +118,7 @@ Examples
    package gpu 0
    package gpu 1 split 0.75
    package gpu 2 split -1.0
-   package gpu 0 device_type intelgpu
+   package gpu 0 omp 2 device_type intelgpu
    package kokkos neigh half comm device
    package omp 0 neigh no
    package omp 4
@@ -266,10 +268,10 @@ with MPI.
 
 The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations.  With a default value of 1, the number of
-threads will be chosen based on the pair style, however, the value can
+lanes will be chosen based on the pair style; however, the value can
 be set explicitly with this keyword to fine-tune performance.  For
 large cutoffs or with a small number of particles per GPU, increasing
-the value can improve performance. The number of threads per atom must
+the value can improve performance. The number of lanes per atom must
 be a power of 2 and currently cannot be greater than the SIMD width
 for the GPU / accelerator.  In the case it exceeds the SIMD width, it
 will automatically be decreased to meet the restriction.
@@ -282,6 +284,14 @@ individual GPU cores, but reduces the total number of thread blocks,
 thus may lead to load imbalance.  On modern hardware, the sensitivity
 to the blocksize is typically low.
 
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
+
 The *platform* keyword is only used with OpenCL to specify the ID for
 an OpenCL platform. See the output from ocl_get_devices in the
 lib/gpu directory. In LAMMPS only one platform can be active at a
@@ -336,44 +346,13 @@ built with co-processor support.
 
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
-The *omp* keyword determines the number of OpenMP threads allocated
-for each MPI task when any portion of the interactions computed by a
-USER-INTEL pair style are run on the CPU.  This can be the case even
-if LAMMPS was built with co-processor support; see the *balance*
-keyword discussion below.  If you are running with less MPI tasks/node
-than there are CPUs, it can be advantageous to use OpenMP threading on
-the CPUs.
-
-.. note::
-
-   The *omp* keyword has nothing to do with co-processor threads on
-   the Xeon Phi; see the *tpc* and *tptask* keywords below for a
-   discussion of co-processor threads.
-
-The *Nthread* value for the *omp* keyword sets the number of OpenMP
-threads allocated for each MPI task. Setting *Nthread* = 0 (the
-default) instructs LAMMPS to use whatever value is the default for the
-given OpenMP environment. This is usually determined via the
-*OMP_NUM_THREADS* environment variable or the compiler runtime, which
-is usually a value of 1.
-
-For more details, including examples of how to set the OMP_NUM_THREADS
-environment variable, see the discussion of the *Nthreads* setting on
-this doc page for the "package omp" command. Nthreads is a required
-argument for the USER-OMP package. Its meaning is exactly the same
-for the USER-INTEL package.
-
-.. note::
-
-   If you build LAMMPS with both the USER-INTEL and USER-OMP
-   packages, be aware that both packages allow setting of the *Nthreads*
-   value via their package commands, but there is only a single global
-   *Nthreads* value used by OpenMP. Thus if both package commands are
-   invoked, you should insure the two values are consistent. If they are
-   not, the last one invoked will take precedence, for both packages.
-   Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel"
-   command, followed by a "package omp" command, both with a setting of
-   *Nthreads* = 0.
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
 
 The *mode* keyword determines the precision mode to use for computing
 pair style forces, either on the CPU or on the co-processor,
@@ -579,7 +558,7 @@ result in better performance for certain configurations and system
 sizes.
 
 The *omp* style invokes settings associated with the use of the
 USER-OMP package.
 
-The *Nthread* argument sets the number of OpenMP threads allocated for
+The *Nthreads* argument sets the number of OpenMP threads allocated for
 each MPI task.  For example, if your system has nodes with dual
 quad-core processors, it has a total of 8 cores per node.  You could
 use two MPI tasks per node (e.g. using the -ppn option of the mpirun
@@ -588,7 +567,7 @@ This would use all 8 cores on each node.  Note that the product of MPI
 tasks \* threads/task should not exceed the physical number of cores
 (on a node), otherwise performance will suffer.
 
-Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the
+Setting *Nthreads* = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment.  This is usually determined
 via the *OMP_NUM_THREADS* environment variable or the compiler
 runtime.  Note that in most cases the default for OpenMP capable
@@ -619,6 +598,18 @@ input.  Not all features of LAMMPS support OpenMP threading via the
 USER-OMP package and the parallel efficiency can be very different,
 too.
 
+.. note::
+
+   If you build LAMMPS with the GPU, USER-INTEL, and / or USER-OMP
+   packages, be aware these packages all allow setting of the *Nthreads*
+   value via their package commands, but there is only a single global
+   *Nthreads* value used by OpenMP. Thus if multiple package commands are
+   invoked, you should ensure the values are consistent. If they are
+   not, the last one invoked will take precedence, for all packages.
+   Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a
+   "package omp" command, both with a setting of *Nthreads* = 0. Likewise
+   for a hybrid suffix for gpu and omp.
+
 Optional keyword/value pairs can also be specified.  Each has a
 default value as listed below.
 
@@ -665,7 +656,7 @@ Default
 
 For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and platform=-1. These settings are made
+to Ngpu-1, tpa = 1, omp = 0, and platform=-1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch ` is used.  If it is not used, you must invoke the package gpu
 command in your input script or via the "-pk gpu" :doc:`command-line switch `.
diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp
index e2478a64e5..4a68466d05 100644
--- a/lib/gpu/lal_answer.cpp
+++ b/lib/gpu/lal_answer.cpp
@@ -331,11 +331,11 @@ void AnswerT::get_answers(double **f, double **tor) {
   }
   if (_rot) {
     vec3d *torp=reinterpret_cast<vec3d *>(&(tor[0][0]));
-    forcep=reinterpret_cast<vec4d_t *>(&(force[_inum*4]));
+    vec4d_t *torquep=reinterpret_cast<vec4d_t *>(&(force[_inum*4]));
     for (int i=ifrom; i0)
       fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
-    fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
+    fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom);
     fprintf(screen,"Vector width: %d.\n", device->simd_size());
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
     if (nbor->gpu_nbor()==2)
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 5ba9185e6f..a65c3d8810 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -53,14 +53,10 @@ DeviceT::~Device() {
 template <class numtyp, class acctyp>
 int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                          const int first_gpu_id, const int gpu_mode,
-                         const double p_split, const int nthreads,
-                         const int t_per_atom, const double user_cell_size,
-                         char *ocl_args, const int ocl_platform,
-                         char *device_type_flags, const int block_pair) {
-  _nthreads=nthreads;
-  #if (LAL_USE_OMP == 1)
-  omp_set_num_threads(nthreads);
-  #endif
+                         const double p_split, const int t_per_atom,
+                         const double user_cell_size, char *ocl_args,
+                         const int ocl_platform, char *device_type_flags,
+                         const int block_pair) {
   _threads_per_atom=t_per_atom;
   _threads_per_charge=t_per_atom;
   _threads_per_three=t_per_atom;
@@ -583,7 +579,7 @@ void DeviceT::init_message(FILE *screen, const char *name,
     fprintf(screen,"- Using acceleration for %s:\n",name);
     fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
   #if (LAL_USE_OMP == 1)
-    fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
+    fprintf(screen,"- with %d thread(s) per proc.\n", omp_get_max_threads());
   #endif
 #ifdef USE_OPENCL
     fprintf(screen,"- with OpenCL Parameters for: %s (%d)\n",
@@ -803,7 +799,7 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans,
     if (times[5]>0)
       fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
     fprintf(screen,"Average split: %.4f.\n",avg_split);
-    fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
+    fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom);
     fprintf(screen,"Vector width: %d.\n", simd_size());
     fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
     if (nbor.gpu_nbor()==2)
@@ -1031,13 +1027,13 @@ Device global_device;
 using namespace LAMMPS_AL;
 
 int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                     const int first_gpu_id, const int gpu_mode,
-                    const double particle_split, const int nthreads,
-                    const int t_per_atom, const double user_cell_size,
-                    char *opencl_config, const int ocl_platform,
-                    char *device_type_flags, const int block_pair) {
+                    const double particle_split, const int t_per_atom,
+                    const double user_cell_size, char *opencl_config,
+                    const int ocl_platform, char *device_type_flags,
+                    const int block_pair) {
   return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode,
-                                   particle_split,nthreads,t_per_atom,
-                                   user_cell_size,opencl_config,ocl_platform,
+                                   particle_split,t_per_atom,user_cell_size,
+                                   opencl_config,ocl_platform,
                                    device_type_flags,block_pair);
 }
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index bd5b81558c..1db6ae3127 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -49,10 +49,10 @@ class Device {
    * - -11 if config_string has the wrong number of parameters **/
   int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
                   const int first_gpu_id, const int gpu_mode,
-                  const double particle_split, const int nthreads,
-                  const int t_per_atom, const double user_cell_size,
-                  char *config_string, const int ocl_platform,
-                  char *device_type_flags, const int block_pair);
+                  const double particle_split, const int t_per_atom,
+                  const double user_cell_size, char *config_string,
+                  const int ocl_platform, char *device_type_flags,
+                  const int block_pair);
 
   /// Initialize the device for Atom storage
   /** \param charge True if charges need to be stored
@@ -201,8 +201,6 @@ class Device {
   /// Return the number of procs sharing a device (size of device communicator)
   inline int procs_per_gpu() const { return _procs_per_gpu; }
-  /// Return the number of threads per proc
-  inline int num_threads() const { return _nthreads; }
   /// My rank within all processes
   inline int world_me() const { return _world_me; }
   /// Total number of processes
@@ -331,7 +329,7 @@ class Device {
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
-  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
+  int _gpu_mode, _first_device, _last_device, _platform_id;
   double _particle_split;
   double _cpu_full;
   double _ptx_arch;
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index efbaa6e1f8..8297c338a5 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -32,16 +32,18 @@
 #include "citeme.h"
 #include "error.h"
 
+#if (LAL_USE_OMP == 1)
+#include <omp.h>
+#endif
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
 
 enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH};
 
-extern int lmp_init_device(MPI_Comm world, MPI_Comm replica,
-                           const int ngpu, const int first_gpu_id,
-                           const int gpu_mode, const double particle_split,
-                           const int nthreads, const int t_per_atom,
+extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
+                           const int first_gpu_id, const int gpu_mode,
+                           const double particle_split, const int t_per_atom,
                            const double cell_size, char *opencl_args,
                            const int ocl_platform, char *device_type_flags,
                            const int block_pair);
@@ -123,7
+125,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int nthreads = 1; + int nthreads = 0; int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; @@ -167,10 +169,10 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"nthreads") == 0) { + } else if (strcmp(arg[iarg],"omp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + if (nthreads < 0) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -200,6 +202,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); + #else + if (nthreads > 0) { + omp_set_num_threads(nthreads); + comm->nthreads = nthreads; + } #endif // set newton pair flag @@ -227,9 +234,9 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (binsize == 0.0) binsize = -1.0; _binsize = binsize; int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, - _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_args, - ocl_platform, device_type_flags, block_pair); + _gpu_mode, _particle_split, threads_per_atom, + binsize, opencl_args, ocl_platform, + device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); }
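Usage note (editor's sketch, not part of the patch): with these changes applied, the new *omp* keyword should be accepted alongside the other GPU package options, either in the input script or through the corresponding "-pk gpu" command-line switch. A hypothetical input fragment:

.. code-block:: LAMMPS

   # request 2 GPUs per node and 4 OpenMP threads per MPI task;
   # omp = 0 (the default) keeps the thread count chosen by the OpenMP
   # runtime, typically via the OMP_NUM_THREADS environment variable
   package gpu 2 omp 4 split -1.0

The same settings can be selected without editing the input script via "-sf gpu -pk gpu 2 omp 4" on the command line, in the same way the existing GPU package keywords are documented above.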