diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp
index e59dae1a6f..3d47df8a92 100644
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@@ -70,6 +70,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
   if (success!=0)
     return success;
 
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp
index c6341f7d57..94e7502c55 100644
--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@@ -71,6 +71,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
   if (success!=0)
     return success;
 
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp
index 478f0092c7..b2a41f10cf 100644
--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@@ -72,6 +72,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
   if (success!=0)
     return success;
 
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp
index 941f463b14..3f71c820c7 100644
--- a/lib/gpu/lal_base_dpd.cpp
+++ b/lib/gpu/lal_base_dpd.cpp
@@ -71,6 +71,12 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
   if (success!=0)
     return success;
 
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp
index aa77a48c66..ba28d697cc 100644
--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@@ -84,6 +84,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
   if (success!=0)
     return success;
 
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
+  if (success!=0)
+    return success;
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 9397f3c6c5..411e19a78a 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -301,16 +301,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!ans.init(ef_nlocal,charge,rot,*gpu))
     return -3;
 
-  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
-                  *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
-                  _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _warp_size, _time_device, compile_string()))
-    return -3;
-  if (_cell_size<0.0)
-    nbor->cell_size(cell_size,cell_size);
-  else
-    nbor->cell_size(_cell_size,cell_size);
-
   _init_count++;
   return 0;
 }
@@ -338,6 +328,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
   return 0;
 }
 
+template <class numtyp, class acctyp> // Initializes *nbor for the calling pair style; returns 0, or -3 if nbor->init() fails
+int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
+                  const int host_nlocal, const int nall, // nall is not used in this function
+                  const int maxspecial, const int gpu_host,
+                  const int max_nbors, const double cell_size,
+                  const bool pre_cut, const int threads_per_atom) {
+  int ef_nlocal=nlocal; // local atom count handed to nbor->init()
+  if (_particle_split<1.0 && _particle_split>0.0)
+    ef_nlocal=static_cast<int>(_particle_split*nlocal); // scaled (and truncated) when 0 < _particle_split < 1
+  // Map _gpu_mode onto the gpu_nbor code passed to nbor->init(); it stays 0 for any other mode
+  int gpu_nbor=0;
+  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+  #ifndef USE_CUDPP
+  if (gpu_nbor==1) // when USE_CUDPP is not defined, code 1 is replaced by code 2
+    gpu_nbor=2;
+  #endif
+  // A failed nbor->init() is reported to the caller as -3
+  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
+                  *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
+                  _block_cell_id, _block_nbor_build, threads_per_atom,
+                  _warp_size, _time_device, compile_string()))
+    return -3;
+  if (_cell_size<0.0) // negative _cell_size: pass the caller's cell_size as both arguments
+    nbor->cell_size(cell_size,cell_size);
+  else // otherwise _cell_size is the first argument and the caller's cell_size the second
+    nbor->cell_size(_cell_size,cell_size);
+  // Reaching this point means nbor->init() succeeded
+  return 0;
+}
+
 template <class numtyp, class acctyp>
 void DeviceT::set_single_precompute
                      (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 695b0a62f9..68d88a3182 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -91,6 +91,13 @@ class Device {
     * - -5 Double precision is not supported on card **/
   int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
 
+  /// Initialize the neighbor list build for nbor -- called back from the pair base classes' init; returns 0 on success, -3 if nbor->init() fails
+  int init_nbor(Neighbor *nbor, const int nlocal,
+                  const int host_nlocal, const int nall, // nall is not used by the implementation
+                  const int maxspecial, const int gpu_host,
+                  const int max_nbors, const double cell_size, // cell_size is forwarded to nbor->cell_size()
+                  const bool pre_cut, const int threads_per_atom);
+
   /// Output a message for pair_style acceleration with device stats
   void init_message(FILE *screen, const char *name,
                     const int first_gpu, const int last_gpu);
diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 0d5b4334c9..7be87939fe 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -30,7 +30,6 @@
 #include "neighbor.h"
 #include "citeme.h"
 #include "error.h"
-#include "utils.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -219,17 +218,6 @@ void FixGPU::init()
     error->all(FLERR,"GPU package does not (yet) work with "
                "atom_style template");
 
-  // hybrid cannot be used with force/neigh option
-
-  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
-
-  if (_particle_split < 0)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"GPU split param must be positive "
-                 "for hybrid pair styles");
-
   // neighbor list builds on the GPU with triclinic box is not yet supported
 
   if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&
@@ -243,16 +231,21 @@ void FixGPU::init()
 
   // make sure fdotr virial is not accumulated multiple times
 
-  if (force->pair_match("^hybrid",0) != NULL) {
+  if (force->pair_match("hybrid",1) != NULL) {
     PairHybrid *hybrid = (PairHybrid *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
-      if (!utils::strmatch(hybrid->keywords[i],"/gpu$"))
+      if (strstr(hybrid->keywords[i],"/gpu")==NULL)
+        force->pair->no_virial_fdotr_compute = 1;
+  } else if (force->pair_match("hybrid/overlay",1) != NULL) {
+    PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i],"/gpu")==NULL)
         force->pair->no_virial_fdotr_compute = 1;
   }
 
   // rRESPA support
 
-  if (utils::strmatch(update->integrate_style,"^respa"))
+  if (strstr(update->integrate_style,"respa"))
     _nlevels_respa = ((Respa *) update->integrate)->nlevels;
 }
 
@@ -283,7 +276,7 @@ void FixGPU::min_setup(int vflag)
 
 /* ---------------------------------------------------------------------- */
 
-void FixGPU::post_force(int /* vflag */)
+void FixGPU::post_force(int vflag)
 {
   if (!force->pair) return;
 
@@ -315,7 +308,7 @@ void FixGPU::min_post_force(int vflag)
 
 /* ---------------------------------------------------------------------- */
 
-void FixGPU::post_force_respa(int vflag, int /* ilevel */, int /* iloop */)
+void FixGPU::post_force_respa(int vflag, int ilevel, int iloop) // ilevel and iloop are not used; the body only forwards vflag to post_force()
 {
   post_force(vflag);
 }