diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index f3a7a06ceb..9c2aba780b 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -77,6 +77,14 @@ class UCL_Device { /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } + /// Set the default command queue (by default this is the null stream) + /** \param i index of the command queue (as added by push_command_queue()) + If i is 0, the default command queue is set to the null stream **/ + inline void set_command_queue(const int i) { + if (i==0) _cq[0]=0; + else _cq[0]=_cq[i]; + } + /// Block until all commands in the default stream have completed inline void sync() { sync(0); } @@ -127,7 +135,8 @@ class UCL_Device { /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; - else return _properties[i].multiProcessorCount*32; } + else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32; + else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } @@ -205,6 +214,7 @@ inline int UCL_Device::set(int num) { if (_device==num) return UCL_SUCCESS; for (int i=1; i=gpu->num_devices()) return -2; - #ifndef CUDA_PRX + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif @@ -203,7 +203,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _time_device)) + _warp_size, _time_device)) return -3; nbor->cell_size(cell_size); diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 4f022baf71..6a086745c5 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -37,13 +37,16 @@ bool 
Neighbor::init(NeighborShared *shared, const int inum, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, - const int threads_per_atom, const bool time_device) { + const int threads_per_atom, const int warp_size, + const bool time_device) { clear(); _threads_per_atom=threads_per_atom; _block_cell_2d=block_cell_2d; _block_cell_id=block_cell_id; + _max_block_nbor_build=block_nbor_build; _block_nbor_build=block_nbor_build; + _warp_size=warp_size; _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; @@ -418,6 +421,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, for (int i=0; i<_ncells; i++) mn=std::max(mn,host_cell_counts[i]); mn*=8; + set_nbor_block_size(mn/2); + resize_max_neighbors(mn,success); if (!success) return; @@ -497,6 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, mn=host_acc[0]; for (int i=1; i<inum; i++) mn=std::max(mn,host_acc[i]); + set_nbor_block_size(mn); if (mn>_max_nbors) { resize_max_neighbors(mn,success); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index f9abb638c8..a2d2eda560 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -67,7 +67,7 @@ class Neighbor { const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const bool time_device); + const int warp_size, const bool time_device); /// Set the size of the cutoff+skin inline void cell_size(const double size) { _cell_size=size; } @@ -237,12 +237,20 @@ class Neighbor { double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); - int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells; - int _threads_per_atom; - int _total_atoms; + int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; + int _ncells, _threads_per_atom, _total_atoms; template <class numtyp> inline void resize_max_neighbors(const int maxn, bool &success); + + int _warp_size; + 
inline void set_nbor_block_size(const int mn) { + int desired=mn/(2*_warp_size); + desired*=_warp_size; + if (desired<_warp_size) desired=_warp_size; + else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; + _block_nbor_build=desired; + } }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 0d7c7212e9..99990ece67 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -119,6 +119,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, int ix = BLOCK_ID_X; int iy = BLOCK_ID_Y % ncelly; int iz = BLOCK_ID_Y / ncelly; + int bsx = BLOCK_SIZE_X; int icell = ix + iy*ncellx + iz*ncellx*ncelly; @@ -134,9 +135,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, numtyp4 diff; numtyp r2; - int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X); + int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx); for (int ii = 0; ii < cap; ii++) { - int i = icell_begin + tid + ii*BLOCK_SIZE_X; + int i = icell_begin + tid + ii*bsx; int pid_i = nall, pid_j, stride; numtyp4 atom_i, atom_j; int cnt = 0; @@ -173,14 +174,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, int num_atom_cell = jcell_end - jcell_begin; // load jcell to shared memory - int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD); + int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx); for (int k = 0; k < num_iter; k++) { - int end_idx = min(BLOCK_NBOR_BUILD, - num_atom_cell-k*BLOCK_NBOR_BUILD); + int end_idx = min(bsx, num_atom_cell-k*bsx); if (tid < end_idx) { - pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin]; + pid_j = cell_particle_id[tid+k*bsx+jcell_begin]; cell_list_sh[tid] = pid_j; atom_j = fetch_pos(pid_j,x_); //[pid_j]; pos_sh[tid].x = atom_j.x; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index a5f2f6eedf..184664cdd0 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -100,6 +100,8 @@ #else +#if (ARCH < 300) + #define THREADS_PER_ATOM 4 
#define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 @@ -107,6 +109,17 @@ #define BLOCK_BIO_PAIR 128 #define MAX_SHARED_TYPES 11 +#else + +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_PAIR 512 +#define BLOCK_BIO_PAIR 512 +#define MAX_SHARED_TYPES 11 + +#endif + #endif #define WARP_SIZE 32