diff --git a/lib/gpu/Makefile.nvidia b/lib/gpu/Makefile.nvidia
index 7efb430e5d..a1defe2254 100644
--- a/lib/gpu/Makefile.nvidia
+++ b/lib/gpu/Makefile.nvidia
@@ -16,7 +16,7 @@
 BIN_DIR = .
 OBJ_DIR = .
 AR = ar
-CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -DDEBUG -Xptxas -v --use_fast_math
+CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
 CUDA_ARCH = -maxrregcount 128 #-arch=sm_13
 CUDA_PREC = -D_SINGLE_SINGLE
 CUDA_LINK = -L/usr/local/cuda/lib64 -lcudart $(CUDA_LIB)
diff --git a/lib/gpu/README b/lib/gpu/README
index ed95c3eeab..62af16e4cb 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -69,6 +69,9 @@ the CUDA_PREC variable:
 NOTE: Double precision is only supported on certain GPUS (with
       compute capability>=1.3).
 
+NOTE: For Tesla and other graphics cards with compute capability>=1.3,
+      make sure that -arch=sm_13 is set on the CUDA_ARCH line.
+
 NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
       package has been installed before installing the GPU package in
       LAMMPS.
diff --git a/lib/gpu/gb_gpu.cu b/lib/gpu/gb_gpu.cu
index 260edd9e6f..9bfb53a497 100644
--- a/lib/gpu/gb_gpu.cu
+++ b/lib/gpu/gb_gpu.cu
@@ -203,17 +203,19 @@ string gb_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
-                  const double upsilon, const double mu, double **shape,
-                  double **well, double **cutsq, double **sigma,
-                  double **epsilon, double *host_lshape, int **form,
-                  double **host_lj1, double **host_lj2, double **host_lj3,
-                  double **host_lj4, double **offset, double *special_lj,
-                  const int max_nbors, const int thread, const int gpu_id) {
+bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
+                 const double upsilon, const double mu, double **shape,
+                 double **well, double **cutsq, double **sigma,
+                 double **epsilon, double *host_lshape, int **form,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int max_nbors, const int thread, const int gpu_id) {
   assert(thread
-void _gb_gpu_nbors(gbmtyp &gbm, const int num_ij, const bool eflag) {
+void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
+                   const bool eflag) {
   gbm.nbor.time_nbor.add_to_total();
   // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+
+  memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   gbm.nbor.time_nbor.start();
   gbm.nbor.add(num_ij,gbm.pair_stream);
   gbm.nbor.time_nbor.stop();
 }
 
-void gb_gpu_nbors(const int num_ij, const bool eflag, const int thread) {
-  _gb_gpu_nbors(GBMF[thread],num_ij,eflag);
+void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
+                  const int thread) {
+  _gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
 }
 
 // ---------------------------------------------------------------------------
@@ -475,7 +480,7 @@ double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
     gbm.time_gayberne2.add_to_total();
     gbm.time_pair.add_to_total();
   }
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
   evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   gbm.atom.add_forces(ilist,f);
diff --git a/lib/gpu/gb_gpu_memory.cu b/lib/gpu/gb_gpu_memory.cu
index aab5ea55a5..de24284751 100644
--- a/lib/gpu/gb_gpu_memory.cu
+++ b/lib/gpu/gb_gpu_memory.cu
@@ -37,7 +37,7 @@ GB_GPU_MemoryT::~GB_GPU_Memory() {
 }
 
 template
-int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           const double gamma, const double upsilon,
                           const double mu, double **host_shape,
                           double **host_well, double **host_cutsq,
@@ -50,9 +50,11 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   if (this->allocated)
     clear();
 
-  LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
-                       host_lj1, host_lj2, host_lj3, host_lj4, host_offset,
-                       host_special_lj, max_nbors, me);
+  bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
+                              host_lj1, host_lj2, host_lj3, host_lj4,
+                              host_offset, host_special_lj, max_nbors, me);
+  if (!p)
+    return false;
 
   host_form=h_form;
 
@@ -100,7 +102,7 @@ int* GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
   // Memory for ilist ordered by particle type
   host_olist.safe_alloc_rw(this->max_atoms);
 
-  return this->nbor.host_ij.begin();
+  return true;
 }
 
 template
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index 4d607ed14e..eb9fb92a1c 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -35,7 +35,7 @@ class GB_GPU_Memory : public LJ_GPU_Memory {
   GB_GPU_Memory();
   ~GB_GPU_Memory();
 
-  int* init(const int ij_size, const int ntypes, const double gamma,
+  bool init(const int ij_size, const int ntypes, const double gamma,
             const double upsilon, const double mu, double **host_shape,
             double **host_well, double **host_cutsq, double **host_sigma,
             double **host_epsilon, double *host_lshape, int **h_form,
diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index 94d826e149..ffc67eb068 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -63,12 +63,13 @@ string lj_gpu_name(const int id, const int max_nbors) {
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int * lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
-                  double **epsilon, double **host_lj1, double **host_lj2,
-                  double **host_lj3, double **host_lj4, double **offset,
-                  double *special_lj, const int max_nbors, const int gpu_id) {
+bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
+                 double **epsilon, double **host_lj1, double **host_lj2,
+                 double **host_lj3, double **host_lj4, double **offset,
+                 double *special_lj, const int max_nbors, const int gpu_id) {
+  LJMF.gpu.init();
   if (LJMF.gpu.num_devices()==0)
-    return 0;
+    return false;
 
   ij_size=IJ_SIZE;
   return LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
@@ -142,17 +143,19 @@ bool lj_gpu_reset_nbors(const int nall, const int inum, int *ilist,
 // forces, and torques for those interactions
 // ---------------------------------------------------------------------------
 template
-void _lj_gpu_nbors(LJMTyp &ljm, const int num_ij) {
+void _lj_gpu_nbors(LJMTyp &ljm, const int *ij, const int num_ij) {
   ljm.nbor.time_nbor.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // Not if timed
+
+  memcpy(ljm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
   ljm.nbor.time_nbor.start();
   ljm.nbor.add(num_ij,ljm.pair_stream);
   ljm.nbor.time_nbor.stop();
 }
 
-void lj_gpu_nbors(const int num_ij) {
-  _lj_gpu_nbors(LJMF,num_ij);
+void lj_gpu_nbors(const int *ij, const int num_ij) {
+  _lj_gpu_nbors(LJMF,ij,num_ij);
 }
 
 // ---------------------------------------------------------------------------
@@ -201,7 +204,7 @@ double _lj_gpu_forces(LJMT &ljm, double **f, const int *ilist,
   ljm.atom.time_atom.add_to_total();
   ljm.nbor.time_nbor.add_to_total();
   ljm.time_pair.add_to_total();
-  // CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream)); // not if timed
+  CUDA_SAFE_CALL(cudaStreamSynchronize(ljm.pair_stream));
   evdw=ljm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial);
   ljm.atom.add_forces(ilist,f);
diff --git a/lib/gpu/lj_gpu_memory.cu b/lib/gpu/lj_gpu_memory.cu
index 5e8bf4cdc6..a5000a7eaf 100644
--- a/lib/gpu/lj_gpu_memory.cu
+++ b/lib/gpu/lj_gpu_memory.cu
@@ -39,7 +39,7 @@ int LJ_GPU_MemoryT::get_max_atoms(const size_t gpu_bytes, const int max_nbors) {
 }
 
 template
-int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
+bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
                           double **host_cutsq, double **host_sigma,
                           double **host_epsilon, double **host_lj1,
                           double **host_lj2, double **host_lj3,
@@ -50,10 +50,10 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   clear();
 
   if (me>=gpu.num_devices())
-    return 0;
+    return false;
   gpu.set(me);
   if (gpu.revision()<1.0)
-    return 0;
+    return false;
 
   // Initialize timers for the selected GPU
   time_pair.init();
@@ -114,8 +114,7 @@ int* LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
   dev_error.zero();
 
   allocated=true;
-
-  return nbor.host_ij.begin();
+  return true;
 }
 
 template
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index 82a87ea6a8..9052ad4956 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -40,7 +40,7 @@ class LJ_GPU_Memory {
   ~LJ_GPU_Memory() { clear(); }
 
   /// Allocate memory on host and device
-  int* init(const int ij_size, const int ntypes, double **host_cutsq,
+  bool init(const int ij_size, const int ntypes, double **host_cutsq,
             double **host_sigma, double **host_epsilon, double **host_lj1,
             double **host_lj2, double **host_lj3, double **host_lj4,
             double **host_offset, double *host_special_lj,
diff --git a/lib/gpu/nvc_device.cu b/lib/gpu/nvc_device.cu
index 01a2db1bd7..2f25d61af2 100644
--- a/lib/gpu/nvc_device.cu
+++ b/lib/gpu/nvc_device.cu
@@ -28,7 +28,9 @@
 #include "nvc_device.h"
 
 // Grabs the properties for all devices
-NVCDevice::NVCDevice() {
+void NVCDevice::init() {
+  _properties.clear();
+
   CUDA_SAFE_CALL(cudaGetDeviceCount(&_num_devices));
   for (int dev=0; dev<_num_devices; ++dev) {
     cudaDeviceProp deviceProp;
diff --git a/lib/gpu/nvc_device.h b/lib/gpu/nvc_device.h
index 1ba5f2bc4c..4286c6022f 100644
--- a/lib/gpu/nvc_device.h
+++ b/lib/gpu/nvc_device.h
@@ -33,11 +33,16 @@ using namespace std;
 /// Class for looking at device properties
 /** \note Calls to change the device outside of the class results in incorrect
   *       behavior
-  * \note There is no error checking for indexing past the number of devices **/
+  * \note There is no error checking for indexing past the number of devices
+  * \note init() at least once before using any of the routines **/
 class NVCDevice {
  public:
   /// Grabs the properties for all devices
-  NVCDevice();
+  /** \note init() must be called following construction before any routines **/
+  NVCDevice() {}
+
+  /// Collect properties for every GPU on the node and set active GPU to ID 0
+  void init();
 
   /// Return the number of devices that support CUDA
   inline int num_devices() { return _properties.size(); }
diff --git a/lib/gpu/nvc_get_devices.cu b/lib/gpu/nvc_get_devices.cu
index a1d35232e6..b85626f3f7 100644
--- a/lib/gpu/nvc_get_devices.cu
+++ b/lib/gpu/nvc_get_devices.cu
@@ -25,6 +25,7 @@
 int main(int argc, char** argv) {
   NVCDevice gpu;
+  gpu.init();
   gpu.print_all(cout);
   return 0;
 }
diff --git a/lib/gpu/nvc_macros.h b/lib/gpu/nvc_macros.h
index 56bb5c64ce..4aaa167af5 100644
--- a/lib/gpu/nvc_macros.h
+++ b/lib/gpu/nvc_macros.h
@@ -20,7 +20,7 @@ static __inline__ __device__ numbr cuda_zero() { return 0.0; }
 template <>
 static __inline__ __device__ float cuda_zero() { return 0.0f; }
 
-#ifdef DEBUG
+#ifndef NO_DEBUG
 
 # define CU_SAFE_CALL_NO_SYNC( call ) do {                                   \
     CUresult err = call;                                                     \
diff --git a/lib/gpu/nvc_memory.h b/lib/gpu/nvc_memory.h
index 0ff5229c05..5aaeffd2f0 100644
--- a/lib/gpu/nvc_memory.h
+++ b/lib/gpu/nvc_memory.h
@@ -156,25 +156,26 @@ class NVC_Host {
   /// Asynchronous copy from device (numel is not bytes)
   inline void copy_from_device(const numtyp *device_p, size_t numel,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
-                                   cudaMemcpyDeviceToHost,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,device_p,numel*sizeof(numtyp),
+                                           cudaMemcpyDeviceToHost,stream));
   }
 
   /// Asynchronous copy to device (numel is not bytes)
   inline void copy_to_device(numtyp *device_p, size_t numel,
                              cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
-                                   cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(device_p,_array,numel*sizeof(numtyp),
+                                           cudaMemcpyHostToDevice,stream));
   }
 
   /// Asynchronous copy to 2D matrix on device (numel is not bytes)
   inline void copy_to_2Ddevice(numtyp *device_p, const size_t dev_row_size,
                                const size_t rows, const size_t cols,
                                cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(device_p,dev_row_size*sizeof(numtyp),
-                                     _array,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp),rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(device_p,
+                                             dev_row_size*sizeof(numtyp),
+                                             _array,cols*sizeof(numtyp),
+                                             cols*sizeof(numtyp),rows,
+                                             cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -226,8 +227,8 @@ class NVC_Vec {
   /// Asynchronous copy from host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,row_bytes(),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,row_bytes(),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Copy to host
   inline void copy_to_host(numtyp *host_p)
@@ -328,17 +329,17 @@ class NVC_Mat {
   /// Asynchronous copy from host (elements not bytes)
   inline void copy_from_host(const numtyp *host_p, const size_t numel,
                              cudaStream_t &stream)
-    { CUDA_SAFE_CALL(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
-                                     cudaMemcpyHostToDevice, stream)); }
+    { CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyAsync(_array,host_p,numel*sizeof(numtyp),
+                                             cudaMemcpyHostToDevice, stream)); }
 
   /// Asynchronous Copy from Host
   /** \note Used when the number of columns/rows allocated on host smaller than
     *       on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DAsync(_array, _pitch, host_p,cols*sizeof(numtyp),
-                                     cols*sizeof(numtyp), rows,
-                                     cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DAsync(_array, _pitch, host_p,
+                                 cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                                 cudaMemcpyHostToDevice,stream));
   }
 
  private:
@@ -416,9 +417,10 @@ class NVC_ConstMat {
   /// Asynchronous Copy from Host
   inline void copy_from_host(const numtyp *host_p, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
-                                          numel()*sizeof(numtyp),
-                                          cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpyToArrayAsync(_array, 0, 0, host_p,
+                                                  numel()*sizeof(numtyp),
+                                                  cudaMemcpyHostToDevice,
+                                                  stream));
   }
 
   /// Asynchronous Copy from Host
@@ -426,9 +428,9 @@ class NVC_ConstMat {
     *       on device **/
   inline void copy_2Dfrom_host(const numtyp *host_p, const size_t rows,
                                const size_t cols, cudaStream_t &stream) {
-    CUDA_SAFE_CALL(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
-                      cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
-                      cudaMemcpyHostToDevice,stream));
+    CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy2DToArrayAsync(_array, 0, 0, host_p,
+                      cols*sizeof(numtyp), cols*sizeof(numtyp), rows,
+                      cudaMemcpyHostToDevice,stream));
  }
 
   /// Cast buffer to numtyp in host_write and copy to array
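What the patch means for callers of the library: lj_gpu_init() and gb_gpu_init() now report
success with a bool instead of returning a pointer into the library's pinned host_ij buffer,
lj_gpu_nbors() and gb_gpu_nbors() take the caller's ij array and copy it into that pinned
buffer themselves (the new memcpy in _lj_gpu_nbors/_gb_gpu_nbors), and an NVCDevice must
have init() called after construction, as nvc_get_devices.cu now does. Below is a minimal
sketch of a hypothetical caller written against the post-patch prototypes from lj_gpu.cu;
the coefficient tables, the chunked neighbor loop, and gpu_setup_and_push() itself are
invented for illustration and are not part of the patch or of LAMMPS.

#include <algorithm>
#include <cstddef>
#include <vector>

// Post-patch prototypes, mirroring lib/gpu/lj_gpu.cu.
bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq, double **sigma,
                 double **epsilon, double **host_lj1, double **host_lj2,
                 double **host_lj3, double **host_lj4, double **offset,
                 double *special_lj, const int max_nbors, const int gpu_id);
void lj_gpu_nbors(const int *ij, const int num_ij);

// coeff[0..7] = cutsq, sigma, epsilon, lj1, lj2, lj3, lj4, offset
// (each an ntypes x ntypes table); ij_list holds packed neighbor indices.
bool gpu_setup_and_push(double **coeff[8], double *special_lj, int ntypes,
                        int max_nbors, int gpu_id,
                        const std::vector<int> &ij_list) {
  int ij_size = 0;  // set by lj_gpu_init: max ij entries per transfer
  // Before the patch this call returned int* (0 on failure); now a plain
  // bool says whether a usable CUDA device was found and initialized.
  if (!lj_gpu_init(ij_size, ntypes, coeff[0], coeff[1], coeff[2], coeff[3],
                   coeff[4], coeff[5], coeff[6], coeff[7], special_lj,
                   max_nbors, gpu_id))
    return false;   // caller would fall back to the CPU pair style

  // The ij data now stays in a caller-owned buffer; each lj_gpu_nbors call
  // memcpys a chunk into the library's pinned host_ij storage and hands it
  // to nbor.add() on the pair stream.
  for (std::size_t i = 0; i < ij_list.size(); i += ij_size)
    lj_gpu_nbors(ij_list.data() + i,
                 static_cast<int>(std::min<std::size_t>(ij_size,
                                                        ij_list.size() - i)));
  return true;
}

The gayberne path is driven the same way; gb_gpu_nbors() simply gains the same leading
const int *ij argument ahead of its existing eflag and thread parameters.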