From d30ba2d7eb6469558a6fc80bac91f8b2ecb8f35a Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Fri, 2 Mar 2012 15:57:05 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 lib/gpu/geryon/VERSION.txt  |   3 +-
 lib/gpu/geryon/nvd_device.h |   9 ++
 lib/gpu/geryon/nvd_kernel.h |  11 ++-
 lib/gpu/geryon/nvd_memory.h |  84 ++++++++++---------
 lib/gpu/lal_device.cpp      |  13 ++-
 lib/gpu/lal_eam.cpp         |   6 +-
 lib/gpu/lal_neighbor.cpp    | 160 +++++++++++++++++++++---------------
 lib/gpu/lal_neighbor.h      |  27 ++++--
 lib/gpu/lal_neighbor_gpu.cu |   2 +-
 9 files changed, 193 insertions(+), 122 deletions(-)

diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt
index d260cab24e..313907d611 100644
--- a/lib/gpu/geryon/VERSION.txt
+++ b/lib/gpu/geryon/VERSION.txt
@@ -1,2 +1 @@
-Geryon Version 11.094
- 
+Geryon Version 12.034
diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h
index 9e98de408c..8040cd70c7 100644
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@@ -117,6 +117,14 @@ class UCL_Device {
     _cq.pop_back();
   }
   
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue()) 
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
+  
   /// Get the current CUDA device name
   inline std::string name() { return name(_device); }
   /// Get the CUDA device name
@@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) {
   if (_device>-1) {
     CU_SAFE_CALL_NS(cuCtxDestroy(_context));
     for (int i=1; i<num_queues(); i++) pop_command_queue();
+    _cq[0]=0;
   }
   _device=_properties[num].device_id;
   CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h
index 48cddb7188..b72c89c51f 100644
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@@ -34,11 +34,11 @@ class UCL_Texture;
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
  public:
-  inline UCL_Program(UCL_Device &device) {}
+  inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
   inline ~UCL_Program() {}
 
   /// Initialize the program with a device
-  inline void init(UCL_Device &device) { }
+  inline void init(UCL_Device &device) { _cq=device.cq(); }
 
   /// Clear any data associated with program
   /** \note Must call init() after each clear **/
@@ -130,6 +130,7 @@ class UCL_Program {
   friend class UCL_Kernel;
  private:
   CUmodule _module;
+  CUstream _cq;
   friend class UCL_Texture;
 };
 
@@ -141,7 +142,7 @@ class UCL_Kernel {
   
   UCL_Kernel(UCL_Program &program, const char *function) : 
     _dimensions(1), _num_args(0), _param_size(0) 
-    { _num_blocks[0]=0; set_function(program,function); }
+    { _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
   
   ~UCL_Kernel() {}
 
@@ -160,6 +161,7 @@ class UCL_Kernel {
       #endif
       return UCL_FUNCTION_NOT_FOUND;
     }
+    _cq=program._cq;
     return UCL_SUCCESS;
   }
 
@@ -229,7 +231,7 @@ class UCL_Kernel {
   /// Run the kernel in the default command queue
   inline void run() {
     CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
+    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
   }
   
   /// Run the kernel in the specified command queue
@@ -245,6 +247,7 @@ class UCL_Kernel {
 
  private:
   CUfunction _kernel;
+  CUstream _cq;
   unsigned _dimensions;
   unsigned _num_blocks[2];
   unsigned _num_args;
diff --git a/lib/gpu/geryon/nvd_memory.h b/lib/gpu/geryon/nvd_memory.h
index 2023e8586f..dc70aa1b3c 100644
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@@ -57,6 +57,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
     *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
   return UCL_SUCCESS;
 }
 
@@ -72,6 +73,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
     *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
   if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
     return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
   return UCL_SUCCESS;
 }
 
@@ -92,6 +94,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
   CUresult err=cuMemAlloc(&mat.cbegin(),n);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
   return UCL_SUCCESS;
 }
 
@@ -101,6 +104,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
   CUresult err=cuMemAlloc(&mat.cbegin(),n);
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
   return UCL_SUCCESS;
 }
 
@@ -115,6 +119,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
   pitch=static_cast<size_t>(upitch);                               
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
   return UCL_SUCCESS;
 }    
 
@@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
   pitch=static_cast<size_t>(upitch);                               
   if (err!=CUDA_SUCCESS)
     return UCL_MEMORY_ERROR;
+  mat.cq()=d.cq();
   return UCL_SUCCESS;
 }    
 
@@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcDevice=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcDevice=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstArray=dst.cbegin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstDevice=dst.cbegin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstDevice=dst.cbegin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcArray=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcDevice=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcDevice=src.cbegin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstDevice=dst.cbegin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstDevice=dst.cbegin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> {
                             const size_t rows) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> {
                             const size_t rows, CUstream &cq) {
     CUDA_MEMCPY2D ins;
     _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
     ins.dstHost=dst.begin();
     ins.srcHost=src.begin();
     CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@@ -529,7 +535,7 @@ template <int mem1, int mem2> struct _ucl_memcpy {
   template <class p1, class p2>
   static inline void mc(p1 &dst, const p2 &src, const size_t n,
                         CUstream &cq) {
-    CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
+    CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
   }
   template <class p1, class p2>
       static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
@@ -546,8 +552,8 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     } else {                                       
       CUDA_MEMCPY2D ins;
       _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
       ins.dstDevice=dst.cbegin();
       ins.srcDevice=src.cbegin();
       CU_SAFE_CALL(cuMemcpy2D(&ins));
@@ -560,16 +566,16 @@ template <int mem1, int mem2> struct _ucl_memcpy {
     if (p1::PADDED==0 || p2::PADDED==0) {
       size_t src_offset=0, dst_offset=0;
       for (size_t i=0; i<rows; i++) {                       
-        CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
-                                  src.cbegin()+src_offset,cols));
+        CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
+                                       src.cbegin()+src_offset,cols,cq));
         src_offset+=spitch;
         dst_offset+=dpitch;
       }
     } else {
       CUDA_MEMCPY2D ins;
       _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
       ins.dstDevice=dst.cbegin();
       ins.srcDevice=src.cbegin();
       CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 1d62b09bac..ad3cd39cd0 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   if (my_gpu>=gpu->num_devices())
     return -2;
     
+  #ifndef CUDA_PRX
   if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
     return -7;
+  #endif
   
   if (gpu->set(my_gpu)!=UCL_SUCCESS)
     return -6;
 
+  gpu->push_command_queue();
+  gpu->set_command_queue(1);
+
   _long_range_precompute=0;
 
-  int flag=compile_kernels();
-
+  int flag=0;
+  for (int i=0; i<_procs_per_gpu; i++) {
+    if (_gpu_rank==i)
+      flag=compile_kernels();
+    gpu_barrier();
+  }
   return flag;
 }
 
diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp
index cb070184e2..5642e5bbfe 100644
--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
   // copy fp from device to host for comm
   _nlocal=nlocal;
   time_fp1.start();
-  ucl_copy(host_fp,dev_fp,nlocal,false);
+  ucl_copy(host_fp,dev_fp,nlocal,true);
   time_fp1.stop();
+  time_fp1.sync_stop();
 }
 
 // ---------------------------------------------------------------------------
@@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
   // copy fp from device to host for comm
   _nlocal=inum_full;
   time_fp1.start();
-  ucl_copy(host_fp,dev_fp,inum_full,false);
+  ucl_copy(host_fp,dev_fp,inum_full,true);
   time_fp1.stop();
+  time_fp1.sync_stop();
   
   return this->nbor->host_jlist.begin()-host_start;
 }
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 24b5906019..4f022baf71 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   time_kernel.init(*dev);
   time_hybrid1.init(*dev);
   time_hybrid2.init(*dev);
+  time_transpose.init(*dev);
   time_nbor.zero();
   time_kernel.zero();
   time_hybrid1.zero();
   time_hybrid2.zero();
+  time_transpose.zero();
 
   _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
   if (_max_atoms==0)
@@ -203,6 +205,7 @@ void Neighbor::clear() {
     time_nbor.clear();
     time_hybrid1.clear();
     time_hybrid2.clear();
+    time_transpose.clear();
   }
 }
 
@@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   }
 }
 
+template <class numtyp, class acctyp>
+void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
+  if (maxn>_max_nbors) {  
+    int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
+    dev_nbor.clear();
+    success=success && 
+            (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
+    _gpu_bytes=dev_nbor.row_bytes();
+    if (_max_host>0) {
+      host_nbor.clear();
+      dev_host_nbor.clear();
+      success=success && (host_nbor.alloc(mn*_max_host,*dev,
+                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
+      success=success && (dev_host_nbor.alloc(mn*_max_host,
+                                        *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
+      int *ptr=host_nbor.begin();
+      for (int i=0; i<_max_host; i++) {
+        host_jlist[i]=ptr;
+        ptr+=mn;
+      }                                                 
+      _gpu_bytes+=dev_host_nbor.row_bytes();
+    } else {
+      dev_host_nbor.view(dev_nbor);
+      dev_host_numj.view(dev_nbor);
+    }
+    if (_alloc_packed) {
+      dev_packed.clear();
+      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
+                                           UCL_READ_ONLY)==UCL_SUCCESS);
+      _gpu_bytes+=dev_packed.row_bytes();
+    }
+    _max_nbors=mn;
+  }
+}
+
 template <class numtyp, class acctyp>
 void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                                const int nall, Atom<numtyp,acctyp> &atom, 
@@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 
   const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
 
+  if (_maxspecial>0) {
+    time_nbor.start();
+    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
+    view_nspecial.view(nspecial[0],nt*3,*dev);
+    view_special.view(special[0],nt*_maxspecial,*dev);
+    view_tag.view(tag,nall,*dev);
+    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
+    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
+    ucl_copy(atom.dev_tag,view_tag,nall,false);
+    time_nbor.stop();
+    if (_time_device)
+      time_nbor.add_to_total();
+    time_transpose.start();
+    const int b2x=_block_cell_2d;
+    const int b2y=_block_cell_2d;
+    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
+    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
+    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
+    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
+                             &_maxspecial,&nt);        
+    time_transpose.stop();
+  }
+  
   // If binning on CPU, do this now
   if (_gpu_nbor==2) {
     double stime = MPI_Wtime();
@@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       cell_id[i]=id;
       host_cell_counts[id+1]++;
     }
+    
+    mn=0;
+    for (int i=0; i<_ncells; i++)
+      mn=std::max(mn,host_cell_counts[i]);
+    mn*=8;
+    resize_max_neighbors<numtyp,acctyp>(mn,success);
+    if (!success)
+      return;
+    _total_atoms=nt;
+
     cell_iter[0]=0;
     for (int i=1; i<_ncells; i++) {
       host_cell_counts[i]+=host_cell_counts[i-1];
@@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     _bin_time+=MPI_Wtime()-stime;
   }        
 
-  if (_maxspecial>0) {
-    time_nbor.start();
-    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
-    view_nspecial.view(nspecial[0],nt*3,*dev);
-    view_special.view(special[0],nt*_maxspecial,*dev);
-    view_tag.view(tag,nall,*dev);
-    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
-    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
-    ucl_copy(atom.dev_tag,view_tag,nall,false);
-    time_nbor.stop();
-    if (_time_device)
-      time_nbor.add_to_total();
-    time_kernel.start();
-    const int b2x=_block_cell_2d;
-    const int b2y=_block_cell_2d;
-    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
-    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
-    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
-    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
-                             &_maxspecial,&nt);        
-  } else
-    time_kernel.start();
+  time_kernel.start();
 
   _nbor_pitch=inum;
   _shared->neigh_tex.bind_float(atom.dev_x,4);
@@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   /* Get the maximum number of nbors and realloc if necessary */
   UCL_D_Vec<int> numj;
   numj.view_offset(inum,dev_nbor,inum);
-  ucl_copy(host_acc,numj,inum,false);
+  ucl_copy(host_acc,numj,inum,true);
   if (nt>inum) {
     UCL_H_Vec<int> host_offset;
     host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,true);
   }
-  mn=host_acc[0];
-  for (int i=1; i<nt; i++)
-    mn=std::max(mn,host_acc[i]);
+  
+  if (_gpu_nbor!=2) {
+    host_acc.sync();
+    mn=host_acc[0];
+    for (int i=1; i<nt; i++)
+      mn=std::max(mn,host_acc[i]);
 
-  if (mn>_max_nbors) {  
-    mn=static_cast<int>(static_cast<double>(mn)*1.10);
-    dev_nbor.clear();
-    success=success && 
-            (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
-    _gpu_bytes=dev_nbor.row_bytes();
-    if (_max_host>0) {
-      host_nbor.clear();
-      dev_host_nbor.clear();
-      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
-                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-      success=success && (dev_host_nbor.alloc(mn*_max_host,
-                                        dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
-      int *ptr=host_nbor.begin();
-      for (int i=0; i<_max_host; i++) {
-        host_jlist[i]=ptr;
-        ptr+=mn;
-      }                                                 
-      _gpu_bytes+=dev_host_nbor.row_bytes();
-    } else {
-      dev_host_nbor.view(dev_nbor);
-      dev_host_numj.view(dev_nbor);
-    }
-    if (_alloc_packed) {
-      dev_packed.clear();
-      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
-                                           UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_packed.row_bytes();
-    }
-    if (!success)
+    if (mn>_max_nbors) {  
+      resize_max_neighbors<numtyp,acctyp>(mn,success);
+      if (!success)
+        return;
+      time_kernel.stop();
+      if (_time_device)
+        time_kernel.add_to_total();
+      build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
+                      nspecial, special, success, mn);
       return;
-    _max_nbors=mn;
-    time_kernel.stop();
-    if (_time_device)
-      time_kernel.add_to_total();
-    build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
-                    special, success, mn);
-    return;
+    }
   }
   
   if (_maxspecial>0) {
@@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   time_kernel.stop();
 
   time_nbor.start();
-  if (inum<nt)
-    ucl_copy(host_nbor,dev_host_nbor,false);
+  if (inum<nt) {
+    ucl_copy(host_nbor,dev_host_nbor,true);
+    host_nbor.sync();
+  }
   time_nbor.stop();
 }
 
diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h
index 6ecbe97aa4..f9abb638c8 100644
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@@ -106,13 +106,24 @@ class Neighbor {
 
   inline void acc_timers() {
     if (_nbor_time_avail) {
-      time_nbor.add_to_total();
-      time_kernel.add_to_total();
       if (_gpu_nbor==2) {
-        time_hybrid1.add_to_total();
-        time_hybrid2.add_to_total();
+        int mn=0;
+        for (int i=0; i<_total_atoms; i++)
+          mn=std::max(mn,host_acc[i]);
+        if (mn>_max_nbors)
+          assert(0==1);
+      }
+      if (_time_device) {
+        time_nbor.add_to_total();
+        time_kernel.add_to_total();
+        if (_gpu_nbor==2) {
+          time_hybrid1.add_to_total();
+          time_hybrid2.add_to_total();
+        }
+        if (_maxspecial>0)
+          time_transpose.add_to_total();
+        _nbor_time_avail=false;
       }
-      _nbor_time_avail=false;
     }
   }
 
@@ -213,7 +224,7 @@ class Neighbor {
   UCL_D_Vec<int> dev_cell_counts;
 
   /// Device timers
-  UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
+  UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
   
  private:
   NeighborShared *_shared;
@@ -228,6 +239,10 @@ class Neighbor {
   
   int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
   int _threads_per_atom;
+  int _total_atoms;
+
+  template <class numtyp, class acctyp>
+  inline void resize_max_neighbors(const int maxn, bool &success);
 };
 
 }
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index 36cd8c42ff..0d7c7212e9 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
 __kernel void transpose(__global int *out, __global int *in, int columns_in, 
                         int rows_in)
 {
-	__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
+	__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 	
 	unsigned ti=THREAD_ID_X;
 	unsigned tj=THREAD_ID_Y;