From a955c52830f320ad65815fc88fa6e8051cd55e02 Mon Sep 17 00:00:00 2001
From: "W. Michael Brown" <brownw@ornl.gov>
Date: Mon, 4 Apr 2011 13:41:31 -0400
Subject: [PATCH] Consolidated block size defines into device header.

---
 lib/gpu/atomic_gpu_memory.cpp    |  4 +---
 lib/gpu/atomic_gpu_memory.h      |  2 --
 lib/gpu/charge_gpu_memory.cpp    |  4 +---
 lib/gpu/charge_gpu_memory.h      |  2 --
 lib/gpu/gb_gpu_memory.cpp        |  4 +---
 lib/gpu/gb_gpu_memory.h          |  2 --
 lib/gpu/pair_gpu_build_kernel.cu | 24 ++++++++++++------------
 lib/gpu/pair_gpu_device.cpp      | 18 ++++++++++--------
 lib/gpu/pair_gpu_device.h        | 19 ++++++++++++++-----
 lib/gpu/pair_gpu_nbor.cpp        |  9 +++++----
 lib/gpu/pppm_gpu_memory.cpp      |  9 +++++++--
 lib/gpu/pppm_gpu_memory.h        |  1 +
 12 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp
index 2e5d91b993..3205a8289d 100644
--- a/lib/gpu/atomic_gpu_memory.cpp
+++ b/lib/gpu/atomic_gpu_memory.cpp
@@ -65,9 +65,7 @@ int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
   ucl_device=device->gpu;
   atom=&device->atom;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h
index c7117c0288..d82f4371d3 100644
--- a/lib/gpu/atomic_gpu_memory.h
+++ b/lib/gpu/atomic_gpu_memory.h
@@ -18,8 +18,6 @@
 #ifndef ATOMIC_GPU_MEMORY_H
 #define ATOMIC_GPU_MEMORY_H
 
-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp
index e78282064c..77bc897363 100644
--- a/lib/gpu/charge_gpu_memory.cpp
+++ b/lib/gpu/charge_gpu_memory.cpp
@@ -65,9 +65,7 @@ int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
   ucl_device=device->gpu;
   atom=&device->atom;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device,pair_program);
 
   // Initialize host-device load balancer
diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h
index 4dc33fd2dc..dd775a74f1 100644
--- a/lib/gpu/charge_gpu_memory.h
+++ b/lib/gpu/charge_gpu_memory.h
@@ -18,8 +18,6 @@
 #ifndef CHARGE_GPU_MEMORY_H
 #define CHARGE_GPU_MEMORY_H
 
-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp
index a22e582725..0e68607f5a 100644
--- a/lib/gpu/gb_gpu_memory.cpp
+++ b/lib/gpu/gb_gpu_memory.cpp
@@ -81,9 +81,7 @@ int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
   ucl_device=device->gpu;
   atom=&device->atom;
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>ucl_device->group_size())
-    _block_size=ucl_device->group_size();
+  _block_size=device->pair_block_size();
   compile_kernels(*ucl_device);
 
   // Initialize host-device load balancer
diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h
index f47f9ff758..8a6653170f 100644
--- a/lib/gpu/gb_gpu_memory.h
+++ b/lib/gpu/gb_gpu_memory.h
@@ -18,8 +18,6 @@
 #ifndef GB_GPU_MEMORY_H
 #define GB_GPU_MEMORY_H
 
-#define BLOCK_1D 64
-
 #include "pair_gpu_device.h"
 #include "pair_gpu_balance.h"
 #include "mpi.h"
diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu
index 6552edf674..1a483ad0cd 100644
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@@ -54,27 +54,27 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
 #define numtyp4 float4
 #endif
 
-#define CELL_BLOCK_SIZE 64
-#define BLOCK_2D 8
+#define BLOCK_CELL_2D 8
+#define BLOCK_NBOR_BUILD 64
 
 __kernel void transpose(int *out, int *in, int columns_in, int rows_in)
 {
-	__local float block[BLOCK_2D][BLOCK_2D+1];
+	__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 	
 	unsigned ti=THREAD_ID_X;
 	unsigned tj=THREAD_ID_Y;
 	unsigned bi=BLOCK_ID_X;
 	unsigned bj=BLOCK_ID_Y;
 	
-	unsigned i=bi*BLOCK_2D+ti;
-	unsigned j=bj*BLOCK_2D+tj;
+	unsigned i=bi*BLOCK_CELL_2D+ti;
+	unsigned j=bj*BLOCK_CELL_2D+tj;
 	if ((i<columns_in) && (j<rows_in))
 		block[tj][ti]=in[j*columns_in+i];
 
 	__syncthreads();
 
-	i=bj*BLOCK_2D+ti;
-	j=bi*BLOCK_2D+tj;
+	i=bj*BLOCK_CELL_2D+ti;
+	j=bi*BLOCK_CELL_2D+tj;
 	if ((i<rows_in) && (j<columns_in))
 		out[j*rows_in+i] = block[ti][tj];
 }
@@ -153,8 +153,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
 	  
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
 
-  __shared__ int cell_list_sh[CELL_BLOCK_SIZE];
-  __shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
+  __shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
+  __shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
 
   int icell_begin = cell_counts[icell];
   int icell_end = cell_counts[icell+1];
@@ -202,13 +202,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
           int num_atom_cell = jcell_end - jcell_begin;
 	  
           // load jcell to shared memory
-          int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
+          int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
 
           for (int k = 0; k < num_iter; k++) {
-            int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
+            int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
 	    
             if (tid < end_idx) {
-              pid_j =  cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
+              pid_j =  cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
               cell_list_sh[tid] = pid_j;
               atom_j = fetch_pos(pid_j,pos); //[pid_j];
               pos_sh[tid].x = atom_j.x;
diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp
index 669708a3d8..7137222578 100644
--- a/lib/gpu/pair_gpu_device.cpp
+++ b/lib/gpu/pair_gpu_device.cpp
@@ -29,7 +29,6 @@
 #include "pair_gpu_dev_ptx.h"
 #endif
 
-#define BLOCK_1D 64
 #define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
 
 template <class numtyp, class acctyp>
@@ -121,9 +120,9 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   
   gpu->set(my_gpu);
 
-  _block_size=BLOCK_1D;
-  if (static_cast<size_t>(_block_size)>gpu->group_size())
-    _block_size=gpu->group_size();
+  _block_pair=BLOCK_PAIR;
+  if (static_cast<size_t>(_block_pair)>gpu->group_size())
+    _block_pair=gpu->group_size();
 
   _long_range_precompute=0;
   
@@ -446,8 +445,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
                                          UCL_Timer &time_interp,
                                          PairGPUAns<numtyp,acctyp> &ans, 
                                          const double max_bytes, 
-                                         const double cpu_time, FILE *screen) {
-  double single[7], times[7];
+                                         const double cpu_time, 
+                                         const double idle_time, FILE *screen) {
+  double single[8], times[8];
 
   single[0]=time_out.total_seconds();
   single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
@@ -456,8 +456,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
   single[4]=time_interp.total_seconds();
   single[5]=ans.transfer_time()+ans.cast_time();
   single[6]=cpu_time;
+  single[7]=idle_time;
 
-  MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
+  MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
 
   double my_max_bytes=max_bytes+atom.max_gpu_bytes();
   double mpi_max_bytes;
@@ -486,8 +487,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
         fprintf(screen,"Total:           %.4f s.\n",
                 (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
                 _replica_size);
-        fprintf(screen,"CPU Poisson:     %.4f s.\n",times[6]/_replica_size);
       }
+      fprintf(screen,"CPU Poisson:     %.4f s.\n",times[6]/_replica_size);
+      fprintf(screen,"CPU Idle Time:   %.4f s.\n",times[7]/_replica_size);
       fprintf(screen,"Max Mem / Proc:  %.2f MB.\n",max_mb);
 
       fprintf(screen,"-------------------------------------");
diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index ea13f9921c..b11bfbc356 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -34,7 +34,14 @@
 // - Must be >=PPPM_MAX_SPLINE^2
 // - Must be a multiple of 32
 #define PPPM_BLOCK_1D 64
-
+// Default block size for pair styles
+#define BLOCK_PAIR 64
+// Default block size in each dimension for cell list builds and matrix trans
+#define BLOCK_CELL_2D 8
+// Default block size for binning atoms in cell list builds
+#define BLOCK_CELL_ID 128
+// Default block size for neighbor list builds
+#define BLOCK_NBOR_BUILD 64
 
 template <class numtyp, class acctyp, 
           class grdtyp, class grdtyp4> class PPPMGPUMemory;
@@ -123,7 +130,7 @@ class PairGPUDevice {
                            UCL_Timer &time_interp, 
                            PairGPUAns<numtyp,acctyp> &ans, 
                            const double max_bytes, const double cpu_time,
-                           FILE *screen);
+                           const double cpu_idle_time, FILE *screen);
 
   /// Clear all memory on host and device associated with atom and nbor data
   void clear();
@@ -203,13 +210,15 @@ class PairGPUDevice {
   inline int init_count() const { return _init_count; }
   /// Return the number of threads accessing memory simulatenously
   inline int num_mem_threads() const { return _num_mem_threads; }
+  /// Return the min of the pair block size or the device max block size
+  inline int pair_block_size() const { return _block_pair; }
 
   // -------------------- SHARED DEVICE ROUTINES -------------------- 
   // Perform asynchronous zero of integer array 
   void zero(UCL_D_Vec<int> &mem, const int numel) {
     int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
-                                    _block_size));
-    k_zero.set_size(num_blocks,_block_size);
+                                    _block_pair));
+    k_zero.set_size(num_blocks,_block_pair);
     k_zero.run(&mem.begin(),&numel);
   }
 
@@ -259,7 +268,7 @@ class PairGPUDevice {
   double _particle_split;
   double _cpu_full;
 
-  int _block_size, _num_mem_threads;
+  int _block_pair, _num_mem_threads;
   UCL_Program *dev_program;
   UCL_Kernel k_zero, k_info;
   bool _compiled;
diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp
index aed1da0a12..64583cb0b6 100644
--- a/lib/gpu/pair_gpu_nbor.cpp
+++ b/lib/gpu/pair_gpu_nbor.cpp
@@ -18,6 +18,7 @@
 
 #include "pair_gpu_precision.h"
 #include "pair_gpu_nbor.h"
+#include "pair_gpu_device.h"
 #include "math.h"
 
 int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
@@ -272,8 +273,8 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
     time_nbor.stop();
     time_nbor.add_to_total();
     time_kernel.start();
-    const int b2x=8;
-    const int b2y=8;
+    const int b2x=BLOCK_CELL_2D;
+    const int b2y=BLOCK_CELL_2D;
     const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
     const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
     _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
@@ -298,7 +299,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
   _cell_bytes=cell_counts.row_bytes();
 
   /* build cell list on GPU */
-  const int neigh_block=128;
+  const int neigh_block=BLOCK_CELL_ID;
   const int GX=(int)ceil((float)nall/neigh_block);
   const numtyp sublo0=static_cast<numtyp>(sublo[0]);
   const numtyp sublo1=static_cast<numtyp>(sublo[1]);
@@ -321,7 +322,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
                              &nall, &ncell_3d);
 
   /* build the neighbor list */
-  const int cell_block=64;
+  const int cell_block=BLOCK_NBOR_BUILD;
   _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
   _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
                             &cell_counts.begin(), &dev_nbor.begin(),
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 389fc59269..fa767d2e7b 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -164,6 +164,8 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   
   d_error_flag.zero();
   _max_bytes+=1;
+  
+  _cpu_idle_time=0.0;
 
   return h_brick.begin();
 }
@@ -185,7 +187,8 @@ void PPPMGPUMemoryT::clear(const double cpu_time) {
   
   acc_timers();
   device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
-                              *ans,_max_bytes+_max_an_bytes,cpu_time,screen);
+                              *ans,_max_bytes+_max_an_bytes,cpu_time,
+                              _cpu_idle_time,screen);
 
   if (_compiled) {
     k_particle_map.clear();
@@ -273,7 +276,7 @@ void PPPMGPUMemoryT::_precompute(const int ago, const int nlocal, const int nall
 
   time_rho.start();
   BX=block_size();
-std::cout << "Block pencils: " << _block_pencils << std::endl;
+
   GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
                       _block_pencils));
   k_make_rho.set_size(GX,BX);
@@ -308,7 +311,9 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   if (!success || nlocal==0)
     return 0;
     
+  double t=MPI_Wtime();
   time_out.sync_stop();
+  _cpu_idle_time+=MPI_Wtime()-t;
 
   _precompute_done=false;
 
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index 128eb9bca9..dabcfc39f0 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -176,6 +176,7 @@ class PPPMGPUMemory {
   bool _allocated, _compiled, _precompute_done;
   int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;
   double  _max_bytes, _max_an_bytes;
+  double _cpu_idle_time;
   
   grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv;