From a955c52830f320ad65815fc88fa6e8051cd55e02 Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Mon, 4 Apr 2011 13:41:31 -0400 Subject: [PATCH] Consolidated block size defines into device header. --- lib/gpu/atomic_gpu_memory.cpp | 4 +--- lib/gpu/atomic_gpu_memory.h | 2 -- lib/gpu/charge_gpu_memory.cpp | 4 +--- lib/gpu/charge_gpu_memory.h | 2 -- lib/gpu/gb_gpu_memory.cpp | 4 +--- lib/gpu/gb_gpu_memory.h | 2 -- lib/gpu/pair_gpu_build_kernel.cu | 24 ++++++++++++------------ lib/gpu/pair_gpu_device.cpp | 18 ++++++++++-------- lib/gpu/pair_gpu_device.h | 19 ++++++++++++++----- lib/gpu/pair_gpu_nbor.cpp | 9 +++++---- lib/gpu/pppm_gpu_memory.cpp | 9 +++++++-- lib/gpu/pppm_gpu_memory.h | 1 + 12 files changed, 52 insertions(+), 46 deletions(-) diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index 2e5d91b993..3205a8289d 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -65,9 +65,7 @@ int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, ucl_device=device->gpu; atom=&device->atom; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index c7117c0288..d82f4371d3 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef ATOMIC_GPU_MEMORY_H #define ATOMIC_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index e78282064c..77bc897363 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -65,9 +65,7 @@ int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, ucl_device=device->gpu; atom=&device->atom; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index 4dc33fd2dc..dd775a74f1 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef CHARGE_GPU_MEMORY_H #define CHARGE_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index a22e582725..0e68607f5a 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -81,9 +81,7 @@ int GB_GPU_MemoryT::init(const int ntypes, const double gamma, ucl_device=device->gpu; atom=&device->atom; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device); // Initialize host-device load balancer diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index f47f9ff758..8a6653170f 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu index 6552edf674..1a483ad0cd 100644 --- a/lib/gpu/pair_gpu_build_kernel.cu +++ b/lib/gpu/pair_gpu_build_kernel.cu @@ -54,27 +54,27 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define numtyp4 float4 #endif -#define CELL_BLOCK_SIZE 64 -#define BLOCK_2D 8 +#define BLOCK_CELL_2D 8 +#define BLOCK_NBOR_BUILD 64 __kernel void transpose(int *out, int *in, int columns_in, int rows_in) { - __local float block[BLOCK_2D][BLOCK_2D+1]; + __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; unsigned ti=THREAD_ID_X; unsigned tj=THREAD_ID_Y; unsigned bi=BLOCK_ID_X; unsigned bj=BLOCK_ID_Y; - unsigned i=bi*BLOCK_2D+ti; - unsigned j=bj*BLOCK_2D+tj; + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; if ((i template @@ -121,9 +120,9 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, gpu->set(my_gpu); - _block_size=BLOCK_1D; - if (static_cast(_block_size)>gpu->group_size()) - _block_size=gpu->group_size(); + _block_pair=BLOCK_PAIR; + if (static_cast(_block_pair)>gpu->group_size()) + _block_pair=gpu->group_size(); _long_range_precompute=0; @@ -446,8 +445,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_interp, PairGPUAns &ans, const double max_bytes, - const double cpu_time, FILE *screen) { - double single[7], times[7]; + const double cpu_time, + const double idle_time, FILE *screen) { + double single[8], times[8]; single[0]=time_out.total_seconds(); single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); @@ -456,8 +456,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, single[4]=time_interp.total_seconds(); single[5]=ans.transfer_time()+ans.cast_time(); single[6]=cpu_time; + single[7]=idle_time; - MPI_Reduce(single,times,7,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; @@ -486,8 +487,9 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); - fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); } + fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index ea13f9921c..b11bfbc356 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -34,7 +34,14 @@ // - Must be >=PPPM_MAX_SPLINE^2 // - Must be a multiple of 32 #define PPPM_BLOCK_1D 64 - +// Default block size for pair styles +#define BLOCK_PAIR 64 +// Default block size in each dimension for cell list builds and matrix trans +#define BLOCK_CELL_2D 8 +// Default block size for binning atoms in cell list builds +#define BLOCK_CELL_ID 128 +// Default block size for neighbor list builds +#define BLOCK_NBOR_BUILD 64 template class PPPMGPUMemory; @@ -123,7 +130,7 @@ class PairGPUDevice { UCL_Timer &time_interp, PairGPUAns &ans, const double max_bytes, const double cpu_time, - FILE *screen); + const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); @@ -203,13 +210,15 @@ class PairGPUDevice { inline int init_count() const { return _init_count; } /// Return the number of threads accessing memory simulatenously inline int num_mem_threads() const { return _num_mem_threads; } + /// Return the min of the pair block size or the device max block size + inline int pair_block_size() const { return _block_pair; } // -------------------- SHARED DEVICE ROUTINES -------------------- // Perform asynchronous zero of integer array void zero(UCL_D_Vec &mem, const int numel) { int num_blocks=static_cast(ceil(static_cast(numel)/ - _block_size)); - k_zero.set_size(num_blocks,_block_size); + _block_pair)); + k_zero.set_size(num_blocks,_block_pair); k_zero.run(&mem.begin(),&numel); } @@ -259,7 +268,7 @@ class PairGPUDevice { double _particle_split; double _cpu_full; - int _block_size, _num_mem_threads; + int _block_pair, _num_mem_threads; UCL_Program *dev_program; UCL_Kernel k_zero, k_info; bool _compiled; diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp index aed1da0a12..64583cb0b6 100644 --- a/lib/gpu/pair_gpu_nbor.cpp +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -18,6 +18,7 @@ #include "pair_gpu_precision.h" #include "pair_gpu_nbor.h" +#include "pair_gpu_device.h" #include "math.h" int PairGPUNbor::bytes_per_atom(const int max_nbors) const { @@ -272,8 +273,8 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, time_nbor.stop(); time_nbor.add_to_total(); time_kernel.start(); - const int b2x=8; - const int b2y=8; + const int b2x=BLOCK_CELL_2D; + const int b2y=BLOCK_CELL_2D; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int g2y=static_cast(ceil(static_cast(nt)/b2y)); _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); @@ -298,7 +299,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _cell_bytes=cell_counts.row_bytes(); /* build cell list on GPU */ - const int neigh_block=128; + const int neigh_block=BLOCK_CELL_ID; const int GX=(int)ceil((float)nall/neigh_block); const numtyp sublo0=static_cast(sublo[0]); const numtyp sublo1=static_cast(sublo[1]); @@ -321,7 +322,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, &nall, &ncell_3d); /* build the neighbor list */ - const int cell_block=64; + const int cell_block=BLOCK_NBOR_BUILD; _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), &cell_counts.begin(), &dev_nbor.begin(), diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 389fc59269..fa767d2e7b 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -164,6 +164,8 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, d_error_flag.zero(); _max_bytes+=1; + + _cpu_idle_time=0.0; return h_brick.begin(); } @@ -185,7 +187,8 @@ void PPPMGPUMemoryT::clear(const double cpu_time) { acc_timers(); device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp, - *ans,_max_bytes+_max_an_bytes,cpu_time,screen); + *ans,_max_bytes+_max_an_bytes,cpu_time, + _cpu_idle_time,screen); if (_compiled) { k_particle_map.clear(); @@ -273,7 +276,7 @@ void PPPMGPUMemoryT::_precompute(const int ago, const int nlocal, const int nall time_rho.start(); BX=block_size(); -std::cout << "Block pencils: " << _block_pencils << std::endl; + GX=static_cast(ceil(static_cast(_npts_y*_npts_z)/ _block_pencils)); k_make_rho.set_size(GX,BX); @@ -308,7 +311,9 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall, if (!success || nlocal==0) return 0; + double t=MPI_Wtime(); time_out.sync_stop(); + _cpu_idle_time+=MPI_Wtime()-t; _precompute_done=false; diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h index 128eb9bca9..dabcfc39f0 100644 --- a/lib/gpu/pppm_gpu_memory.h +++ b/lib/gpu/pppm_gpu_memory.h @@ -176,6 +176,7 @@ class PPPMGPUMemory { bool _allocated, _compiled, _precompute_done; int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms; double _max_bytes, _max_an_bytes; + double _cpu_idle_time; grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv;