diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index f3a7a06ceb..9c2aba780b 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -77,6 +77,14 @@ class UCL_Device { /// Returns the stream indexed by i inline command_queue & cq(const int i) { return _cq[i]; } + /// Set the default command queue (by default this is the null stream) + /** \param i index of the command queue (as added by push_command_queue()) + If i is 0, the default command queue is set to the null stream **/ + inline void set_command_queue(const int i) { + if (i==0) _cq[0]=0; + else _cq[0]=_cq[i]; + } + /// Block until all commands in the default stream have completed inline void sync() { sync(0); } @@ -127,7 +135,8 @@ class UCL_Device { /// Get the number of cores inline unsigned cores(const int i) { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8; - else return _properties[i].multiProcessorCount*32; } + else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32; + else return _properties[i].multiProcessorCount*192; } /// Get the gigabytes of global memory in the current device inline double gigabytes() { return gigabytes(_device); } @@ -205,6 +214,7 @@ inline int UCL_Device::set(int num) { if (_device==num) return UCL_SUCCESS; for (int i=1; i=gpu->num_devices()) return -2; - #ifndef CUDA_PRX + #ifndef CUDA_PROXY if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; #endif @@ -203,7 +203,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _time_device)) + _warp_size, _time_device)) return -3; nbor->cell_size(cell_size); diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 4f022baf71..6a086745c5 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -37,13 +37,16 @@ bool 
Neighbor::init(NeighborShared *shared, const int inum, const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, - const int threads_per_atom, const bool time_device) { + const int threads_per_atom, const int warp_size, + const bool time_device) { clear(); _threads_per_atom=threads_per_atom; _block_cell_2d=block_cell_2d; _block_cell_id=block_cell_id; + _max_block_nbor_build=block_nbor_build; _block_nbor_build=block_nbor_build; + _warp_size=warp_size; _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; @@ -418,6 +421,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, for (int i=0; i<_ncells; i++) mn=std::max(mn,host_cell_counts[i]); mn*=8; + set_nbor_block_size(mn/2); + resize_max_neighbors(mn,success); if (!success) return; @@ -497,6 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, mn=host_acc[0]; for (int i=1; i<inum; i++) mn=std::max(mn,host_acc[i]); + set_nbor_block_size(mn); if (mn>_max_nbors) { resize_max_neighbors(mn,success); diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index f9abb638c8..a2d2eda560 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -67,7 +67,7 @@ class Neighbor { const int gpu_nbor, const int gpu_host, const bool pre_cut, const int block_cell_2d, const int block_cell_id, const int block_nbor_build, const int threads_per_atom, - const bool time_device); + const int warp_size, const bool time_device); /// Set the size of the cutoff+skin inline void cell_size(const double size) { _cell_size=size; } @@ -237,12 +237,20 @@ class Neighbor { double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); - int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells; - int _threads_per_atom; - int _total_atoms; + int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build; + int _ncells, _threads_per_atom, _total_atoms; template <class numtyp> inline void resize_max_neighbors(const int maxn, bool &success); + + int _warp_size; + 
inline void set_nbor_block_size(const int mn) { + int desired=mn/(2*_warp_size); + desired*=_warp_size; + if (desired<_warp_size) desired=_warp_size; + else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; + _block_nbor_build=desired; + } }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 0d7c7212e9..99990ece67 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -119,6 +119,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, int ix = BLOCK_ID_X; int iy = BLOCK_ID_Y % ncelly; int iz = BLOCK_ID_Y / ncelly; + int bsx = BLOCK_SIZE_X; int icell = ix + iy*ncellx + iz*ncellx*ncelly; @@ -134,9 +135,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, numtyp4 diff; numtyp r2; - int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X); + int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx); for (int ii = 0; ii < cap; ii++) { - int i = icell_begin + tid + ii*BLOCK_SIZE_X; + int i = icell_begin + tid + ii*bsx; int pid_i = nall, pid_j, stride; numtyp4 atom_i, atom_j; int cnt = 0; @@ -173,14 +174,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_, int num_atom_cell = jcell_end - jcell_begin; // load jcell to shared memory - int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD); + int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx); for (int k = 0; k < num_iter; k++) { - int end_idx = min(BLOCK_NBOR_BUILD, - num_atom_cell-k*BLOCK_NBOR_BUILD); + int end_idx = min(bsx, num_atom_cell-k*bsx); if (tid < end_idx) { - pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin]; + pid_j = cell_particle_id[tid+k*bsx+jcell_begin]; cell_list_sh[tid] = pid_j; atom_j = fetch_pos(pid_j,x_); //[pid_j]; pos_sh[tid].x = atom_j.x; diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index a5f2f6eedf..184664cdd0 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -100,6 +100,8 @@ #else +#if (ARCH < 300) + #define THREADS_PER_ATOM 4 
#define THREADS_PER_CHARGE 8 #define BLOCK_NBOR_BUILD 128 @@ -107,6 +109,17 @@ #define BLOCK_BIO_PAIR 128 #define MAX_SHARED_TYPES 11 +#else + +#define THREADS_PER_ATOM 4 +#define THREADS_PER_CHARGE 8 +#define BLOCK_NBOR_BUILD 128 +#define BLOCK_PAIR 512 +#define BLOCK_BIO_PAIR 512 +#define MAX_SHARED_TYPES 11 + +#endif + #endif #define WARP_SIZE 32