From d30ba2d7eb6469558a6fc80bac91f8b2ecb8f35a Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 2 Mar 2012 15:57:05 +0000 Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/geryon/VERSION.txt | 3 +- lib/gpu/geryon/nvd_device.h | 9 ++ lib/gpu/geryon/nvd_kernel.h | 11 ++- lib/gpu/geryon/nvd_memory.h | 84 ++++++++++--------- lib/gpu/lal_device.cpp | 13 ++- lib/gpu/lal_eam.cpp | 6 +- lib/gpu/lal_neighbor.cpp | 160 +++++++++++++++++++++--------------- lib/gpu/lal_neighbor.h | 27 ++++-- lib/gpu/lal_neighbor_gpu.cu | 2 +- 9 files changed, 193 insertions(+), 122 deletions(-) diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index d260cab24e..313907d611 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1,2 +1 @@ -Geryon Version 11.094 - +Geryon Version 12.034 diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 9e98de408c..8040cd70c7 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -117,6 +117,14 @@ class UCL_Device { _cq.pop_back(); } + /// Set the default command queue (by default this is the null stream) + /** \param i index of the command queue (as added by push_command_queue()) + If i is 0, the default command queue is set to the null stream **/ + inline void set_command_queue(const int i) { + if (i==0) _cq[0]=0; + else _cq[0]=_cq[i]; + } + /// Get the current CUDA device name inline std::string name() { return name(_device); } /// Get the CUDA device name @@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) { if (_device>-1) { CU_SAFE_CALL_NS(cuCtxDestroy(_context)); for (int i=1; i(upitch); if (err!=CUDA_SUCCESS) return UCL_MEMORY_ERROR; + mat.cq()=cm.cq(); return UCL_SUCCESS; } @@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows, pitch=static_cast(upitch); if (err!=CUDA_SUCCESS) return UCL_MEMORY_ERROR; + mat.cq()=d.cq(); return UCL_SUCCESS; } @@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstArray=dst.cbegin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcArray=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> { const size_t rows) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> { const size_t rows, CUstream &cq) { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstHost=dst.begin(); ins.srcHost=src.begin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); @@ -529,7 +535,7 @@ template struct _ucl_memcpy { template static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq) { - CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n)); + CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq)); } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -546,8 +552,8 @@ template struct _ucl_memcpy { } else { CUDA_MEMCPY2D ins; _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows); - ins.dstMemoryType=_nvd_set_2D_mem::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2D(&ins)); @@ -560,16 +566,16 @@ template struct _ucl_memcpy { if (p1::PADDED==0 || p2::PADDED==0) { size_t src_offset=0, dst_offset=0; for (size_t i=0; i::a(); - ins.srcMemoryType=_nvd_set_2D_mem::a(); + ins.dstMemoryType=_nvd_set_2D_mem::a(); + ins.srcMemoryType=_nvd_set_2D_mem::a(); ins.dstDevice=dst.cbegin(); ins.srcDevice=src.cbegin(); CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq)); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 1d62b09bac..ad3cd39cd0 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, if (my_gpu>=gpu->num_devices()) return -2; + #ifndef CUDA_PRX if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; + #endif if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; + gpu->push_command_queue(); + gpu->set_command_queue(1); + _long_range_precompute=0; - int flag=compile_kernels(); - + int flag=0; + for (int i=0; i<_procs_per_gpu; i++) { + if (_gpu_rank==i) + flag=compile_kernels(); + gpu_barrier(); + } return flag; } diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index cb070184e2..5642e5bbfe 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, // copy fp from device to host for comm _nlocal=nlocal; time_fp1.start(); - ucl_copy(host_fp,dev_fp,nlocal,false); + ucl_copy(host_fp,dev_fp,nlocal,true); time_fp1.stop(); + time_fp1.sync_stop(); } // --------------------------------------------------------------------------- @@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, // copy fp from device to host for comm _nlocal=inum_full; time_fp1.start(); - ucl_copy(host_fp,dev_fp,inum_full,false); + ucl_copy(host_fp,dev_fp,inum_full,true); time_fp1.stop(); + time_fp1.sync_stop(); return this->nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 24b5906019..4f022baf71 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum, time_kernel.init(*dev); time_hybrid1.init(*dev); time_hybrid2.init(*dev); + time_transpose.init(*dev); time_nbor.zero(); time_kernel.zero(); time_hybrid1.zero(); time_hybrid2.zero(); + time_transpose.zero(); _max_atoms=static_cast(static_cast(inum)*1.10); if (_max_atoms==0) @@ -203,6 +205,7 @@ void Neighbor::clear() { time_nbor.clear(); time_hybrid1.clear(); time_hybrid2.clear(); + time_transpose.clear(); } } @@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj, } } +template +void Neighbor::resize_max_neighbors(const int maxn, bool &success) { + if (maxn>_max_nbors) { + int mn=static_cast(static_cast(maxn)*1.10); + dev_nbor.clear(); + success=success && + (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS); + _gpu_bytes=dev_nbor.row_bytes(); + if (_max_host>0) { + host_nbor.clear(); + dev_host_nbor.clear(); + success=success && (host_nbor.alloc(mn*_max_host,*dev, + UCL_RW_OPTIMIZED)==UCL_SUCCESS); + success=success && (dev_host_nbor.alloc(mn*_max_host, + *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=mn; + } + _gpu_bytes+=dev_host_nbor.row_bytes(); + } else { + dev_host_nbor.view(dev_nbor); + dev_host_numj.view(dev_nbor); + } + if (_alloc_packed) { + dev_packed.clear(); + success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, + UCL_READ_ONLY)==UCL_SUCCESS); + _gpu_bytes+=dev_packed.row_bytes(); + } + _max_nbors=mn; + } +} + template void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const int nall, Atom &atom, @@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, const numtyp cell_size_cast=static_cast(_cell_size); + if (_maxspecial>0) { + time_nbor.start(); + UCL_H_Vec view_nspecial, view_special, view_tag; + view_nspecial.view(nspecial[0],nt*3,*dev); + view_special.view(special[0],nt*_maxspecial,*dev); + view_tag.view(tag,nall,*dev); + ucl_copy(dev_nspecial,view_nspecial,nt*3,false); + ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); + ucl_copy(atom.dev_tag,view_tag,nall,false); + time_nbor.stop(); + if (_time_device) + time_nbor.add_to_total(); + time_transpose.start(); + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); + const int g2y=static_cast(ceil(static_cast(nt)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), + &_maxspecial,&nt); + time_transpose.stop(); + } + // If binning on CPU, do this now if (_gpu_nbor==2) { double stime = MPI_Wtime(); @@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, cell_id[i]=id; host_cell_counts[id+1]++; } + + mn=0; + for (int i=0; i<_ncells; i++) + mn=std::max(mn,host_cell_counts[i]); + mn*=8; + resize_max_neighbors(mn,success); + if (!success) + return; + _total_atoms=nt; + cell_iter[0]=0; for (int i=1; i<_ncells; i++) { host_cell_counts[i]+=host_cell_counts[i-1]; @@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, _bin_time+=MPI_Wtime()-stime; } - if (_maxspecial>0) { - time_nbor.start(); - UCL_H_Vec view_nspecial, view_special, view_tag; - view_nspecial.view(nspecial[0],nt*3,*dev); - view_special.view(special[0],nt*_maxspecial,*dev); - view_tag.view(tag,nall,*dev); - ucl_copy(dev_nspecial,view_nspecial,nt*3,false); - ucl_copy(dev_special_t,view_special,nt*_maxspecial,false); - ucl_copy(atom.dev_tag,view_tag,nall,false); - time_nbor.stop(); - if (_time_device) - time_nbor.add_to_total(); - time_kernel.start(); - const int b2x=_block_cell_2d; - const int b2y=_block_cell_2d; - const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); - const int g2y=static_cast(ceil(static_cast(nt)/b2y)); - _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); - _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), - &_maxspecial,&nt); - } else - time_kernel.start(); + time_kernel.start(); _nbor_pitch=inum; _shared->neigh_tex.bind_float(atom.dev_x,4); @@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; numj.view_offset(inum,dev_nbor,inum); - ucl_copy(host_acc,numj,inum,false); + ucl_copy(host_acc,numj,inum,true); if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_host_numj,nt-inum,false); + ucl_copy(host_offset,dev_host_numj,nt-inum,true); } - mn=host_acc[0]; - for (int i=1; i_max_nbors) { - mn=static_cast(static_cast(mn)*1.10); - dev_nbor.clear(); - success=success && - (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS); - _gpu_bytes=dev_nbor.row_bytes(); - if (_max_host>0) { - host_nbor.clear(); - dev_host_nbor.clear(); - success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, - UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc(mn*_max_host, - dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); - int *ptr=host_nbor.begin(); - for (int i=0; i<_max_host; i++) { - host_jlist[i]=ptr; - ptr+=mn; - } - _gpu_bytes+=dev_host_nbor.row_bytes(); - } else { - dev_host_nbor.view(dev_nbor); - dev_host_numj.view(dev_nbor); - } - if (_alloc_packed) { - dev_packed.clear(); - success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev, - UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_packed.row_bytes(); - } - if (!success) + if (mn>_max_nbors) { + resize_max_neighbors(mn,success); + if (!success) + return; + time_kernel.stop(); + if (_time_device) + time_kernel.add_to_total(); + build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, + nspecial, special, success, mn); return; - _max_nbors=mn; - time_kernel.stop(); - if (_time_device) - time_kernel.add_to_total(); - build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, - special, success, mn); - return; + } } if (_maxspecial>0) { @@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_kernel.stop(); time_nbor.start(); - if (inum_max_nbors) + assert(0==1); + } + if (_time_device) { + time_nbor.add_to_total(); + time_kernel.add_to_total(); + if (_gpu_nbor==2) { + time_hybrid1.add_to_total(); + time_hybrid2.add_to_total(); + } + if (_maxspecial>0) + time_transpose.add_to_total(); + _nbor_time_avail=false; } - _nbor_time_avail=false; } } @@ -213,7 +224,7 @@ class Neighbor { UCL_D_Vec dev_cell_counts; /// Device timers - UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2; + UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose; private: NeighborShared *_shared; @@ -228,6 +239,10 @@ class Neighbor { int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells; int _threads_per_atom; + int _total_atoms; + + template + inline void resize_max_neighbors(const int maxn, bool &success); }; } diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 36cd8c42ff..0d7c7212e9 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id, __kernel void transpose(__global int *out, __global int *in, int columns_in, int rows_in) { - __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; + __local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; unsigned ti=THREAD_ID_X; unsigned tj=THREAD_ID_Y;