From 7f5a82dc54e699648d2372a2c52d0cab851d13a0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 00:34:43 -0500 Subject: [PATCH] Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 1 for short nbor) --- lib/gpu/lal_amoeba.cpp | 39 ++++++++++++----- lib/gpu/lal_amoeba.cu | 86 ++++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.cpp | 23 +++++++--- lib/gpu/lal_base_amoeba.h | 7 ++- src/GPU/pair_amoeba_gpu.cpp | 2 +- 5 files changed, 103 insertions(+), 54 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 08b3f1c9a5..3a83f57594 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_polar.set_size(GX,BX); this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + return GX; } @@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX; + int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); + + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } - GX=static_cast(ceil(static_cast(ainum)/BX)); - this->k_short_nbor.set_size(GX,BX); - // NOTE: this->nbor->dev_packed is not allocated!! -/* - this->k_short_nbor.run(&this->atom->x, &_off2, - &this->nbor->dev_nbor, &this->nbor->dev_packed, - &ainum, &nbor_pitch, &this->_threads_per_atom); -*/ - GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 9df1dbe485..bcb3aef309 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, const __global int *dev_packed, + const __global int *dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, __global numtyp4 *restrict tep, @@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; int numj, nbor, nbor_end; + const __global int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, //numtyp qtmp; fetch(qtmp,i,q_tex); //int itype=ix.w; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + ci = polar1[i].x; // rpole[i][0]; dix = polar1[i].y; // rpole[i][1]; diy = polar1[i].z; // rpole[i][2]; @@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for ( ; nbor global_device; template -BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -100,9 +100,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, } else { _nbor_data=&(nbor->dev_nbor); } - + + bool allocate_packed = false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,false,_threads_per_atom); + _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); if (success!=0) return success; @@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); @@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); @@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, @@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, resize_atom(inum,nall,success); resize_local(inum,host_inum,nbor->max_nbors(),success); if (!success) - return; + return 0; atom->cast_copy_x(host_x,host_type); int mn; @@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; + return mn; } // --------------------------------------------------------------------------- @@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, nspecial15, special15, success); if (!success) @@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 755f11610f..eb8938d7c4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -123,7 +123,7 @@ class BaseAmoeba { int **firstneigh, bool &success); /// Build neighbor list on device - void build_nbor_list(const int inum, const int host_inum, + int build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -236,6 +236,8 @@ class BaseAmoeba { int add_onefive_neighbors(); + UCL_D_Vec dev_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; @@ -251,8 +253,9 @@ class BaseAmoeba { bool _compiled; int _block_size, _block_bio_size, _threads_per_atom; int _extra_fields; - double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; double _gpu_overhead, _driver_overhead; + bool short_nbor_avail; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index c51f741c0a..9fc2ea5114 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); }