diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8bcbd6c4cb..08b3f1c9a5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, "k_amoeba_polar", "k_amoeba_udirect2b", - "k_amoeba_umutual2b"); + "k_amoeba_umutual2b", "k_amoeba_short_nbor"); if (success!=0) return success; @@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - + int GX; + + GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + // NOTE: this->nbor->dev_packed is not allocated!! +/* + this->k_short_nbor.run(&this->atom->x, &_off2, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); +*/ + GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 30db5ba334..9df1dbe485 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - if (r2>off2) continue; - + if (r2>off2) { + if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); + continue; + } numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor, } // if ii } + +/* +__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_, + const numtyp off2, __global int * dev_nbor, + const __global int * dev_packed, + const int inum, const int nbor_pitch, + const int t_per_atom) { + int tid, ii, offset, n_stride, i; + atom_info(t_per_atom,ii,tid,offset); + + int new_numj=0; + + if (ii1) { + for (unsigned int s=t_per_atom/2; s>0; s>>=1) + new_numj += shfl_down(new_numj, s, t_per_atom); + } + if (offset==0 && iipair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b, + k_name_umutual2b,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else { _nbor_data=&(nbor->dev_nbor); - - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); + } + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, add_onefive_neighbors(); } + //nbor->copy_unpacked(inum,mn); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -450,7 +456,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (nall>_max_fieldp_size) { @@ -692,7 +698,8 @@ template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b) { + const char *kname_umutual2b, + const char *kname_short_nbor) { if (_compiled) return; @@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); + k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 3fb752c97c..755f11610f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -55,7 +55,7 @@ class BaseAmoeba { const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -239,6 +239,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; + UCL_Kernel k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -256,7 +257,7 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b9ee884fa0..c51f741c0a 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -297,7 +297,7 @@ void PairAmoebaGPU::init_style() // set the energy unit conversion factor for polar real-space calculation double felec = 0.5 * electric / am_dielectric; - + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal,