diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 1d62e483d8..a9e02ee7b4 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_multipole", "k_amoeba_udirect2b", - "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_short_nbor"); + "k_amoeba_dispersion", "k_amoeba_multipole", + "k_amoeba_udirect2b", "k_amoeba_umutual2b", + "k_amoeba_polar", "k_amoeba_short_nbor"); if (success!=0) return success; @@ -150,7 +150,48 @@ double AmoebaT::host_memory_usage() const { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Calculate the dispersion real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + printf("launching dispersion\n"); + this->k_dispersion.set_size(GX,BX); + this->k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- template int AmoebaT::multipole_real(const int eflag, const int vflag) { diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index e44f302563..60205b16ff 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -413,7 +413,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, - const __global numtyp4 *restrict sp_disp, + const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -422,8 +422,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, const int t_per_atom, const numtyp aewald, - const numtyp felec, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const numtyp off2) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -876,9 +875,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // accumulate tq store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial + // accumate force, energy and virial: use _acc if not the first kernel store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv); + //store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, + // offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } /* ---------------------------------------------------------------------- @@ -1785,7 +1786,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumate force, energy and virial //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, -// offset,eflag,vflag,ans,engv); + // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 39d65375cb..df556a1018 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -38,9 +38,11 @@ class Amoeba : public BaseAmoeba { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const int max_amclass, - const double *host_pdamp, const double *host_thole, const double *host_dirdamp, - const int *host_amtype2class, const double *host_special_mpole, - const double *host_special_hal, const double *host_special_repel, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, const double *host_special_disp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -91,6 +93,7 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; + int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 55c08adf82..309830e1ce 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -117,6 +117,23 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd); +} + int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a5552f6f3b..f252131ea7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -33,6 +33,7 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; + k_dispersion.clear(); k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); @@ -54,6 +55,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, + const char *k_name_dispersion, const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, @@ -90,8 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b, - k_name_umutual2b,k_name_polar,k_name_short_nbor); + compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -427,7 +429,74 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space +// Reneighbor on GPU if necessary, and then compute dispersion real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + _off2_disp = off2_disp; + _aewald = aewald; + const int red_blocks=dispersion_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -816,6 +885,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_dispersion, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, @@ -828,7 +898,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, pair_program=new UCL_Program(dev); std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - + + k_dispersion.set_function(*pair_program,kname_dispersion); k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fea1728e8c..fcff3186c7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,9 +54,9 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -142,6 +142,18 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring + int** compute_dispersion_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double off2_disp, double *charge, + double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -257,8 +269,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15; - UCL_Kernel k_short_nbor; + UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -276,13 +288,14 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; numtyp _aewald,_felec; - numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); + virtual int dispersion_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 72c142888e..8a2f09d443 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -348,7 +348,7 @@ class PairAmoeba : public Pair { int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 35bba58a14..4894ac6203 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -65,6 +65,17 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd); + int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, @@ -118,8 +129,8 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tq_pinned = nullptr; gpu_hal_ready = false; - gpu_repulsion_ready = false; - gpu_dispersion_real_ready = false; + gpu_repulsion_ready = false; // true for HIPPO + gpu_dispersion_real_ready = false; // true for HIPPO gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -194,6 +205,54 @@ void PairAmoebaGPU::init_style() /* ---------------------------------------------------------------------- */ +void PairAmoebaGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); +} + +/* ---------------------------------------------------------------------- */ + void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 710f997e4c..de17703dc7 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,6 +35,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); + virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **);