diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e6ffcd764a..a9c76d578e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -270,99 +270,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, return mn; } -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials -// for the polar real-space term -// --------------------------------------------------------------------------- -template -void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, - const double off2_polar, double *host_q, const int nlocal, - double *boxlo, double *prd, void **tep_ptr) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); - - dev_nspecial15.clear(); - dev_special15.clear(); - dev_special15_t.clear(); - dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - } - - *tep_ptr=_tep.host.begin(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return; - } - - int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(ago,inum_full,cpu_time); - ans->inum(inum); - host_start=inum; - - if (ago==0) { - reset_nbors(nall, inum, ilist, numj, firstneigh, success); - if (!success) - return; - } - - // packing host arrays into host_extra - - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - atom->add_q_data(); - atom->add_extra_data(); - - device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, - boxlo, prd); - - _off2_polar = off2_polar; - _felec = felec; - const int red_blocks=polar_real(eflag,vflag); - - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - // copy tep from device to host - - _tep.update_host(_max_tep_size*4,false); -} - // --------------------------------------------------------------------------- // Prepare for multiple kernel calls in a time step: // - reallocate per-atom arrays, if needed @@ -450,6 +357,8 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_short_nbor.resize((2+_max_nbors)*_nmax); } + hd_balancer.stop_timer(); + return nbor->host_jlist.begin()-host_start; } @@ -491,8 +400,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host _tep.update_host(_max_tep_size*4,false); @@ -828,7 +735,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, const double aewald, const double felec, const double off2_polar, void **tep_ptr) { - int** firstneigh = nullptr; + // cast necessary data arrays from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -845,10 +752,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host - _tep.update_host(_max_tep_size*4,false); } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2e992a33d9..0fb2469d23 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -182,11 +182,12 @@ class BaseAmoeba { const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); - + /// Interpolate the induced potential from the grid virtual void compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + /// Interpolate the multipolar potential from the grid virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec); @@ -198,17 +199,6 @@ class BaseAmoeba { const double aewald, const double felec, const double off2_polar, void **tep_ptr); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double aewald, const double felec, - const double off2_polar, double *charge, const int nlocal, double *boxlo, - double *prd, void **tep_ptr); - // copy field and fieldp from device to host after umutual2b virtual void update_fieldp(void **fieldp_ptr) { *fieldp_ptr=_fieldp.host.begin(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d4366cac85..334d75ac26 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -177,20 +177,20 @@ double HippoT::host_memory_usage() const { // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -225,16 +225,7 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, _c5 = c5; const int red_blocks=repulsion(this->_eflag,this->_vflag); - // only copy them back if this is the last kernel - // otherwise, commenting out these two lines to leave the answers - // (forces, energies and virial) on the device until the last kernel - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -303,8 +294,6 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, // (forces, energies and virial) on the device until the last kernel //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- @@ -386,15 +375,7 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, this->_aewald = aewald; const int red_blocks=multipole_real(this->_eflag,this->_vflag); - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -595,14 +576,11 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho const int red_blocks=polar_real(this->_eflag,this->_vflag); // only copy answers (forces, energies and virial) back from the device - // in the last kernel (which is polar_real here) + // in the last kernel in a timestep (which is polar_real here) this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 9644f5aca4..77450bf7b1 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -69,15 +69,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass int init_ok=0; if (world_me==0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->world_barrier(); if (message) @@ -94,15 +94,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass } if (gpu_rank==i && world_me!=0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->gpu_barrier(); if (message) @@ -121,16 +121,16 @@ void hippo_gpu_clear() { } int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, nullptr, sublo, subhi, tag, @@ -141,17 +141,17 @@ int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, } void hippo_gpu_compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15,