From 78ef0d631fefab0af68d22371271cb8935c5e3b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 25 Sep 2021 12:25:34 -0500 Subject: [PATCH] Working on the multipole real-space term of hippo --- lib/gpu/lal_base_amoeba.cpp | 10 ++- lib/gpu/lal_base_amoeba.h | 4 +- lib/gpu/lal_hippo.cpp | 133 +++++++++++++++++++++++++++++++----- lib/gpu/lal_hippo.cu | 83 ++++++++++++++-------- lib/gpu/lal_hippo.h | 16 ++++- lib/gpu/lal_hippo_ext.cpp | 4 +- src/GPU/pair_hippo_gpu.cpp | 20 +++--- 7 files changed, 207 insertions(+), 63 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index b8e927d6ce..1a299e902f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -757,7 +757,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp) { + double** uind, double** uinp, double* pval) { // signal that we need to transfer extra data from the host atom->extra_data_unavail(); @@ -812,6 +812,14 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+2] = uinp[i][2]; } } + + if (pval) { + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = pval[i]; + } + } } template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 997e7b21ed..fc665ec731 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -131,7 +131,7 @@ class BaseAmoeba { bool &success); /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed - int** precompute(const int ago, const int inum_full, const int nall, + virtual int** precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, @@ -232,7 +232,7 @@ class BaseAmoeba { /// cast host arrays into a single array for atom->extra void cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp); + double** uind, double** uinp, double* pval=nullptr); /// Per-atom arrays UCL_Vector _tep, _fieldp; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index fad749a185..10d75f2393 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -155,6 +155,102 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** HippoT::precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>this->_nmax) { + this->_nmax = nall; + this->dev_nspecial15.clear(); + this->dev_special15.clear(); + this->dev_special15_t.clear(); + this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return nullptr; + } + + this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->atom->add_q_data(); + this->atom->add_extra_data(); + + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+this->_max_nbors) > this->dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + this->dev_short_nbor.resize((2+this->_max_nbors)*this->_nmax); + } + + return this->nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- @@ -201,9 +297,9 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -270,19 +366,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int** HippoT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double* host_pval, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -311,9 +408,9 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -380,7 +477,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 56da15f8aa..bc5d9270d4 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -908,7 +908,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp *restrict pval, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, @@ -945,6 +944,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp4* polar1 = (numtyp4*)(&extra[0]); numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); if (ii { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double* host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -69,8 +81,8 @@ class Hippo : public BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index fa09e7bce4..390f713d98 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -140,7 +140,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -148,7 +148,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 91465abb82..6ac22e0721 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -79,7 +79,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -135,7 +135,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = true; // true for HIPPO when ready - gpu_multipole_real_ready = false; + gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; gpu_polar_real_ready = false; @@ -294,14 +294,14 @@ void PairHippoGPU::multipole_real() double felec = electric / am_dielectric; firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator");