From 5a6426bf96b2aa8d69d8e4580460b82a48d7573c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 00:56:15 -0500 Subject: [PATCH] Only transfer data arrays that are needed in each kernel --- lib/gpu/lal_base_amoeba.cpp | 53 ++++++++++--------- lib/gpu/lal_hippo.cpp | 102 ++++++++++-------------------------- 2 files changed, 55 insertions(+), 100 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 7cd410b6b8..c56cb77aa3 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -350,8 +350,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); if (eatom) _eflag=2; else if (eflag_in) _eflag=1; @@ -509,7 +508,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -647,30 +646,34 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int n = 0; int nstride = 4; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; - } + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][0]; + pextra[idx+1] = rpole[i][1]; + pextra[idx+2] = rpole[i][2]; + pextra[idx+3] = rpole[i][3]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = 
rpole[i][6]; - pextra[idx+3] = rpole[i][8]; - } + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][4]; + pextra[idx+1] = rpole[i][5]; + pextra[idx+2] = rpole[i][6]; + pextra[idx+3] = rpole[i][8]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][9]; + pextra[idx+1] = rpole[i][12]; + pextra[idx+2] = (numtyp)amtype[i]; + pextra[idx+3] = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; } n += nstride*_nall; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index a5e3be5974..5a348c9272 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -314,23 +314,12 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const double cpu_time, bool &success, const double aewald, const double off2_disp, double *host_q, double *boxlo, double *prd) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast necesary data arrays from host to device here - // because the neighbor lists are rebuilt and other per-atom arrays - // (x, type) are ready on the device. 
- int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); this->_off2_disp = off2_disp; this->_aewald = aewald; @@ -344,7 +333,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, this->hd_balancer.stop_timer(); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -407,25 +396,11 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
- int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -451,7 +426,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -516,17 +491,11 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays (uind, uinp, pval) from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -544,7 +513,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return 
firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -608,17 +577,11 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer extra data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -636,7 +599,7 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -698,23 +661,12 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and 
other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + //this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -740,7 +692,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; } // ---------------------------------------------------------------------------