From cad7e1b364c6b6e2a376b26b31af6386038580e3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 10:18:59 -0500 Subject: [PATCH] Moved fphi_uind up to BaseAmoeba --- lib/gpu/lal_amoeba.cpp | 32 -------------------------------- lib/gpu/lal_amoeba.cu | 6 +++--- lib/gpu/lal_amoeba.h | 1 - lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++++++++++++---- lib/gpu/lal_base_amoeba.h | 2 +- 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 38058bab55..924a175cfe 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -259,38 +259,6 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } -// --------------------------------------------------------------------------- -// Interpolate the potential from the PME grid -// --------------------------------------------------------------------------- -template -int AmoebaT::fphi_uind() { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - int ngridyz = this->_ngridy * this->_ngridz; - this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, - &this->_thetai2, &this->_thetai3, - &this->_igrid, &this->_cgrid_brick, - &this->_fdip_phi1, &this->_fdip_phi2, - &this->_fdip_sum_phi, &this->_bsorder, - &ainum, &ngridyz, &this->_ngridy, - &this->_threads_per_atom); - this->time_pair.stop(); - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 984154f16e..200191cea2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1706,7 +1706,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv012 = (numtyp)0.0; tuv111 = (numtyp)0.0; - k = igrid[3*i+2] - nlpts; + k = igrid[4*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1742,7 +1742,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tu12 = (numtyp)0.0; tu03 = (numtyp)0.0; - j = igrid[3*i+1] - nlpts; + j = igrid[4*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1763,7 +1763,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 = (numtyp)0.0; t3 = (numtyp)0.0; - int ii = igrid[3*i] - nlpts; + int ii = igrid[4*i] - nlpts; for (int ib = 0; ib < bsorder; ib++) { /* tq_1 = grid[k][j][ii][0]; diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 005ea14fb9..d12b79719f 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -91,7 +91,6 @@ class Amoeba : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); - int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index cd5a9abf81..1269a798db 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = 0; + _max_thetai_size = _max_tep_size; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -466,7 +466,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); @@ -478,7 +478,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*3); + _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); @@ -498,7 +498,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); ucl_copy(_igrid,dview_int,false); _nzlo_out = nzlo_out; @@ -666,6 +666,34 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, const int red_blocks = fphi_uind(); } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + time_pair.start(); + int ngridyz = _ngridy * _ngridz; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, + &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, + &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, + &_threads_per_atom); + time_pair.stop(); + + return GX; +} // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 8503e6fba4..d3ae3a750b 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -311,7 +311,7 @@ class BaseAmoeba { virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; - virtual int fphi_uind() = 0; + virtual int fphi_uind(); virtual int polar_real(const int eflag, const int vflag) = 0;