From 1166845fcf025292ac37646ed37e4b62d3bcc85b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 18 Sep 2021 10:22:22 -0500 Subject: [PATCH] Prepared data structure for the dispersion real-space term --- lib/gpu/lal_amoeba.cpp | 38 ++++++--- lib/gpu/lal_amoeba.cu | 166 ++++++++++++++++++++++++++++++------ lib/gpu/lal_amoeba.h | 14 +-- lib/gpu/lal_amoeba_ext.cpp | 24 +++--- src/GPU/pair_amoeba_gpu.cpp | 12 +-- 5 files changed, 197 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index d2f2b1bf79..28ed02b480 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -44,12 +44,14 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { } template -int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -80,11 +82,22 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; host_write[i].z = host_dirdamp[i]; - host_write[i].w = (numtyp)0; + host_write[i].w = host_amtype2class[i]; } - damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(damping,host_write,false); + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); @@ -100,9 +113,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=damping.row_bytes() - + sp_polar.row_bytes() - + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -112,7 +124,7 @@ void AmoebaT::clear() { return; _allocated=false; - damping.clear(); + coeff_amtype.clear(); sp_polar.clear(); this->clear_atomic(); @@ -151,7 +163,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, @@ -192,7 +204,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -232,7 +244,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, @@ -271,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 41185f30e3..5a1151f610 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -147,7 +147,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ @@ -210,8 +210,7 @@ _texture( q_tex,int2); } \ } -// SHUFFLE_AVAIL == 1 -#else +#else // SHUFFLE_AVAIL == 1 #define local_allocate_store_ufld() @@ -280,7 +279,7 @@ _texture( q_tex,int2); #if (EVFLAG == 1) -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -376,7 +375,7 @@ _texture( q_tex,int2); // EVFLAG == 0 #else -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -394,6 +393,125 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + dispersion = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + } // nbor + + } // ii { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, - const double *host_special_mpole, + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -60,8 +61,11 @@ class Amoeba : public BaseAmoeba { // --------------------------- TYPE DATA -------------------------- - /// pdamp = damping.x; thole = damping.y - UCL_D_Vec damping; + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 8493e9331d..804bf10f32 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -27,13 +27,14 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -63,11 +64,13 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -83,11 +86,12 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f932f05e25..25f4718163 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -50,13 +50,14 @@ enum{GORDON1,GORDON2}; // External functions from cuda library for atom decomposition -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int* host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -168,9 +169,10 @@ void PairAmoebaGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, - special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, atom->nlocal, + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale, tq_size);