From 785131932c87e0575d336f5b296cbce5731f13b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 20 Sep 2022 13:58:17 -0500 Subject: [PATCH] Added fphi_mpole in amoeba/gpu, fixed a bug in the kernel when indexing grid --- lib/gpu/lal_amoeba.cpp | 4 +- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_amoeba_ext.cpp | 4 + lib/gpu/lal_base_amoeba.cpp | 74 ++++++++++++++- lib/gpu/lal_base_amoeba.h | 14 ++- lib/gpu/lal_hippo.cpp | 4 +- lib/gpu/lal_hippo.cu | 178 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_amoeba_gpu.cpp | 37 +++++++- 8 files changed, 300 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 02870ea861..7be4a6f59c 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -64,8 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,amoeba, "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_fphi_uind", "k_amoeba_short_nbor", - "k_amoeba_special15"); + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index da5c6f0c3c..6f77fb932f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -2026,7 +2026,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, for (int ib = 0; ib < bsorder; ib++) { int i1 = istart + ib; numtyp4 tha1 = thetai1[i1]; - int gidx = 2*(k*ngridxy + j*ngridx + i); + int gidx = k*ngridxy + j*ngridx + i; numtyp tq = grid[gidx]; t0 += tq*tha1.x; t1 += tq*tha1.y; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 42384cf7de..1f56fa86f8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -179,6 +179,10 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi); +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e3da81762e..08dcd8123e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { k_udirect2b.clear(); k_umutual2b.clear(); k_fphi_uind.clear(); + k_fphi_mpole.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -66,6 +67,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const char *k_name_umutual2b, const char *k_name_polar, const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, const char *k_name_short_nbor, const char* k_name_special15) { screen=_screen; @@ -100,8 +102,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, - k_name_udirect2b, k_name_umutual2b,k_name_polar, - k_name_fphi_uind, k_name_short_nbor, k_name_special15); + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -559,6 +562,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 // host_igrid is allocated with nmax by 4 // - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points) // --------------------------------------------------------------------------- template @@ -568,7 +572,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - + // update bsorder with that of the kspace solver _bsorder = bsorder; // allocate or resize per-atom arrays @@ -586,7 +590,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { - if (inum_full>_max_thetai_size) { + if (_thetai1.cols()<_max_thetai_size*bsorder) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder); _thetai2.resize(_max_thetai_size*bsorder); @@ -667,6 +671,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() // --------------------------------------------------------------------------- template @@ -687,7 +692,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - _cgrid_brick.update_device(false); + _cgrid_brick.update_device(_num_grid_points*2, false); const int red_blocks = fphi_uind(); @@ -727,6 +732,63 @@ int BaseAmoebaT::fphi_uind() { return GX; } +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +{ + // TODO: grid brick[k][j][i] is a scalar + UCL_H_Vec hdummy; + hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + const int red_blocks = fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fphi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- @@ -920,6 +982,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_umutual2b, const char *kname_polar, const char *kname_fphi_uind, + const char *kname_fphi_mpole, const char *kname_short_nbor, const char* kname_special15) { if (_compiled) @@ -935,6 +998,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_umutual2b.set_function(*pair_program, kname_umutual2b); k_polar.set_function(*pair_program, kname_polar); k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); k_short_nbor.set_function(*pair_program, kname_short_nbor); k_special15.set_function(*pair_program, kname_special15); pos_tex.get_texture(*pair_program, "pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a88a63e870..a5ee245623 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -64,8 +64,8 @@ class BaseAmoeba { const double gpu_split, FILE *screen, const void *pair_program, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -185,6 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -279,7 +281,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,13 +308,14 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind(); + virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9917ab91a2..3de6dc544c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,8 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,hippo, "k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_umutual2b", "k_hippo_polar", - "k_hippo_fphi_uind", "k_hippo_short_nbor", - "k_hippo_special15"); + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", + "k_hippo_short_nbor", "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index dde8f9bfd5..91793747ef 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2346,6 +2346,184 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, } } +/* ---------------------------------------------------------------------- + fphi_mpole = multipole potential from grid + fphi_mpole extracts the permanent multipole potential from + the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fphi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (iinlocal, bsorder, thetai1, thetai2, thetai3, igrid, @@ -1311,6 +1314,8 @@ void PairAmoebaGPU::polar_kspace() double cphid[4],cphip[4]; double a[3][3]; // indices not flipped vs Fortran + bool gpu_fphi_mpole_ready = true; + // indices into the electrostatic field array // decremented by 1 versus Fortran @@ -1373,6 +1378,18 @@ void PairAmoebaGPU::polar_kspace() moduli(); bspline_fill(); + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, p_kspace->nzlo_out, + p_kspace->nzhi_out, p_kspace->nylo_out, + p_kspace->nyhi_out, p_kspace->nxlo_out, + p_kspace->nxhi_out); + } + + // convert Cartesian multipoles to fractional coordinates cmp_to_fmp(cmp,fmp); @@ -1441,8 +1458,24 @@ void PairAmoebaGPU::polar_kspace() double ***gridpost = (double ***) p_kspace->post_convolution(); // get potential - - fphi_mpole(gridpost,fphi); + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + //printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + + double *_fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } for (i = 0; i < nlocal; i++) { for (k = 0; k < 20; k++)