From 21b7fb2fcfb842b1f332eb737ae83fa5f89d48d2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 14:55:20 -0500 Subject: [PATCH] Exposing fphi_uind to the gpu pair style, still keeping the part not ready though --- lib/gpu/lal_amoeba_ext.cpp | 12 ++- lib/gpu/lal_base_amoeba.cpp | 200 ++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 14 +-- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 60 ++++++++++- src/GPU/pair_amoeba_gpu.h | 4 + 6 files changed, 193 insertions(+), 99 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 6989a5e6f6..151c38c9c4 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,9 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, double ***grid) { + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) { + AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 1269a798db..bdd43aa59e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = _max_tep_size; + _max_thetai_size = 0; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -441,81 +441,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Prepare for umutual1: bspline_fill -// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed -// - transfer extra data from host to device -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { - - _bsorder = bsorder; - - // allocate or resize per-atom arrays - // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax - // will be consolidated once all terms are ready - - if (_max_thetai_size == 0) { - _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); - - } else { - if (inum_full>_max_thetai_size) { - _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*4); - - _fdip_phi1.resize(_max_thetai_size*10); - _fdip_phi2.resize(_max_thetai_size*10); - _fdip_sum_phi.resize(_max_thetai_size*20); - } - } - - UCL_H_Vec dview; - - // copy from host to device - - dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai1,dview,false); - dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai2,dview,false); - dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai3,dview,false); - - UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); - ucl_copy(_igrid,dview_int,false); - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -626,6 +551,98 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // _fieldp.update_host(_max_fieldp_size*8,false); } +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by by 4 +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { + + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder*4); + _thetai2.resize(_max_thetai_size*bsorder*4); + _thetai3.resize(_max_thetai_size*bsorder*4); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + UCL_H_Vec dview; + dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai1[i][j][0]; + dview[idx+1] = host_thetai1[i][j][1]; + dview[idx+2] = host_thetai1[i][j][2]; + dview[idx+3] = host_thetai1[i][j][3]; + } + ucl_copy(_thetai1,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai2[i][j][0]; + dview[idx+1] = host_thetai2[i][j][1]; + dview[idx+2] = host_thetai2[i][j][2]; + dview[idx+3] = host_thetai2[i][j][3]; + } + ucl_copy(_thetai2,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai3[i][j][0]; + dview[idx+1] = host_thetai3[i][j][1]; + dview[idx+2] = host_thetai3[i][j][2]; + dview[idx+3] = host_thetai3[i][j][3]; + } + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + for (int i = 0; i < inum_full; i++) { + int idx = i*4; + dview_int[idx+0] = host_igrid[i][0]; + dview_int[idx+1] = host_igrid[i][1]; + dview_int[idx+2] = host_igrid[i][2]; + } + ucl_copy(_igrid,dview_int,false); +} + // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid @@ -633,19 +650,22 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void** host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) + int nxlo_out, int nxhi_out, bool& first_iteration) { - // allocation/resize and transfers (do this right after udirect?) + // allocation/resize and transfers before the first iteration - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, - nxlo_out, nxhi_out); - + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -664,6 +684,14 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, ucl_copy(_cgrid_brick,dview,false); const int red_blocks = fphi_uind(); + + _fdip_phi1.update_host(_max_thetai_size*10); + _fdip_phi2.update_host(_max_thetai_size*10); + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index d3ae3a750b..a001423812 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,8 +151,8 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); virtual void precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, double* grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); @@ -180,12 +180,12 @@ class BaseAmoeba { const double aewald, const double off2_polar, void **fieldp_ptr); virtual void compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + int nxlo_out, int nxhi_out, bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 93978ab1f2..17b2d4a1e8 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -407,7 +407,7 @@ class PairAmoeba : public Pair { void grid_mpole(double **, double ***); void fphi_mpole(double ***, double **); void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); void grid_disp(double ***); void kewald(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index cd3c01cde3..bf6db3472d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,9 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, - double** host_thetai1, double** host_thetai2, - double** host_thetai3, double ***grid); +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -117,6 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; gpu_umutual1_ready = true; + gpu_fphi_uind_ready = false; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -481,6 +486,8 @@ void PairAmoebaGPU::induce() // conjugate gradient iteration of the mutual induced dipoles + first_induce_iteration = true; + while (!done) { iter++; @@ -1115,6 +1122,53 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) */ } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, + igrid, ic_kspace->grid_brick_start, + &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); + + int nlocal = atom->nlocal; + double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + + double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + } + + double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 20 * i; + for (int m = 0; m < 20; m++) + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e0563cd8b5..fe6ed3368f 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual1(double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -56,9 +57,12 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; + bool first_induce_iteration; + void udirect2b_cpu(); template