diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 105f18cfa8..d67fa4f869 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1646,7 +1646,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, //numtyp4 ix; fetch4(ix,ii,pos_tex); //x_[i]; acctyp fdip_buf[32]; - int j,k,m; + int j,k; int nlpts = (bsorder-1) / 2; // extract the permanent multipole field at each site diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 95b7237e46..f91b76f688 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -164,15 +164,16 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ***host_thetai3, int** igrid, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) { AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick_start, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); + host_thetai3, igrid, host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05b830d773..dfd5565f1e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -186,8 +186,10 @@ void BaseAmoebaT::clear_atomic() { _igrid.clear(); _fdip_phi1.clear(); _fdip_phi2.clear(); - _cgrid_brick.clear(); _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + hview_cgrid.clear(); + dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -563,10 +565,9 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, double**** host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { _bsorder = bsorder; @@ -642,34 +643,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); -} + ucl_copy(_igrid, dview_int, false); -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double *host_grid_brick_start, double ****host_grid_brick, - void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - host_igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - first_iteration = false; - } - - // update the cgrid_brick with data host - _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; _nylo_out = nylo_out; @@ -681,8 +656,47 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec hview_cgrid; - hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int numel = _num_grid_points*2; + if (_cgrid_brick.cols() == 0) { + hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE); + _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY); + } else if (numel > _cgrid_brick.cols()) { + hview_cgrid.resize(numel); + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double ****host_grid_brick, + void** host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) +{ + // TODO: find out why this alloc helps makes the cgrid_brick ucl_copy work + UCL_H_Vec hview; + hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, host_igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + first_iteration = false; + } + int n = 0; for (int iz = nzlo_out; iz <= nzhi_out; iz++) for (int iy = nylo_out; iy <= nyhi_out; iy++) @@ -691,10 +705,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); - _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,hview_cgrid,false); - + ucl_copy(_cgrid_brick, hview_cgrid, false); const int red_blocks = fphi_uind(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index c2c2a2d93d..a4a7a8d1a7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,10 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -183,10 +182,12 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -255,7 +256,9 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_H_Vec hview_cgrid; + UCL_D_Vec _cgrid_brick; UCL_D_Vec _igrid; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size;