diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 425caaabbb..42384cf7de 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -173,18 +173,10 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 304e23274f..e3da81762e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -670,17 +670,10 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double ****host_grid_brick, +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, - void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) + void **host_fdip_sum_phi) { // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 5aeb729993..a88a63e870 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,13 +151,6 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - virtual void precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); - /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -180,15 +173,17 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - virtual void compute_fphi_uind(const int inum_full, const int bsorder, + /// Allocate/resize per-atom arrays before induce() + virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); + const int nxlo_out, const int nxhi_out); + + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 2cc17c6ced..1bd6bade3a 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -204,18 +204,9 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } double hippo_gpu_bytes() { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 5770d9542d..e5cdc281b9 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -95,15 +95,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -299,13 +292,6 @@ void PairAmoebaGPU::induce() int debug = 1; - first_induce_iteration = true; - - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -351,6 +337,15 @@ void PairAmoebaGPU::induce() } } + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -1160,14 +1155,8 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 77b594177b..420874df21 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -62,8 +62,6 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 9317b11794..1151027993 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -112,15 +112,8 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -424,14 +417,6 @@ void PairHippoGPU::induce() int debug = 1; - first_induce_iteration = true; - - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -486,6 +471,16 @@ void PairHippoGPU::induce() udirp[i][0], udirp[i][1], udirp[i][2]); } */ + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + + hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -1296,14 +1291,8 @@ void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 742fbfb119..b1b908411d 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -62,8 +62,6 @@ class PairHippoGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template