Refactored precompute_induce to overlap data transfers with kernel launches
This commit is contained in:
@ -162,6 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
|
|||||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out) {
|
||||||
|
AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
|
||||||
|
host_thetai3, igrid, nzlo_out, nzhi_out,
|
||||||
|
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||||
|
}
|
||||||
|
|
||||||
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid, double ****host_grid_brick,
|
double ***host_thetai3, int** igrid, double ****host_grid_brick,
|
||||||
|
|||||||
@ -580,7 +580,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
|||||||
_thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
_thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
_thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
_thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
|
||||||
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
|
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY);
|
||||||
|
|
||||||
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
|
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
|
||||||
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
|
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
|
||||||
@ -674,7 +674,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
|
|||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** host_igrid,
|
double ***host_thetai3, int** host_igrid,
|
||||||
double ****host_grid_brick,
|
double ****host_grid_brick,
|
||||||
void** host_fdip_phi1,
|
void **host_fdip_phi1,
|
||||||
void **host_fdip_phi2,
|
void **host_fdip_phi2,
|
||||||
void **host_fdip_sum_phi,
|
void **host_fdip_sum_phi,
|
||||||
const int nzlo_out, const int nzhi_out,
|
const int nzlo_out, const int nzhi_out,
|
||||||
@ -682,16 +682,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
|
|||||||
const int nxlo_out, const int nxhi_out,
|
const int nxlo_out, const int nxhi_out,
|
||||||
bool& first_iteration)
|
bool& first_iteration)
|
||||||
{
|
{
|
||||||
// allocation/resize and transfers before the first iteration
|
// TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correctly
|
||||||
|
|
||||||
if (first_iteration) {
|
|
||||||
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
|
|
||||||
host_thetai3, host_igrid, nzlo_out, nzhi_out,
|
|
||||||
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
|
||||||
first_iteration = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: find out why this host alloc helps the cgrid_brick update_device() work correctly
|
|
||||||
UCL_H_Vec<numtyp> hdummy;
|
UCL_H_Vec<numtyp> hdummy;
|
||||||
hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY);
|
hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY);
|
||||||
|
|
||||||
|
|||||||
@ -193,6 +193,17 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
|
|||||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out) {
|
||||||
|
HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2,
|
||||||
|
host_thetai3, igrid, nzlo_out, nzhi_out,
|
||||||
|
nylo_out, nyhi_out, nxlo_out, nxhi_out);
|
||||||
|
}
|
||||||
|
|
||||||
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid, double ****host_grid_brick,
|
double ***host_thetai3, int** igrid, double ****host_grid_brick,
|
||||||
|
|||||||
@ -88,6 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
|||||||
|
|
||||||
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
|
void amoeba_gpu_update_fieldp(void **fieldp_ptr);
|
||||||
|
|
||||||
|
void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out);
|
||||||
|
|
||||||
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
@ -294,6 +301,12 @@ void PairAmoebaGPU::induce()
|
|||||||
|
|
||||||
first_induce_iteration = true;
|
first_induce_iteration = true;
|
||||||
|
|
||||||
|
amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1,
|
||||||
|
thetai2, thetai3, igrid,
|
||||||
|
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||||
|
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||||
|
ic_kspace->nxlo_out, ic_kspace->nxhi_out);
|
||||||
|
|
||||||
// set cutoffs, taper coeffs, and PME params
|
// set cutoffs, taper coeffs, and PME params
|
||||||
// create qfac here, free at end of polar()
|
// create qfac here, free at end of polar()
|
||||||
|
|
||||||
|
|||||||
@ -105,6 +105,13 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
|
|||||||
|
|
||||||
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
void hippo_gpu_update_fieldp(void **fieldp_ptr);
|
||||||
|
|
||||||
|
void hippo_gpu_precompute_induce(const int inum_full, const int bsorder,
|
||||||
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
|
double ***host_thetai3, int** igrid,
|
||||||
|
const int nzlo_out, const int nzhi_out,
|
||||||
|
const int nylo_out, const int nyhi_out,
|
||||||
|
const int nxlo_out, const int nxhi_out);
|
||||||
|
|
||||||
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
|
||||||
double ***host_thetai1, double ***host_thetai2,
|
double ***host_thetai1, double ***host_thetai2,
|
||||||
double ***host_thetai3, int** igrid,
|
double ***host_thetai3, int** igrid,
|
||||||
@ -419,6 +426,12 @@ void PairHippoGPU::induce()
|
|||||||
|
|
||||||
first_induce_iteration = true;
|
first_induce_iteration = true;
|
||||||
|
|
||||||
|
hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1,
|
||||||
|
thetai2, thetai3, igrid,
|
||||||
|
ic_kspace->nzlo_out, ic_kspace->nzhi_out,
|
||||||
|
ic_kspace->nylo_out, ic_kspace->nyhi_out,
|
||||||
|
ic_kspace->nxlo_out, ic_kspace->nxhi_out);
|
||||||
|
|
||||||
// set cutoffs, taper coeffs, and PME params
|
// set cutoffs, taper coeffs, and PME params
|
||||||
// create qfac here, free at end of polar()
|
// create qfac here, free at end of polar()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user