Removed memory allocation in precompute_induce, used a buffer for packing, and switched to using ucl_vector

This commit is contained in:
Trung Nguyen
2022-09-13 12:53:48 -05:00
parent 17e54c9390
commit 31047b4a31
2 changed files with 45 additions and 42 deletions

View File

@ -188,7 +188,8 @@ void BaseAmoebaT::clear_atomic() {
_fdip_phi2.clear();
_fdip_sum_phi.clear();
_cgrid_brick.clear();
hview_cgrid.clear();
hview.clear();
dev_nspecial15.clear();
dev_special15.clear();
@ -586,6 +587,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));
} else {
if (inum_full>_max_thetai_size) {
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
@ -597,53 +600,53 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
_fdip_phi1.resize(_max_thetai_size*10);
_fdip_phi2.resize(_max_thetai_size*10);
_fdip_sum_phi.resize(_max_thetai_size*20);
hview.resize(_max_thetai_size*bsorder*4);
}
}
UCL_H_Vec<double> dview;
dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));
// pack host data to device
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
dview[idx+0] = host_thetai1[i][j][0];
dview[idx+1] = host_thetai1[i][j][1];
dview[idx+2] = host_thetai1[i][j][2];
dview[idx+3] = host_thetai1[i][j][3];
hview[idx+0] = host_thetai1[i][j][0];
hview[idx+1] = host_thetai1[i][j][1];
hview[idx+2] = host_thetai1[i][j][2];
hview[idx+3] = host_thetai1[i][j][3];
}
ucl_copy(_thetai1,dview,false);
ucl_copy(_thetai1,hview,false);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
dview[idx+0] = host_thetai2[i][j][0];
dview[idx+1] = host_thetai2[i][j][1];
dview[idx+2] = host_thetai2[i][j][2];
dview[idx+3] = host_thetai2[i][j][3];
hview[idx+0] = host_thetai2[i][j][0];
hview[idx+1] = host_thetai2[i][j][1];
hview[idx+2] = host_thetai2[i][j][2];
hview[idx+3] = host_thetai2[i][j][3];
}
ucl_copy(_thetai2,dview,false);
ucl_copy(_thetai2,hview,false);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
dview[idx+0] = host_thetai3[i][j][0];
dview[idx+1] = host_thetai3[i][j][1];
dview[idx+2] = host_thetai3[i][j][2];
dview[idx+3] = host_thetai3[i][j][3];
hview[idx+0] = host_thetai3[i][j][0];
hview[idx+1] = host_thetai3[i][j][1];
hview[idx+2] = host_thetai3[i][j][2];
hview[idx+3] = host_thetai3[i][j][3];
}
ucl_copy(_thetai3,dview,false);
ucl_copy(_thetai3,hview,false);
UCL_H_Vec<int> dview_int;
dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
//UCL_H_Vec<int> dview_int;
//dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
for (int i = 0; i < inum_full; i++) {
int idx = i*4;
dview_int[idx+0] = host_igrid[i][0];
dview_int[idx+1] = host_igrid[i][1];
dview_int[idx+2] = host_igrid[i][2];
_igrid[idx+0] = host_igrid[i][0];
_igrid[idx+1] = host_igrid[i][1];
_igrid[idx+2] = host_igrid[i][2];
}
ucl_copy(_igrid, dview_int, false);
//ucl_copy(_igrid, dview_int, false);
_igrid.update_device(false);
_nzlo_out = nzlo_out;
_nzhi_out = nzhi_out;
@ -658,10 +661,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
int numel = _num_grid_points*2;
if (_cgrid_brick.cols() == 0) {
hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE);
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY);
_cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY);
} else if (numel > _cgrid_brick.cols()) {
hview_cgrid.resize(numel);
_cgrid_brick.resize(numel);
}
}
@ -684,10 +685,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
const int nxlo_out, const int nxhi_out,
bool& first_iteration)
{
// TODO: find out why this alloc helps make the cgrid_brick ucl_copy work
UCL_H_Vec<numtyp> hview;
hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY);
// allocation/resize and transfers before the first iteration
if (first_iteration) {
@ -697,15 +694,19 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
first_iteration = false;
}
// TODO: find out why this host alloc helps make the cgrid_brick update_device() work correctly
UCL_H_Vec<numtyp> hdummy;
hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY);
int n = 0;
for (int iz = nzlo_out; iz <= nzhi_out; iz++)
for (int iy = nylo_out; iy <= nyhi_out; iy++)
for (int ix = nxlo_out; ix <= nxhi_out; ix++) {
hview_cgrid[n] = host_grid_brick[iz][iy][ix][0];
hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1];
for (int iz = _nzlo_out; iz <= _nzhi_out; iz++)
for (int iy = _nylo_out; iy <= _nyhi_out; iy++)
for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) {
_cgrid_brick[n] = host_grid_brick[iz][iy][ix][0];
_cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1];
n += 2;
}
ucl_copy(_cgrid_brick, hview_cgrid, false);
_cgrid_brick.update_device(false);
const int red_blocks = fphi_uind();
@ -762,7 +763,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
int** firstneigh = nullptr;
cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval);
atom->add_extra_data();
atom->add_extra_data();
*tep_ptr=_tep.host.begin();