From 8eb722a32ad3d551a1c64e88fc768f66af771a35 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 19 Jan 2023 13:22:27 -0600 Subject: [PATCH] Enforced synchronous host-device transfers for cgrid_brick and fdip arrays --- lib/gpu/lal_base_amoeba.cpp | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 841d968e56..21d9975b28 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -591,10 +591,6 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi2, void **host_fdip_sum_phi) { - // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) @@ -605,7 +601,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n] = v; n++; } - _cgrid_brick.update_device(_num_grid_points, true); + _cgrid_brick.update_device(_num_grid_points, false); #ifdef ASYNC_DEVICE_COPY ucl_device->sync(); @@ -614,10 +610,10 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, // launch the kernel with its execution configuration (see below) fphi_uind(); - // copy data from device to host asynchronously - _fdip_phi1.update_host(_max_thetai_size*10, true); - _fdip_phi2.update_host(_max_thetai_size*10, true); - _fdip_sum_phi.update_host(_max_thetai_size*20, true); + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + _fdip_sum_phi.update_host(_max_thetai_size*20, false); // return the pointers to the host-side arrays *host_fdip_phi1 = _fdip_phi1.host.begin(); @@ -638,13 +634,7 @@ int BaseAmoebaT::fphi_uind() { const int BX=block_size(); const int GX=static_cast(ceil(static_cast(ainum)/BX)); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/BX)); - } - */ + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -666,10 +656,6 @@ int BaseAmoebaT::fphi_uind() { template void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { - // TODO: grid brick[k][j][i] is a scalar - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++)