From a0af9627e5e9d2d3849ad74f1fe4d2ef7291123c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 6 Sep 2022 16:19:17 -0500 Subject: [PATCH] Fixed memory bugs with device array allocations --- lib/gpu/lal_amoeba.cu | 6 ++--- lib/gpu/lal_base_amoeba.cpp | 49 ++++++++++++++++++++----------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 200191cea2..4a26f7f98d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1714,7 +1714,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - int i3 = m*4*bsorder + 4*kb; + int i3 = i*4*bsorder + 4*kb; v0 = thetai3[i3]; v1 = thetai3[i3]+1; v2 = thetai3[i3+2]; @@ -1750,7 +1750,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = m*4*bsorder+4*jb; + int i2 = i*4*bsorder+4*jb; u0 = thetai2[i2]; u1 = thetai2[i2+1]; u2 = thetai2[i2+2]; @@ -1776,7 +1776,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = m*4*bsorder+4*ib; + int i1 = i*4*bsorder+4*ib; numtyp w0 = thetai1[i1]; numtyp w1 = thetai1[i1+1]; numtyp w2 = thetai1[i1+2]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index bdd43aa59e..af8d5ca481 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -563,7 +563,7 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, + double* host_grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { @@ -580,9 +580,9 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { @@ -634,13 +634,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; + dview_int.alloc(inum_full*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid,dview_int,false); + ucl_copy(_igrid, dview_int, false); + + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx * _ngridy * _ngridz; + + UCL_H_Vec dview_cgrid; + dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); + ucl_copy(_cgrid_brick,dview_cgrid,false); + } // --------------------------------------------------------------------------- @@ -666,23 +686,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, if (first_iteration) first_iteration = false; } - // update the cgrid_brick with data host - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - - UCL_H_Vec dview; - dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -929,7 +932,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); - k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); + k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex");