diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d67fa4f869..53a9f6aa3e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,9 +1621,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp *restrict thetai1, - const __global numtyp *restrict thetai2, - const __global numtyp *restrict thetai3, + const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp *restrict grid, __global numtyp *restrict fdip_phi1, @@ -1698,11 +1698,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - int i3 = ii*4*bsorder + 4*kb; - numtyp v0 = thetai3[i3]; - numtyp v1 = thetai3[i3+1]; - numtyp v2 = thetai3[i3+2]; - numtyp v3 = thetai3[i3+3]; + int i3 = ii*bsorder + kb; + numtyp4 tha3 = thetai3[i3]; + numtyp v0 = tha3.x; + numtyp v1 = tha3.y; + numtyp v2 = tha3.z; + numtyp v3 = tha3.w; numtyp tu00_1 = (numtyp)0.0; numtyp tu01_1 = (numtyp)0.0; numtyp tu10_1 = (numtyp)0.0; @@ -1734,11 +1735,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = ii*4*bsorder+4*jb; - numtyp u0 = thetai2[i2]; - numtyp u1 = thetai2[i2+1]; - numtyp u2 = thetai2[i2+2]; - numtyp u3 = thetai2[i2+3]; + int i2 = ii*bsorder+jb; + numtyp4 tha2 = thetai2[i2]; + numtyp u0 = tha2.x; + numtyp u1 = tha2.y; + numtyp u2 = tha2.z; + numtyp u3 = tha2.w; numtyp t0_1 = (numtyp)0.0; numtyp t1_1 = (numtyp)0.0; numtyp t2_1 = (numtyp)0.0; @@ -1760,11 +1762,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = ii*4*bsorder+4*ib; - numtyp w0 = thetai1[i1]; - numtyp w1 = thetai1[i1+1]; - numtyp w2 = thetai1[i1+2]; - numtyp w3 = thetai1[i1+3]; + int i1 = ii*bsorder+ib; + numtyp4 tha1 = thetai1[i1]; + numtyp w0 = tha1.x; + numtyp w1 = tha1.y; + numtyp w2 = tha1.z; + numtyp w3 = tha1.w; int gidx = 2*(k*ngridxy + j*ngridx + i); numtyp tq_1 = grid[gidx]; numtyp tq_2 = grid[gidx+1]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5989ba889d..3e14159d5a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -189,8 +189,6 @@ void BaseAmoebaT::clear_atomic() { _fdip_sum_phi.clear(); _cgrid_brick.clear(); - hview.clear(); - dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -578,30 +576,25 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); - - hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); - } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); _fdip_sum_phi.resize(_max_thetai_size*20); - - hview.resize(_max_thetai_size*bsorder*4); } } @@ -609,44 +602,47 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai1[i][j][0]; - hview[idx+1] = host_thetai1[i][j][1]; - hview[idx+2] = host_thetai1[i][j][2]; - hview[idx+3] = host_thetai1[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; } - ucl_copy(_thetai1,hview,false); + _thetai1.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai2[i][j][0]; - hview[idx+1] = host_thetai2[i][j][1]; - hview[idx+2] = host_thetai2[i][j][2]; - hview[idx+3] = host_thetai2[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; } - ucl_copy(_thetai2,hview,false); + _thetai2.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai3[i][j][0]; - hview[idx+1] = host_thetai3[i][j][1]; - hview[idx+2] = host_thetai3[i][j][2]; - hview[idx+3] = host_thetai3[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; } - ucl_copy(_thetai3,hview,false); + _thetai3.update_device(true); - //UCL_H_Vec dview_int; - //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; _igrid[idx+0] = host_igrid[i][0]; _igrid[idx+1] = host_igrid[i][1]; _igrid[idx+2] = host_igrid[i][2]; } - //ucl_copy(_igrid, dview_int, false); - _igrid.update_device(false); + _igrid.update_device(true); _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; @@ -694,7 +690,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, first_iteration = false; } - // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly + // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 760d0e3005..802b6962b7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -256,7 +256,7 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; @@ -264,9 +264,6 @@ class BaseAmoeba { int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; - /// buffer - UCL_H_Vec hview; - // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans;