Cleaned up and converted arrays to ucl_vector of numtyp4

This commit is contained in:
Trung Nguyen
2022-09-13 16:48:39 -05:00
parent 31047b4a31
commit 9c4d3db558
3 changed files with 54 additions and 58 deletions

View File

@ -1621,9 +1621,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
------------------------------------------------------------------------- */
__kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
const __global numtyp *restrict thetai1,
const __global numtyp *restrict thetai2,
const __global numtyp *restrict thetai3,
const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3,
const __global int *restrict igrid,
const __global numtyp *restrict grid,
__global numtyp *restrict fdip_phi1,
@ -1698,11 +1698,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
v2 = thetai3[m][kb][2];
v3 = thetai3[m][kb][3];
*/
int i3 = ii*4*bsorder + 4*kb;
numtyp v0 = thetai3[i3];
numtyp v1 = thetai3[i3+1];
numtyp v2 = thetai3[i3+2];
numtyp v3 = thetai3[i3+3];
int i3 = ii*bsorder + kb;
numtyp4 tha3 = thetai3[i3];
numtyp v0 = tha3.x;
numtyp v1 = tha3.y;
numtyp v2 = tha3.z;
numtyp v3 = tha3.w;
numtyp tu00_1 = (numtyp)0.0;
numtyp tu01_1 = (numtyp)0.0;
numtyp tu10_1 = (numtyp)0.0;
@ -1734,11 +1735,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
u2 = thetai2[m][jb][2];
u3 = thetai2[m][jb][3];
*/
int i2 = ii*4*bsorder+4*jb;
numtyp u0 = thetai2[i2];
numtyp u1 = thetai2[i2+1];
numtyp u2 = thetai2[i2+2];
numtyp u3 = thetai2[i2+3];
int i2 = ii*bsorder+jb;
numtyp4 tha2 = thetai2[i2];
numtyp u0 = tha2.x;
numtyp u1 = tha2.y;
numtyp u2 = tha2.z;
numtyp u3 = tha2.w;
numtyp t0_1 = (numtyp)0.0;
numtyp t1_1 = (numtyp)0.0;
numtyp t2_1 = (numtyp)0.0;
@ -1760,11 +1762,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
t2_2 += tq_2*thetai1[m][ib][2];
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
*/
int i1 = ii*4*bsorder+4*ib;
numtyp w0 = thetai1[i1];
numtyp w1 = thetai1[i1+1];
numtyp w2 = thetai1[i1+2];
numtyp w3 = thetai1[i1+3];
int i1 = ii*bsorder+ib;
numtyp4 tha1 = thetai1[i1];
numtyp w0 = tha1.x;
numtyp w1 = tha1.y;
numtyp w2 = tha1.z;
numtyp w3 = tha1.w;
int gidx = 2*(k*ngridxy + j*ngridx + i);
numtyp tq_1 = grid[gidx];
numtyp tq_2 = grid[gidx+1];

View File

@ -189,8 +189,6 @@ void BaseAmoebaT::clear_atomic() {
_fdip_sum_phi.clear();
_cgrid_brick.clear();
hview.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
@ -578,30 +576,25 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
if (_max_thetai_size == 0) {
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
_thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
_thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
_igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));
} else {
if (inum_full>_max_thetai_size) {
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.resize(_max_thetai_size*bsorder*4);
_thetai2.resize(_max_thetai_size*bsorder*4);
_thetai3.resize(_max_thetai_size*bsorder*4);
_thetai1.resize(_max_thetai_size*bsorder);
_thetai2.resize(_max_thetai_size*bsorder);
_thetai3.resize(_max_thetai_size*bsorder);
_igrid.resize(_max_thetai_size*4);
_fdip_phi1.resize(_max_thetai_size*10);
_fdip_phi2.resize(_max_thetai_size*10);
_fdip_sum_phi.resize(_max_thetai_size*20);
hview.resize(_max_thetai_size*bsorder*4);
}
}
@ -609,44 +602,47 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
hview[idx+0] = host_thetai1[i][j][0];
hview[idx+1] = host_thetai1[i][j][1];
hview[idx+2] = host_thetai1[i][j][2];
hview[idx+3] = host_thetai1[i][j][3];
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai1[i][j][0];
v.y = host_thetai1[i][j][1];
v.z = host_thetai1[i][j][2];
v.w = host_thetai1[i][j][3];
_thetai1[idx] = v;
}
ucl_copy(_thetai1,hview,false);
_thetai1.update_device(true);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
hview[idx+0] = host_thetai2[i][j][0];
hview[idx+1] = host_thetai2[i][j][1];
hview[idx+2] = host_thetai2[i][j][2];
hview[idx+3] = host_thetai2[i][j][3];
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai2[i][j][0];
v.y = host_thetai2[i][j][1];
v.z = host_thetai2[i][j][2];
v.w = host_thetai2[i][j][3];
_thetai2[idx] = v;
}
ucl_copy(_thetai2,hview,false);
_thetai2.update_device(true);
for (int i = 0; i < inum_full; i++)
for (int j = 0; j < bsorder; j++) {
int idx = i*4*bsorder + 4*j;
hview[idx+0] = host_thetai3[i][j][0];
hview[idx+1] = host_thetai3[i][j][1];
hview[idx+2] = host_thetai3[i][j][2];
hview[idx+3] = host_thetai3[i][j][3];
int idx = i*bsorder + j;
numtyp4 v;
v.x = host_thetai3[i][j][0];
v.y = host_thetai3[i][j][1];
v.z = host_thetai3[i][j][2];
v.w = host_thetai3[i][j][3];
_thetai3[idx] = v;
}
ucl_copy(_thetai3,hview,false);
_thetai3.update_device(true);
//UCL_H_Vec<int> dview_int;
//dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
for (int i = 0; i < inum_full; i++) {
int idx = i*4;
_igrid[idx+0] = host_igrid[i][0];
_igrid[idx+1] = host_igrid[i][1];
_igrid[idx+2] = host_igrid[i][2];
}
//ucl_copy(_igrid, dview_int, false);
_igrid.update_device(false);
_igrid.update_device(true);
_nzlo_out = nzlo_out;
_nzhi_out = nzhi_out;
@ -694,7 +690,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
first_iteration = false;
}
// TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly
// TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly
UCL_H_Vec<numtyp> hdummy;
hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY);

View File

@ -256,7 +256,7 @@ class BaseAmoeba {
int _nmax, _max_tep_size, _max_fieldp_size;
int _bsorder;
UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3;
UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
UCL_Vector<int,int> _igrid;
UCL_Vector<numtyp,numtyp> _cgrid_brick;
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
@ -264,9 +264,6 @@ class BaseAmoeba {
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
int _ngridx, _ngridy, _ngridz, _num_grid_points;
/// buffer
UCL_H_Vec<double> hview;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;