diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..c52246b06b 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 498c55ceba..38058bab55 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); -/* - this->time_pair.start(); + this->time_pair.start(); + int ngridyz = this->_ngridy * this->_ngridz; this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(); + this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, + &this->_thetai2, &this->_thetai3, + &this->_igrid, &this->_cgrid_brick, + &this->_fdip_phi1, &this->_fdip_phi2, + &this->_fdip_sum_phi, &this->_bsorder, + &ainum, &ngridyz, &this->_ngridy, + &this->_threads_per_atom); this->time_pair.stop(); -*/ return GX; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1239764108..984154f16e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict thetai1, - const __global numtyp4 *restrict thetai2, - const __global numtyp4 *restrict thetai3, - const __global int4 *restrict igrid, - const __global numtyp4 *restrict grid, - __global numtyp4 *restrict fdip_phi1, - __global numtyp4 *restrict fdip_phi2, - __global numtyp4 *restrict fdip_sum_phi, + const __global numtyp *restrict thetai1, + const __global numtyp *restrict thetai2, + const __global numtyp *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fdip_phi1, + __global numtyp *restrict fdip_phi2, + __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, + const int nyzgrid, const int nygrid, const int t_per_atom) { int tid, ii, offset, i, n_stride; @@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, // extract the permanent multipole field at each site - tuv100_1 = 0.0; - tuv010_1 = 0.0; - tuv001_1 = 0.0; - tuv200_1 = 0.0; - tuv020_1 = 0.0; - tuv002_1 = 0.0; - tuv110_1 = 0.0; - tuv101_1 = 0.0; - tuv011_1 = 0.0; - tuv100_2 = 0.0; - tuv010_2 = 0.0; - tuv001_2 = 0.0; - tuv200_2 = 0.0; - tuv020_2 = 0.0; - tuv002_2 = 0.0; - tuv110_2 = 0.0; - tuv101_2 = 0.0; - tuv011_2 = 0.0; - tuv000 = 0.0; - tuv001 = 0.0; - tuv010 = 0.0; - tuv100 = 0.0; - tuv200 = 0.0; - tuv020 = 0.0; - tuv002 = 0.0; - tuv110 = 0.0; - tuv101 = 0.0; - tuv011 = 0.0; - tuv300 = 0.0; - tuv030 = 0.0; - tuv003 = 0.0; - tuv210 = 0.0; - tuv201 = 0.0; - tuv120 = 0.0; - tuv021 = 0.0; - tuv102 = 0.0; - tuv012 = 0.0; - tuv111 = 0.0; + tuv100_1 = (numtyp)0.0; + tuv010_1 = (numtyp)0.0; + tuv001_1 = (numtyp)0.0; + tuv200_1 = (numtyp)0.0; + tuv020_1 = (numtyp)0.0; + tuv002_1 = (numtyp)0.0; + tuv110_1 = (numtyp)0.0; + tuv101_1 = (numtyp)0.0; + tuv011_1 = (numtyp)0.0; + tuv100_2 = (numtyp)0.0; + tuv010_2 = (numtyp)0.0; + tuv001_2 = (numtyp)0.0; + tuv200_2 = (numtyp)0.0; + tuv020_2 = (numtyp)0.0; + tuv002_2 = (numtyp)0.0; + tuv110_2 = (numtyp)0.0; + tuv101_2 = (numtyp)0.0; + tuv011_2 = (numtyp)0.0; + tuv000 = (numtyp)0.0; + tuv001 = (numtyp)0.0; + tuv010 = (numtyp)0.0; + tuv100 = (numtyp)0.0; + tuv200 = (numtyp)0.0; + tuv020 = (numtyp)0.0; + tuv002 = (numtyp)0.0; + tuv110 = (numtyp)0.0; + tuv101 = (numtyp)0.0; + tuv011 = (numtyp)0.0; + tuv300 = (numtyp)0.0; + tuv030 = (numtyp)0.0; + tuv003 = (numtyp)0.0; + tuv210 = (numtyp)0.0; + tuv201 = (numtyp)0.0; + tuv120 = (numtyp)0.0; + tuv021 = (numtyp)0.0; + tuv102 = (numtyp)0.0; + tuv012 = (numtyp)0.0; + tuv111 = (numtyp)0.0; - k = igrid[i].z - nlpts; + k = igrid[3*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - tu00_1 = 0.0; - tu01_1 = 0.0; - tu10_1 = 0.0; - tu20_1 = 0.0; - tu11_1 = 0.0; - tu02_1 = 0.0; - tu00_2 = 0.0; - tu01_2 = 0.0; - tu10_2 = 0.0; - tu20_2 = 0.0; - tu11_2 = 0.0; - tu02_2 = 0.0; - tu00 = 0.0; - tu10 = 0.0; - tu01 = 0.0; - tu20 = 0.0; - tu11 = 0.0; - tu02 = 0.0; - tu30 = 0.0; - tu21 = 0.0; - tu12 = 0.0; - tu03 = 0.0; + int i3 = m*4*bsorder + 4*kb; + v0 = thetai3[i3]; + v1 = thetai3[i3]+1; + v2 = thetai3[i3+2]; + v3 = thetai3[i3+3]; + tu00_1 = (numtyp)0.0; + tu01_1 = (numtyp)0.0; + tu10_1 = (numtyp)0.0; + tu20_1 = (numtyp)0.0; + tu11_1 = (numtyp)0.0; + tu02_1 = (numtyp)0.0; + tu00_2 = (numtyp)0.0; + tu01_2 = (numtyp)0.0; + tu10_2 = (numtyp)0.0; + tu20_2 = (numtyp)0.0; + tu11_2 = (numtyp)0.0; + tu02_2 = (numtyp)0.0; + tu00 = (numtyp)0.0; + tu10 = (numtyp)0.0; + tu01 = (numtyp)0.0; + tu20 = (numtyp)0.0; + tu11 = (numtyp)0.0; + tu02 = (numtyp)0.0; + tu30 = (numtyp)0.0; + tu21 = (numtyp)0.0; + tu12 = (numtyp)0.0; + tu03 = (numtyp)0.0; - j = igrid[i].y - nlpts; + j = igrid[3*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - t0_1 = 0.0; - t1_1 = 0.0; - t2_1 = 0.0; - t0_2 = 0.0; - t1_2 = 0.0; - t2_2 = 0.0; - t3 = 0.0; + int i2 = m*4*bsorder+4*jb; + u0 = thetai2[i2]; + u1 = thetai2[i2+1]; + u2 = thetai2[i2+2]; + u3 = thetai2[i2+3]; + t0_1 = (numtyp)0.0; + t1_1 = (numtyp)0.0; + t2_1 = (numtyp)0.0; + t0_2 = (numtyp)0.0; + t1_2 = (numtyp)0.0; + t2_2 = (numtyp)0.0; + t3 = (numtyp)0.0; - i = igrid[m].x - nlpts; + int ii = igrid[3*i] - nlpts; for (int ib = 0; ib < bsorder; ib++) { /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; + tq_1 = grid[k][j][ii][0]; + tq_2 = grid[k][j][ii][1]; t0_1 += tq_1*thetai1[m][ib][0]; t1_1 += tq_1*thetai1[m][ib][1]; t2_1 += tq_1*thetai1[m][ib][2]; @@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - i++; + int i1 = m*4*bsorder+4*ib; + numtyp w0 = thetai1[i1]; + numtyp w1 = thetai1[i1+1]; + numtyp w2 = thetai1[i1+2]; + numtyp w3 = thetai1[i1+3]; + int gidx = 2*(k*nyzgrid + j*nygrid + ii); + tq_1 = grid[gidx]; + tq_2 = grid[gidx+1]; + t0_1 += tq_1*w0; + t1_1 += tq_1*w1; + t2_1 += tq_1*w2; + t0_2 += tq_2*w0; + t1_2 += tq_2*w1; + t2_2 += tq_2*w2; + t3 += (tq_1+tq_2)*w3; + ii++; } tu00_1 += t0_1*u0; @@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv111 += tu11*v1; k++; } + /* fdip_phi1[m][0] = 0.0; fdip_phi1[m][1] = tuv100_1; @@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, fdip_phi1[m][7] = tuv110_1; fdip_phi1[m][8] = tuv101_1; fdip_phi1[m][9] = tuv011_1; - - fdip_phi2[m][0] = 0.0; - fdip_phi2[m][1] = tuv100_2; - fdip_phi2[m][2] = tuv010_2; - fdip_phi2[m][3] = tuv001_2; - fdip_phi2[m][4] = tuv200_2; - fdip_phi2[m][5] = tuv020_2; - fdip_phi2[m][6] = tuv002_2; - fdip_phi2[m][7] = tuv110_2; - fdip_phi2[m][8] = tuv101_2; - fdip_phi2[m][9] = tuv011_2; - - fdip_sum_phi[m][0] = tuv000; - fdip_sum_phi[m][1] = tuv100; - fdip_sum_phi[m][2] = tuv010; - fdip_sum_phi[m][3] = tuv001; - fdip_sum_phi[m][4] = tuv200; - fdip_sum_phi[m][5] = tuv020; - fdip_sum_phi[m][6] = tuv002; - fdip_sum_phi[m][7] = tuv110; - fdip_sum_phi[m][8] = tuv101; - fdip_sum_phi[m][9] = tuv011; - fdip_sum_phi[m][10] = tuv300; - fdip_sum_phi[m][11] = tuv030; - fdip_sum_phi[m][12] = tuv003; - fdip_sum_phi[m][13] = tuv210; - fdip_sum_phi[m][14] = tuv201; - fdip_sum_phi[m][15] = tuv120; - fdip_sum_phi[m][16] = tuv021; - fdip_sum_phi[m][17] = tuv102; - fdip_sum_phi[m][18] = tuv012; - fdip_sum_phi[m][19] = tuv111; */ + int idx = 10*m; + fdip_phi1[idx+0] = (numtyp)0.0; + fdip_phi1[idx+1] = tuv100_1; + fdip_phi1[idx+2] = tuv010_1; + fdip_phi1[idx+3] = tuv001_1; + fdip_phi1[idx+4] = tuv200_1; + fdip_phi1[idx+5] = tuv020_1; + fdip_phi1[idx+6] = tuv002_1; + fdip_phi1[idx+7] = tuv110_1; + fdip_phi1[idx+8] = tuv101_1; + fdip_phi1[idx+9] = tuv011_1; + + fdip_phi2[idx+0] = (numtyp)0.0; + fdip_phi2[idx+1] = tuv100_2; + fdip_phi2[idx+2] = tuv010_2; + fdip_phi2[idx+3] = tuv001_2; + fdip_phi2[idx+4] = tuv200_2; + fdip_phi2[idx+5] = tuv020_2; + fdip_phi2[idx+6] = tuv002_2; + fdip_phi2[idx+7] = tuv110_2; + fdip_phi2[idx+8] = tuv101_2; + fdip_phi2[idx+9] = tuv011_2; + + idx = 20*m; + fdip_sum_phi[idx+0] = tuv000; + fdip_sum_phi[idx+1] = tuv100; + fdip_sum_phi[idx+2] = tuv010; + fdip_sum_phi[idx+3] = tuv001; + fdip_sum_phi[idx+4] = tuv200; + fdip_sum_phi[idx+5] = tuv020; + fdip_sum_phi[idx+6] = tuv002; + fdip_sum_phi[idx+7] = tuv110; + fdip_sum_phi[idx+8] = tuv101; + fdip_sum_phi[idx+9] = tuv011; + fdip_sum_phi[idx+10] = tuv300; + fdip_sum_phi[idx+11] = tuv030; + fdip_sum_phi[idx+12] = tuv003; + fdip_sum_phi[idx+13] = tuv210; + fdip_sum_phi[idx+14] = tuv201; + fdip_sum_phi[idx+15] = tuv120; + fdip_sum_phi[idx+16] = tuv021; + fdip_sum_phi[idx+17] = tuv102; + fdip_sum_phi[idx+18] = tuv012; + fdip_sum_phi[idx+19] = tuv111; } } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c18b10675b..cd5a9abf81 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); + k_fphi_uind.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() { _thetai1.clear(); _thetai2.clear(); _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _cgrid_brick.clear(); + _fdip_sum_phi.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** host_igrid) { + double **host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { _bsorder = bsorder; + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size); + _igrid.resize(_max_thetai_size*3); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); } } - memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int)); - _thetai1.update_device(inum_full*bsorder*4,true); - _thetai2.update_device(inum_full*bsorder*4,true); - _thetai3.update_device(inum_full*bsorder*4,true); - _igrid.update_device(inum_full,true); + UCL_H_Vec dview; + + // copy from host to device + + dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai1,dview,false); + dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai2,dview,false); + dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + ucl_copy(_igrid,dview_int,false); + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + } // --------------------------------------------------------------------------- @@ -593,18 +635,35 @@ template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_cgrid_brick, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi) + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { - // once allocation and transfers - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + // allocation/resize and transfers (do this right after udirect?) - // resize grid if needed, then copy from host to device - // cgrid_brick.alloc()/resize() - // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0]; - // + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, + nxlo_out, nxhi_out); - const int red_bllocks = fphi_uind(); + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + + UCL_H_Vec dview; + dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + + const int red_blocks = fphi_uind(); } @@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); + k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f333bdf9a6..8503e6fba4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,10 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid); + double **host_thetai3, int** igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -179,8 +182,10 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_grid, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi); + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -249,9 +254,12 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_Vector _thetai1, _thetai2, _thetai3; - UCL_Vector _igrid; + UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _igrid; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -272,7 +280,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,7 +313,6 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind() = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - #if !defined(USE_OPENCL) && !defined(USE_HIP)