Working on the fphi_uind kernel and array allocations

This commit is contained in:
Trung Nguyen
2022-08-30 23:40:04 -05:00
parent c5c3c697df
commit aac264f2e2
5 changed files with 283 additions and 148 deletions

View File

@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
# host code compilation
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
#ifdef CUDPP_OPT

View File

@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() {
const int BX=this->block_size();
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
/*
this->time_pair.start();
this->time_pair.start();
int ngridyz = this->_ngridy * this->_ngridz;
this->k_fphi_uind.set_size(GX,BX);
this->k_fphi_uind.run();
this->k_fphi_uind.run(&this->atom->x, &this->_thetai1,
&this->_thetai2, &this->_thetai3,
&this->_igrid, &this->_cgrid_brick,
&this->_fdip_phi1, &this->_fdip_phi2,
&this->_fdip_sum_phi, &this->_bsorder,
&ainum, &ngridyz, &this->_ngridy,
&this->_threads_per_atom);
this->time_pair.stop();
*/
return GX;
}

View File

@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
------------------------------------------------------------------------- */
__kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict thetai1,
const __global numtyp4 *restrict thetai2,
const __global numtyp4 *restrict thetai3,
const __global int4 *restrict igrid,
const __global numtyp4 *restrict grid,
__global numtyp4 *restrict fdip_phi1,
__global numtyp4 *restrict fdip_phi2,
__global numtyp4 *restrict fdip_sum_phi,
const __global numtyp *restrict thetai1,
const __global numtyp *restrict thetai2,
const __global numtyp *restrict thetai3,
const __global int *restrict igrid,
const __global numtyp *restrict grid,
__global numtyp *restrict fdip_phi1,
__global numtyp *restrict fdip_phi2,
__global numtyp *restrict fdip_sum_phi,
const int bsorder, const int inum,
const int nyzgrid, const int nygrid,
const int t_per_atom)
{
int tid, ii, offset, i, n_stride;
@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
// extract the permanent multipole field at each site
tuv100_1 = 0.0;
tuv010_1 = 0.0;
tuv001_1 = 0.0;
tuv200_1 = 0.0;
tuv020_1 = 0.0;
tuv002_1 = 0.0;
tuv110_1 = 0.0;
tuv101_1 = 0.0;
tuv011_1 = 0.0;
tuv100_2 = 0.0;
tuv010_2 = 0.0;
tuv001_2 = 0.0;
tuv200_2 = 0.0;
tuv020_2 = 0.0;
tuv002_2 = 0.0;
tuv110_2 = 0.0;
tuv101_2 = 0.0;
tuv011_2 = 0.0;
tuv000 = 0.0;
tuv001 = 0.0;
tuv010 = 0.0;
tuv100 = 0.0;
tuv200 = 0.0;
tuv020 = 0.0;
tuv002 = 0.0;
tuv110 = 0.0;
tuv101 = 0.0;
tuv011 = 0.0;
tuv300 = 0.0;
tuv030 = 0.0;
tuv003 = 0.0;
tuv210 = 0.0;
tuv201 = 0.0;
tuv120 = 0.0;
tuv021 = 0.0;
tuv102 = 0.0;
tuv012 = 0.0;
tuv111 = 0.0;
tuv100_1 = (numtyp)0.0;
tuv010_1 = (numtyp)0.0;
tuv001_1 = (numtyp)0.0;
tuv200_1 = (numtyp)0.0;
tuv020_1 = (numtyp)0.0;
tuv002_1 = (numtyp)0.0;
tuv110_1 = (numtyp)0.0;
tuv101_1 = (numtyp)0.0;
tuv011_1 = (numtyp)0.0;
tuv100_2 = (numtyp)0.0;
tuv010_2 = (numtyp)0.0;
tuv001_2 = (numtyp)0.0;
tuv200_2 = (numtyp)0.0;
tuv020_2 = (numtyp)0.0;
tuv002_2 = (numtyp)0.0;
tuv110_2 = (numtyp)0.0;
tuv101_2 = (numtyp)0.0;
tuv011_2 = (numtyp)0.0;
tuv000 = (numtyp)0.0;
tuv001 = (numtyp)0.0;
tuv010 = (numtyp)0.0;
tuv100 = (numtyp)0.0;
tuv200 = (numtyp)0.0;
tuv020 = (numtyp)0.0;
tuv002 = (numtyp)0.0;
tuv110 = (numtyp)0.0;
tuv101 = (numtyp)0.0;
tuv011 = (numtyp)0.0;
tuv300 = (numtyp)0.0;
tuv030 = (numtyp)0.0;
tuv003 = (numtyp)0.0;
tuv210 = (numtyp)0.0;
tuv201 = (numtyp)0.0;
tuv120 = (numtyp)0.0;
tuv021 = (numtyp)0.0;
tuv102 = (numtyp)0.0;
tuv012 = (numtyp)0.0;
tuv111 = (numtyp)0.0;
k = igrid[i].z - nlpts;
k = igrid[3*i+2] - nlpts;
for (int kb = 0; kb < bsorder; kb++) {
/*
v0 = thetai3[m][kb][0];
@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
v2 = thetai3[m][kb][2];
v3 = thetai3[m][kb][3];
*/
tu00_1 = 0.0;
tu01_1 = 0.0;
tu10_1 = 0.0;
tu20_1 = 0.0;
tu11_1 = 0.0;
tu02_1 = 0.0;
tu00_2 = 0.0;
tu01_2 = 0.0;
tu10_2 = 0.0;
tu20_2 = 0.0;
tu11_2 = 0.0;
tu02_2 = 0.0;
tu00 = 0.0;
tu10 = 0.0;
tu01 = 0.0;
tu20 = 0.0;
tu11 = 0.0;
tu02 = 0.0;
tu30 = 0.0;
tu21 = 0.0;
tu12 = 0.0;
tu03 = 0.0;
int i3 = m*4*bsorder + 4*kb;
v0 = thetai3[i3];
v1 = thetai3[i3]+1;
v2 = thetai3[i3+2];
v3 = thetai3[i3+3];
tu00_1 = (numtyp)0.0;
tu01_1 = (numtyp)0.0;
tu10_1 = (numtyp)0.0;
tu20_1 = (numtyp)0.0;
tu11_1 = (numtyp)0.0;
tu02_1 = (numtyp)0.0;
tu00_2 = (numtyp)0.0;
tu01_2 = (numtyp)0.0;
tu10_2 = (numtyp)0.0;
tu20_2 = (numtyp)0.0;
tu11_2 = (numtyp)0.0;
tu02_2 = (numtyp)0.0;
tu00 = (numtyp)0.0;
tu10 = (numtyp)0.0;
tu01 = (numtyp)0.0;
tu20 = (numtyp)0.0;
tu11 = (numtyp)0.0;
tu02 = (numtyp)0.0;
tu30 = (numtyp)0.0;
tu21 = (numtyp)0.0;
tu12 = (numtyp)0.0;
tu03 = (numtyp)0.0;
j = igrid[i].y - nlpts;
j = igrid[3*i+1] - nlpts;
for (int jb = 0; jb < bsorder; jb++) {
/*
u0 = thetai2[m][jb][0];
@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
u2 = thetai2[m][jb][2];
u3 = thetai2[m][jb][3];
*/
t0_1 = 0.0;
t1_1 = 0.0;
t2_1 = 0.0;
t0_2 = 0.0;
t1_2 = 0.0;
t2_2 = 0.0;
t3 = 0.0;
int i2 = m*4*bsorder+4*jb;
u0 = thetai2[i2];
u1 = thetai2[i2+1];
u2 = thetai2[i2+2];
u3 = thetai2[i2+3];
t0_1 = (numtyp)0.0;
t1_1 = (numtyp)0.0;
t2_1 = (numtyp)0.0;
t0_2 = (numtyp)0.0;
t1_2 = (numtyp)0.0;
t2_2 = (numtyp)0.0;
t3 = (numtyp)0.0;
i = igrid[m].x - nlpts;
int ii = igrid[3*i] - nlpts;
for (int ib = 0; ib < bsorder; ib++) {
/*
tq_1 = grid[k][j][i][0];
tq_2 = grid[k][j][i][1];
tq_1 = grid[k][j][ii][0];
tq_2 = grid[k][j][ii][1];
t0_1 += tq_1*thetai1[m][ib][0];
t1_1 += tq_1*thetai1[m][ib][1];
t2_1 += tq_1*thetai1[m][ib][2];
@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
t2_2 += tq_2*thetai1[m][ib][2];
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
*/
i++;
int i1 = m*4*bsorder+4*ib;
numtyp w0 = thetai1[i1];
numtyp w1 = thetai1[i1+1];
numtyp w2 = thetai1[i1+2];
numtyp w3 = thetai1[i1+3];
int gidx = 2*(k*nyzgrid + j*nygrid + ii);
tq_1 = grid[gidx];
tq_2 = grid[gidx+1];
t0_1 += tq_1*w0;
t1_1 += tq_1*w1;
t2_1 += tq_1*w2;
t0_2 += tq_2*w0;
t1_2 += tq_2*w1;
t2_2 += tq_2*w2;
t3 += (tq_1+tq_2)*w3;
ii++;
}
tu00_1 += t0_1*u0;
@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
tuv111 += tu11*v1;
k++;
}
/*
fdip_phi1[m][0] = 0.0;
fdip_phi1[m][1] = tuv100_1;
@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
fdip_phi1[m][7] = tuv110_1;
fdip_phi1[m][8] = tuv101_1;
fdip_phi1[m][9] = tuv011_1;
fdip_phi2[m][0] = 0.0;
fdip_phi2[m][1] = tuv100_2;
fdip_phi2[m][2] = tuv010_2;
fdip_phi2[m][3] = tuv001_2;
fdip_phi2[m][4] = tuv200_2;
fdip_phi2[m][5] = tuv020_2;
fdip_phi2[m][6] = tuv002_2;
fdip_phi2[m][7] = tuv110_2;
fdip_phi2[m][8] = tuv101_2;
fdip_phi2[m][9] = tuv011_2;
fdip_sum_phi[m][0] = tuv000;
fdip_sum_phi[m][1] = tuv100;
fdip_sum_phi[m][2] = tuv010;
fdip_sum_phi[m][3] = tuv001;
fdip_sum_phi[m][4] = tuv200;
fdip_sum_phi[m][5] = tuv020;
fdip_sum_phi[m][6] = tuv002;
fdip_sum_phi[m][7] = tuv110;
fdip_sum_phi[m][8] = tuv101;
fdip_sum_phi[m][9] = tuv011;
fdip_sum_phi[m][10] = tuv300;
fdip_sum_phi[m][11] = tuv030;
fdip_sum_phi[m][12] = tuv003;
fdip_sum_phi[m][13] = tuv210;
fdip_sum_phi[m][14] = tuv201;
fdip_sum_phi[m][15] = tuv120;
fdip_sum_phi[m][16] = tuv021;
fdip_sum_phi[m][17] = tuv102;
fdip_sum_phi[m][18] = tuv012;
fdip_sum_phi[m][19] = tuv111;
*/
int idx = 10*m;
fdip_phi1[idx+0] = (numtyp)0.0;
fdip_phi1[idx+1] = tuv100_1;
fdip_phi1[idx+2] = tuv010_1;
fdip_phi1[idx+3] = tuv001_1;
fdip_phi1[idx+4] = tuv200_1;
fdip_phi1[idx+5] = tuv020_1;
fdip_phi1[idx+6] = tuv002_1;
fdip_phi1[idx+7] = tuv110_1;
fdip_phi1[idx+8] = tuv101_1;
fdip_phi1[idx+9] = tuv011_1;
fdip_phi2[idx+0] = (numtyp)0.0;
fdip_phi2[idx+1] = tuv100_2;
fdip_phi2[idx+2] = tuv010_2;
fdip_phi2[idx+3] = tuv001_2;
fdip_phi2[idx+4] = tuv200_2;
fdip_phi2[idx+5] = tuv020_2;
fdip_phi2[idx+6] = tuv002_2;
fdip_phi2[idx+7] = tuv110_2;
fdip_phi2[idx+8] = tuv101_2;
fdip_phi2[idx+9] = tuv011_2;
idx = 20*m;
fdip_sum_phi[idx+0] = tuv000;
fdip_sum_phi[idx+1] = tuv100;
fdip_sum_phi[idx+2] = tuv010;
fdip_sum_phi[idx+3] = tuv001;
fdip_sum_phi[idx+4] = tuv200;
fdip_sum_phi[idx+5] = tuv020;
fdip_sum_phi[idx+6] = tuv002;
fdip_sum_phi[idx+7] = tuv110;
fdip_sum_phi[idx+8] = tuv101;
fdip_sum_phi[idx+9] = tuv011;
fdip_sum_phi[idx+10] = tuv300;
fdip_sum_phi[idx+11] = tuv030;
fdip_sum_phi[idx+12] = tuv003;
fdip_sum_phi[idx+13] = tuv210;
fdip_sum_phi[idx+14] = tuv201;
fdip_sum_phi[idx+15] = tuv120;
fdip_sum_phi[idx+16] = tuv021;
fdip_sum_phi[idx+17] = tuv102;
fdip_sum_phi[idx+18] = tuv012;
fdip_sum_phi[idx+19] = tuv111;
}
}

View File

@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
k_multipole.clear();
k_udirect2b.clear();
k_umutual2b.clear();
k_fphi_uind.clear();
k_polar.clear();
k_special15.clear();
k_short_nbor.clear();
@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() {
_thetai1.clear();
_thetai2.clear();
_thetai3.clear();
_igrid.clear();
_fdip_phi1.clear();
_fdip_phi2.clear();
_cgrid_brick.clear();
_fdip_sum_phi.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, int** host_igrid) {
double **host_thetai3, int** host_igrid,
double* grid_brick_start, int nzlo_out,
int nzhi_out, int nylo_out, int nyhi_out,
int nxlo_out, int nxhi_out) {
_bsorder = bsorder;
// allocate or resize per-atom arrays
// _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
// will be consolidated once all terms are ready
if (_max_thetai_size == 0) {
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
_igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY);
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
} else {
if (inum_full>_max_thetai_size) {
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.resize(_max_thetai_size*bsorder*4);
_thetai2.resize(_max_thetai_size*bsorder*4);
_thetai3.resize(_max_thetai_size*bsorder*4);
_igrid.resize(_max_thetai_size);
_igrid.resize(_max_thetai_size*3);
_fdip_phi1.resize(_max_thetai_size*10);
_fdip_phi2.resize(_max_thetai_size*10);
_fdip_sum_phi.resize(_max_thetai_size*20);
}
}
memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp));
memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp));
memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp));
memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int));
_thetai1.update_device(inum_full*bsorder*4,true);
_thetai2.update_device(inum_full*bsorder*4,true);
_thetai3.update_device(inum_full*bsorder*4,true);
_igrid.update_device(inum_full,true);
UCL_H_Vec<double> dview;
// copy from host to device
dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
ucl_copy(_thetai1,dview,false);
dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
ucl_copy(_thetai2,dview,false);
dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
ucl_copy(_thetai3,dview,false);
UCL_H_Vec<int> dview_int;
dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device));
ucl_copy(_igrid,dview_int,false);
_nzlo_out = nzlo_out;
_nzhi_out = nzhi_out;
_nylo_out = nylo_out;
_nyhi_out = nyhi_out;
_nxlo_out = nxlo_out;
_nxhi_out = nxhi_out;
_ngridz = nzhi_out - nzlo_out + 1;
_ngridy = nyhi_out - nylo_out + 1;
_ngridx = nxhi_out - nxlo_out + 1;
_num_grid_points = _ngridx*_ngridy*_ngridz*2;
dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
ucl_copy(_cgrid_brick,dview,false);
}
// ---------------------------------------------------------------------------
@ -593,18 +635,35 @@ template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, int** igrid,
double ****host_cgrid_brick, double **host_fdip_phi1,
double **host_fdip_phi2, double **host_fdip_sum_phi)
double *host_grid_brick_start, double **host_fdip_phi1,
double **host_fdip_phi2, double **host_fdip_sum_phi,
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
int nxlo_out, int nxhi_out)
{
// once allocation and transfers
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid);
// allocation/resize and transfers (do this right after udirect?)
// resize grid if needed, then copy from host to device
// cgrid_brick.alloc()/resize()
// cgrid_brick.begin() = host_cgrid_brick[0][0][0][0];
//
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
nxlo_out, nxhi_out);
const int red_bllocks = fphi_uind();
// update the cgrid_brick with data host
_nzlo_out = nzlo_out;
_nzhi_out = nzhi_out;
_nylo_out = nylo_out;
_nyhi_out = nyhi_out;
_nxlo_out = nxlo_out;
_nxhi_out = nxhi_out;
_ngridz = nzhi_out - nzlo_out + 1;
_ngridy = nyhi_out - nylo_out + 1;
_ngridx = nxhi_out - nxlo_out + 1;
_num_grid_points = _ngridx*_ngridy*_ngridz*2;
UCL_H_Vec<double> dview;
dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device));
ucl_copy(_cgrid_brick,dview,false);
const int red_blocks = fphi_uind();
}
@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
k_udirect2b.set_function(*pair_program,kname_udirect2b);
k_umutual2b.set_function(*pair_program,kname_umutual2b);
k_polar.set_function(*pair_program,kname_polar);
k_fphi_uind.set_function(*pair_program,"kname_fphi_uind");
k_short_nbor.set_function(*pair_program,kname_short_nbor);
k_special15.set_function(*pair_program,kname_special15);
pos_tex.get_texture(*pair_program,"pos_tex");

View File

@ -152,7 +152,10 @@ class BaseAmoeba {
virtual void precompute_induce(const int inum_full, const int bsorder,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, int** igrid);
double **host_thetai3, int** igrid,
double* grid_brick_start, int nzlo_out,
int nzhi_out, int nylo_out, int nyhi_out,
int nxlo_out, int nxhi_out);
/// Compute multipole real-space with device neighboring
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
@ -179,8 +182,10 @@ class BaseAmoeba {
virtual void compute_fphi_uind(const int inum_full, const int bsorder,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, int** igrid,
double ****host_grid, double **host_fdip_phi1,
double **host_fdip_phi2, double **host_fdip_sum_phi);
double *host_grid_brick_start, double **host_fdip_phi1,
double **host_fdip_phi2, double **host_fdip_sum_phi,
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
int nxlo_out, int nxhi_out);
/// Compute polar real-space with device neighboring
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@ -249,9 +254,12 @@ class BaseAmoeba {
int _nmax, _max_tep_size, _max_fieldp_size;
int _bsorder;
UCL_Vector<numtyp,numtyp> _thetai1, _thetai2, _thetai3;
UCL_Vector<int4,int4> _igrid;
UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3, _cgrid_brick;
UCL_D_Vec<int> _igrid;
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
int _max_thetai_size;
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
int _ngridx, _ngridy, _ngridz, _num_grid_points;
// ------------------------ FORCE/ENERGY DATA -----------------------
@ -272,7 +280,7 @@ class BaseAmoeba {
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind;
UCL_Kernel k_special15, k_short_nbor;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {}
@ -305,7 +313,6 @@ class BaseAmoeba {
virtual int umutual2b(const int eflag, const int vflag) = 0;
virtual int fphi_uind() = 0;
virtual int polar_real(const int eflag, const int vflag) = 0;
#if !defined(USE_OPENCL) && !defined(USE_HIP)