Working on the fphi_uind kernel and array allocations
This commit is contained in:
@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
|
||||
|
||||
# host code compilation
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
|
||||
$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
|
||||
$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
|
||||
|
||||
#ifdef CUDPP_OPT
|
||||
|
||||
@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() {
|
||||
const int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
/*
|
||||
this->time_pair.start();
|
||||
|
||||
this->time_pair.start();
|
||||
int ngridyz = this->_ngridy * this->_ngridz;
|
||||
this->k_fphi_uind.set_size(GX,BX);
|
||||
this->k_fphi_uind.run();
|
||||
this->k_fphi_uind.run(&this->atom->x, &this->_thetai1,
|
||||
&this->_thetai2, &this->_thetai3,
|
||||
&this->_igrid, &this->_cgrid_brick,
|
||||
&this->_fdip_phi1, &this->_fdip_phi2,
|
||||
&this->_fdip_sum_phi, &this->_bsorder,
|
||||
&ainum, &ngridyz, &this->_ngridy,
|
||||
&this->_threads_per_atom);
|
||||
this->time_pair.stop();
|
||||
*/
|
||||
|
||||
return GX;
|
||||
}
|
||||
|
||||
@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
__kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict thetai1,
|
||||
const __global numtyp4 *restrict thetai2,
|
||||
const __global numtyp4 *restrict thetai3,
|
||||
const __global int4 *restrict igrid,
|
||||
const __global numtyp4 *restrict grid,
|
||||
__global numtyp4 *restrict fdip_phi1,
|
||||
__global numtyp4 *restrict fdip_phi2,
|
||||
__global numtyp4 *restrict fdip_sum_phi,
|
||||
const __global numtyp *restrict thetai1,
|
||||
const __global numtyp *restrict thetai2,
|
||||
const __global numtyp *restrict thetai3,
|
||||
const __global int *restrict igrid,
|
||||
const __global numtyp *restrict grid,
|
||||
__global numtyp *restrict fdip_phi1,
|
||||
__global numtyp *restrict fdip_phi2,
|
||||
__global numtyp *restrict fdip_sum_phi,
|
||||
const int bsorder, const int inum,
|
||||
const int nyzgrid, const int nygrid,
|
||||
const int t_per_atom)
|
||||
{
|
||||
int tid, ii, offset, i, n_stride;
|
||||
@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
|
||||
// extract the permanent multipole field at each site
|
||||
|
||||
tuv100_1 = 0.0;
|
||||
tuv010_1 = 0.0;
|
||||
tuv001_1 = 0.0;
|
||||
tuv200_1 = 0.0;
|
||||
tuv020_1 = 0.0;
|
||||
tuv002_1 = 0.0;
|
||||
tuv110_1 = 0.0;
|
||||
tuv101_1 = 0.0;
|
||||
tuv011_1 = 0.0;
|
||||
tuv100_2 = 0.0;
|
||||
tuv010_2 = 0.0;
|
||||
tuv001_2 = 0.0;
|
||||
tuv200_2 = 0.0;
|
||||
tuv020_2 = 0.0;
|
||||
tuv002_2 = 0.0;
|
||||
tuv110_2 = 0.0;
|
||||
tuv101_2 = 0.0;
|
||||
tuv011_2 = 0.0;
|
||||
tuv000 = 0.0;
|
||||
tuv001 = 0.0;
|
||||
tuv010 = 0.0;
|
||||
tuv100 = 0.0;
|
||||
tuv200 = 0.0;
|
||||
tuv020 = 0.0;
|
||||
tuv002 = 0.0;
|
||||
tuv110 = 0.0;
|
||||
tuv101 = 0.0;
|
||||
tuv011 = 0.0;
|
||||
tuv300 = 0.0;
|
||||
tuv030 = 0.0;
|
||||
tuv003 = 0.0;
|
||||
tuv210 = 0.0;
|
||||
tuv201 = 0.0;
|
||||
tuv120 = 0.0;
|
||||
tuv021 = 0.0;
|
||||
tuv102 = 0.0;
|
||||
tuv012 = 0.0;
|
||||
tuv111 = 0.0;
|
||||
tuv100_1 = (numtyp)0.0;
|
||||
tuv010_1 = (numtyp)0.0;
|
||||
tuv001_1 = (numtyp)0.0;
|
||||
tuv200_1 = (numtyp)0.0;
|
||||
tuv020_1 = (numtyp)0.0;
|
||||
tuv002_1 = (numtyp)0.0;
|
||||
tuv110_1 = (numtyp)0.0;
|
||||
tuv101_1 = (numtyp)0.0;
|
||||
tuv011_1 = (numtyp)0.0;
|
||||
tuv100_2 = (numtyp)0.0;
|
||||
tuv010_2 = (numtyp)0.0;
|
||||
tuv001_2 = (numtyp)0.0;
|
||||
tuv200_2 = (numtyp)0.0;
|
||||
tuv020_2 = (numtyp)0.0;
|
||||
tuv002_2 = (numtyp)0.0;
|
||||
tuv110_2 = (numtyp)0.0;
|
||||
tuv101_2 = (numtyp)0.0;
|
||||
tuv011_2 = (numtyp)0.0;
|
||||
tuv000 = (numtyp)0.0;
|
||||
tuv001 = (numtyp)0.0;
|
||||
tuv010 = (numtyp)0.0;
|
||||
tuv100 = (numtyp)0.0;
|
||||
tuv200 = (numtyp)0.0;
|
||||
tuv020 = (numtyp)0.0;
|
||||
tuv002 = (numtyp)0.0;
|
||||
tuv110 = (numtyp)0.0;
|
||||
tuv101 = (numtyp)0.0;
|
||||
tuv011 = (numtyp)0.0;
|
||||
tuv300 = (numtyp)0.0;
|
||||
tuv030 = (numtyp)0.0;
|
||||
tuv003 = (numtyp)0.0;
|
||||
tuv210 = (numtyp)0.0;
|
||||
tuv201 = (numtyp)0.0;
|
||||
tuv120 = (numtyp)0.0;
|
||||
tuv021 = (numtyp)0.0;
|
||||
tuv102 = (numtyp)0.0;
|
||||
tuv012 = (numtyp)0.0;
|
||||
tuv111 = (numtyp)0.0;
|
||||
|
||||
k = igrid[i].z - nlpts;
|
||||
k = igrid[3*i+2] - nlpts;
|
||||
for (int kb = 0; kb < bsorder; kb++) {
|
||||
/*
|
||||
v0 = thetai3[m][kb][0];
|
||||
@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
v2 = thetai3[m][kb][2];
|
||||
v3 = thetai3[m][kb][3];
|
||||
*/
|
||||
tu00_1 = 0.0;
|
||||
tu01_1 = 0.0;
|
||||
tu10_1 = 0.0;
|
||||
tu20_1 = 0.0;
|
||||
tu11_1 = 0.0;
|
||||
tu02_1 = 0.0;
|
||||
tu00_2 = 0.0;
|
||||
tu01_2 = 0.0;
|
||||
tu10_2 = 0.0;
|
||||
tu20_2 = 0.0;
|
||||
tu11_2 = 0.0;
|
||||
tu02_2 = 0.0;
|
||||
tu00 = 0.0;
|
||||
tu10 = 0.0;
|
||||
tu01 = 0.0;
|
||||
tu20 = 0.0;
|
||||
tu11 = 0.0;
|
||||
tu02 = 0.0;
|
||||
tu30 = 0.0;
|
||||
tu21 = 0.0;
|
||||
tu12 = 0.0;
|
||||
tu03 = 0.0;
|
||||
int i3 = m*4*bsorder + 4*kb;
|
||||
v0 = thetai3[i3];
|
||||
v1 = thetai3[i3]+1;
|
||||
v2 = thetai3[i3+2];
|
||||
v3 = thetai3[i3+3];
|
||||
tu00_1 = (numtyp)0.0;
|
||||
tu01_1 = (numtyp)0.0;
|
||||
tu10_1 = (numtyp)0.0;
|
||||
tu20_1 = (numtyp)0.0;
|
||||
tu11_1 = (numtyp)0.0;
|
||||
tu02_1 = (numtyp)0.0;
|
||||
tu00_2 = (numtyp)0.0;
|
||||
tu01_2 = (numtyp)0.0;
|
||||
tu10_2 = (numtyp)0.0;
|
||||
tu20_2 = (numtyp)0.0;
|
||||
tu11_2 = (numtyp)0.0;
|
||||
tu02_2 = (numtyp)0.0;
|
||||
tu00 = (numtyp)0.0;
|
||||
tu10 = (numtyp)0.0;
|
||||
tu01 = (numtyp)0.0;
|
||||
tu20 = (numtyp)0.0;
|
||||
tu11 = (numtyp)0.0;
|
||||
tu02 = (numtyp)0.0;
|
||||
tu30 = (numtyp)0.0;
|
||||
tu21 = (numtyp)0.0;
|
||||
tu12 = (numtyp)0.0;
|
||||
tu03 = (numtyp)0.0;
|
||||
|
||||
j = igrid[i].y - nlpts;
|
||||
j = igrid[3*i+1] - nlpts;
|
||||
for (int jb = 0; jb < bsorder; jb++) {
|
||||
/*
|
||||
u0 = thetai2[m][jb][0];
|
||||
@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
u2 = thetai2[m][jb][2];
|
||||
u3 = thetai2[m][jb][3];
|
||||
*/
|
||||
t0_1 = 0.0;
|
||||
t1_1 = 0.0;
|
||||
t2_1 = 0.0;
|
||||
t0_2 = 0.0;
|
||||
t1_2 = 0.0;
|
||||
t2_2 = 0.0;
|
||||
t3 = 0.0;
|
||||
int i2 = m*4*bsorder+4*jb;
|
||||
u0 = thetai2[i2];
|
||||
u1 = thetai2[i2+1];
|
||||
u2 = thetai2[i2+2];
|
||||
u3 = thetai2[i2+3];
|
||||
t0_1 = (numtyp)0.0;
|
||||
t1_1 = (numtyp)0.0;
|
||||
t2_1 = (numtyp)0.0;
|
||||
t0_2 = (numtyp)0.0;
|
||||
t1_2 = (numtyp)0.0;
|
||||
t2_2 = (numtyp)0.0;
|
||||
t3 = (numtyp)0.0;
|
||||
|
||||
i = igrid[m].x - nlpts;
|
||||
int ii = igrid[3*i] - nlpts;
|
||||
for (int ib = 0; ib < bsorder; ib++) {
|
||||
/*
|
||||
tq_1 = grid[k][j][i][0];
|
||||
tq_2 = grid[k][j][i][1];
|
||||
tq_1 = grid[k][j][ii][0];
|
||||
tq_2 = grid[k][j][ii][1];
|
||||
t0_1 += tq_1*thetai1[m][ib][0];
|
||||
t1_1 += tq_1*thetai1[m][ib][1];
|
||||
t2_1 += tq_1*thetai1[m][ib][2];
|
||||
@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
t2_2 += tq_2*thetai1[m][ib][2];
|
||||
t3 += (tq_1+tq_2)*thetai1[m][ib][3];
|
||||
*/
|
||||
i++;
|
||||
int i1 = m*4*bsorder+4*ib;
|
||||
numtyp w0 = thetai1[i1];
|
||||
numtyp w1 = thetai1[i1+1];
|
||||
numtyp w2 = thetai1[i1+2];
|
||||
numtyp w3 = thetai1[i1+3];
|
||||
int gidx = 2*(k*nyzgrid + j*nygrid + ii);
|
||||
tq_1 = grid[gidx];
|
||||
tq_2 = grid[gidx+1];
|
||||
t0_1 += tq_1*w0;
|
||||
t1_1 += tq_1*w1;
|
||||
t2_1 += tq_1*w2;
|
||||
t0_2 += tq_2*w0;
|
||||
t1_2 += tq_2*w1;
|
||||
t2_2 += tq_2*w2;
|
||||
t3 += (tq_1+tq_2)*w3;
|
||||
ii++;
|
||||
}
|
||||
|
||||
tu00_1 += t0_1*u0;
|
||||
@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
tuv111 += tu11*v1;
|
||||
k++;
|
||||
}
|
||||
|
||||
/*
|
||||
fdip_phi1[m][0] = 0.0;
|
||||
fdip_phi1[m][1] = tuv100_1;
|
||||
@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
|
||||
fdip_phi1[m][7] = tuv110_1;
|
||||
fdip_phi1[m][8] = tuv101_1;
|
||||
fdip_phi1[m][9] = tuv011_1;
|
||||
|
||||
fdip_phi2[m][0] = 0.0;
|
||||
fdip_phi2[m][1] = tuv100_2;
|
||||
fdip_phi2[m][2] = tuv010_2;
|
||||
fdip_phi2[m][3] = tuv001_2;
|
||||
fdip_phi2[m][4] = tuv200_2;
|
||||
fdip_phi2[m][5] = tuv020_2;
|
||||
fdip_phi2[m][6] = tuv002_2;
|
||||
fdip_phi2[m][7] = tuv110_2;
|
||||
fdip_phi2[m][8] = tuv101_2;
|
||||
fdip_phi2[m][9] = tuv011_2;
|
||||
|
||||
fdip_sum_phi[m][0] = tuv000;
|
||||
fdip_sum_phi[m][1] = tuv100;
|
||||
fdip_sum_phi[m][2] = tuv010;
|
||||
fdip_sum_phi[m][3] = tuv001;
|
||||
fdip_sum_phi[m][4] = tuv200;
|
||||
fdip_sum_phi[m][5] = tuv020;
|
||||
fdip_sum_phi[m][6] = tuv002;
|
||||
fdip_sum_phi[m][7] = tuv110;
|
||||
fdip_sum_phi[m][8] = tuv101;
|
||||
fdip_sum_phi[m][9] = tuv011;
|
||||
fdip_sum_phi[m][10] = tuv300;
|
||||
fdip_sum_phi[m][11] = tuv030;
|
||||
fdip_sum_phi[m][12] = tuv003;
|
||||
fdip_sum_phi[m][13] = tuv210;
|
||||
fdip_sum_phi[m][14] = tuv201;
|
||||
fdip_sum_phi[m][15] = tuv120;
|
||||
fdip_sum_phi[m][16] = tuv021;
|
||||
fdip_sum_phi[m][17] = tuv102;
|
||||
fdip_sum_phi[m][18] = tuv012;
|
||||
fdip_sum_phi[m][19] = tuv111;
|
||||
*/
|
||||
int idx = 10*m;
|
||||
fdip_phi1[idx+0] = (numtyp)0.0;
|
||||
fdip_phi1[idx+1] = tuv100_1;
|
||||
fdip_phi1[idx+2] = tuv010_1;
|
||||
fdip_phi1[idx+3] = tuv001_1;
|
||||
fdip_phi1[idx+4] = tuv200_1;
|
||||
fdip_phi1[idx+5] = tuv020_1;
|
||||
fdip_phi1[idx+6] = tuv002_1;
|
||||
fdip_phi1[idx+7] = tuv110_1;
|
||||
fdip_phi1[idx+8] = tuv101_1;
|
||||
fdip_phi1[idx+9] = tuv011_1;
|
||||
|
||||
fdip_phi2[idx+0] = (numtyp)0.0;
|
||||
fdip_phi2[idx+1] = tuv100_2;
|
||||
fdip_phi2[idx+2] = tuv010_2;
|
||||
fdip_phi2[idx+3] = tuv001_2;
|
||||
fdip_phi2[idx+4] = tuv200_2;
|
||||
fdip_phi2[idx+5] = tuv020_2;
|
||||
fdip_phi2[idx+6] = tuv002_2;
|
||||
fdip_phi2[idx+7] = tuv110_2;
|
||||
fdip_phi2[idx+8] = tuv101_2;
|
||||
fdip_phi2[idx+9] = tuv011_2;
|
||||
|
||||
idx = 20*m;
|
||||
fdip_sum_phi[idx+0] = tuv000;
|
||||
fdip_sum_phi[idx+1] = tuv100;
|
||||
fdip_sum_phi[idx+2] = tuv010;
|
||||
fdip_sum_phi[idx+3] = tuv001;
|
||||
fdip_sum_phi[idx+4] = tuv200;
|
||||
fdip_sum_phi[idx+5] = tuv020;
|
||||
fdip_sum_phi[idx+6] = tuv002;
|
||||
fdip_sum_phi[idx+7] = tuv110;
|
||||
fdip_sum_phi[idx+8] = tuv101;
|
||||
fdip_sum_phi[idx+9] = tuv011;
|
||||
fdip_sum_phi[idx+10] = tuv300;
|
||||
fdip_sum_phi[idx+11] = tuv030;
|
||||
fdip_sum_phi[idx+12] = tuv003;
|
||||
fdip_sum_phi[idx+13] = tuv210;
|
||||
fdip_sum_phi[idx+14] = tuv201;
|
||||
fdip_sum_phi[idx+15] = tuv120;
|
||||
fdip_sum_phi[idx+16] = tuv021;
|
||||
fdip_sum_phi[idx+17] = tuv102;
|
||||
fdip_sum_phi[idx+18] = tuv012;
|
||||
fdip_sum_phi[idx+19] = tuv111;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
|
||||
k_multipole.clear();
|
||||
k_udirect2b.clear();
|
||||
k_umutual2b.clear();
|
||||
k_fphi_uind.clear();
|
||||
k_polar.clear();
|
||||
k_special15.clear();
|
||||
k_short_nbor.clear();
|
||||
@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() {
|
||||
_thetai1.clear();
|
||||
_thetai2.clear();
|
||||
_thetai3.clear();
|
||||
_igrid.clear();
|
||||
_fdip_phi1.clear();
|
||||
_fdip_phi2.clear();
|
||||
_cgrid_brick.clear();
|
||||
_fdip_sum_phi.clear();
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** host_igrid) {
|
||||
double **host_thetai3, int** host_igrid,
|
||||
double* grid_brick_start, int nzlo_out,
|
||||
int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out) {
|
||||
|
||||
_bsorder = bsorder;
|
||||
|
||||
// allocate or resize per-atom arrays
|
||||
// _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
|
||||
// will be consolidated once all terms are ready
|
||||
|
||||
if (_max_thetai_size == 0) {
|
||||
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
_igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
_fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
_fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
|
||||
|
||||
} else {
|
||||
if (inum_full>_max_thetai_size) {
|
||||
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai2.resize(_max_thetai_size*bsorder*4);
|
||||
_thetai3.resize(_max_thetai_size*bsorder*4);
|
||||
_igrid.resize(_max_thetai_size);
|
||||
_igrid.resize(_max_thetai_size*3);
|
||||
|
||||
_fdip_phi1.resize(_max_thetai_size*10);
|
||||
_fdip_phi2.resize(_max_thetai_size*10);
|
||||
_fdip_sum_phi.resize(_max_thetai_size*20);
|
||||
}
|
||||
}
|
||||
|
||||
memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp));
|
||||
memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp));
|
||||
memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp));
|
||||
memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int));
|
||||
_thetai1.update_device(inum_full*bsorder*4,true);
|
||||
_thetai2.update_device(inum_full*bsorder*4,true);
|
||||
_thetai3.update_device(inum_full*bsorder*4,true);
|
||||
_igrid.update_device(inum_full,true);
|
||||
UCL_H_Vec<double> dview;
|
||||
|
||||
// copy from host to device
|
||||
|
||||
dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai1,dview,false);
|
||||
dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai2,dview,false);
|
||||
dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
|
||||
ucl_copy(_thetai3,dview,false);
|
||||
|
||||
UCL_H_Vec<int> dview_int;
|
||||
dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device));
|
||||
ucl_copy(_igrid,dview_int,false);
|
||||
|
||||
_nzlo_out = nzlo_out;
|
||||
_nzhi_out = nzhi_out;
|
||||
_nylo_out = nylo_out;
|
||||
_nyhi_out = nyhi_out;
|
||||
_nxlo_out = nxlo_out;
|
||||
_nxhi_out = nxhi_out;
|
||||
_ngridz = nzhi_out - nzlo_out + 1;
|
||||
_ngridy = nyhi_out - nylo_out + 1;
|
||||
_ngridx = nxhi_out - nxlo_out + 1;
|
||||
_num_grid_points = _ngridx*_ngridy*_ngridz*2;
|
||||
dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
|
||||
ucl_copy(_cgrid_brick,dview,false);
|
||||
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -593,18 +635,35 @@ template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid,
|
||||
double ****host_cgrid_brick, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi)
|
||||
double *host_grid_brick_start, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out)
|
||||
{
|
||||
// once allocation and transfers
|
||||
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid);
|
||||
// allocation/resize and transfers (do this right after udirect?)
|
||||
|
||||
// resize grid if needed, then copy from host to device
|
||||
// cgrid_brick.alloc()/resize()
|
||||
// cgrid_brick.begin() = host_cgrid_brick[0][0][0][0];
|
||||
//
|
||||
precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
|
||||
igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
|
||||
nxlo_out, nxhi_out);
|
||||
|
||||
const int red_bllocks = fphi_uind();
|
||||
// update the cgrid_brick with data host
|
||||
|
||||
_nzlo_out = nzlo_out;
|
||||
_nzhi_out = nzhi_out;
|
||||
_nylo_out = nylo_out;
|
||||
_nyhi_out = nyhi_out;
|
||||
_nxlo_out = nxlo_out;
|
||||
_nxhi_out = nxhi_out;
|
||||
_ngridz = nzhi_out - nzlo_out + 1;
|
||||
_ngridy = nyhi_out - nylo_out + 1;
|
||||
_ngridx = nxhi_out - nxlo_out + 1;
|
||||
_num_grid_points = _ngridx*_ngridy*_ngridz*2;
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device));
|
||||
ucl_copy(_cgrid_brick,dview,false);
|
||||
|
||||
const int red_blocks = fphi_uind();
|
||||
}
|
||||
|
||||
|
||||
@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
k_udirect2b.set_function(*pair_program,kname_udirect2b);
|
||||
k_umutual2b.set_function(*pair_program,kname_umutual2b);
|
||||
k_polar.set_function(*pair_program,kname_polar);
|
||||
k_fphi_uind.set_function(*pair_program,"kname_fphi_uind");
|
||||
k_short_nbor.set_function(*pair_program,kname_short_nbor);
|
||||
k_special15.set_function(*pair_program,kname_special15);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
@ -152,7 +152,10 @@ class BaseAmoeba {
|
||||
|
||||
virtual void precompute_induce(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid);
|
||||
double **host_thetai3, int** igrid,
|
||||
double* grid_brick_start, int nzlo_out,
|
||||
int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out);
|
||||
|
||||
/// Compute multipole real-space with device neighboring
|
||||
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
|
||||
@ -179,8 +182,10 @@ class BaseAmoeba {
|
||||
virtual void compute_fphi_uind(const int inum_full, const int bsorder,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, int** igrid,
|
||||
double ****host_grid, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi);
|
||||
double *host_grid_brick_start, double **host_fdip_phi1,
|
||||
double **host_fdip_phi2, double **host_fdip_sum_phi,
|
||||
int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
|
||||
int nxlo_out, int nxhi_out);
|
||||
|
||||
/// Compute polar real-space with device neighboring
|
||||
virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
|
||||
@ -249,9 +254,12 @@ class BaseAmoeba {
|
||||
int _nmax, _max_tep_size, _max_fieldp_size;
|
||||
|
||||
int _bsorder;
|
||||
UCL_Vector<numtyp,numtyp> _thetai1, _thetai2, _thetai3;
|
||||
UCL_Vector<int4,int4> _igrid;
|
||||
UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3, _cgrid_brick;
|
||||
UCL_D_Vec<int> _igrid;
|
||||
UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
|
||||
int _max_thetai_size;
|
||||
int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
|
||||
int _ngridx, _ngridy, _ngridz, _num_grid_points;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
@ -272,7 +280,7 @@ class BaseAmoeba {
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
|
||||
UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind;
|
||||
UCL_Kernel k_special15, k_short_nbor;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {}
|
||||
@ -305,7 +313,6 @@ class BaseAmoeba {
|
||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||
virtual int fphi_uind() = 0;
|
||||
virtual int polar_real(const int eflag, const int vflag) = 0;
|
||||
|
||||
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
|
||||
Reference in New Issue
Block a user