Separate the computation of newsite into another kernel

This eliminates the need for thread fence and makes the calculation stable on GTX1070 (CUDA and OpenCL) and TitanV
This commit is contained in:
Vsevak
2019-12-07 15:54:20 +03:00
parent 4a51e1660f
commit a3fca53e97
3 changed files with 51 additions and 23 deletions

View File

@ -66,6 +66,7 @@ int LJTIP4PLongT::init(const int ntypes,
return success;
k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");
k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh");
k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite");
TypeH = tH;
TypeO = tO;
@ -163,6 +164,7 @@ void LJTIP4PLongT::clear() {
k_pair_distrib.clear();
k_pair_reneigh.clear();
k_pair_newsite.clear();
this->clear_atomic();
}
@ -195,9 +197,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
int GX;
GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
if (t_ago == 0) {
GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
this->k_pair_reneigh.set_size(GX,BX);
this->k_pair_reneigh.run(&this->atom->x,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
@ -205,6 +206,14 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
&hneight, &m, &TypeO, &TypeH,
&tag, &map_array, &atom_sametag);
}
this->k_pair_newsite.set_size(GX,BX);
this->k_pair_newsite.run(&this->atom->x,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&nall, &ainum,
&nbor_pitch, &this->_threads_per_atom,
&hneight, &m, &TypeO, &TypeH, &alpha,
&this->atom->q, &tag, &map_array,
&atom_sametag);
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

View File

@ -65,11 +65,12 @@ ucl_inline int closest_image(int i, int j, const __global int* sametag,
}
ucl_inline void compute_newsite(int iO, int iH1, int iH2,
__global numtyp4 *xM,
__global numtyp4 *xM, numtyp q,
numtyp alpha, const __global numtyp4 *restrict x_){
numtyp4 xO; fetch4(xO,iO,pos_tex);
numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
numtyp4 xH2; fetch4(xH2,iH2,pos_tex);
numtyp4 M;
numtyp delx1 = xH1.x - xO.x;
numtyp dely1 = xH1.y - xO.y;
@ -81,9 +82,12 @@ ucl_inline void compute_newsite(int iO, int iH1, int iH2,
numtyp ap = alpha * (numtyp)0.5;
(*xM).x = xO.x + ap * (delx1 + delx2);
(*xM).y = xO.y + ap * (dely1 + dely2);
(*xM).z = xO.z + ap * (delz1 + delz2);
M.x = xO.x + ap * (delx1 + delx2);
M.y = xO.y + ap * (dely1 + dely2);
M.z = xO.z + ap * (delz1 + delz2);
M.w = q;
*xM = M;
}
__kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
@ -214,6 +218,37 @@ __kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
}
__kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
const __global int * dev_nbor,
const __global int * dev_packed,
const int nall, const int inum,
const int nbor_pitch, const int t_per_atom,
__global int *restrict hneigh,
__global numtyp4 *restrict m,
const int typeO, const int typeH,
const numtyp alpha, const __global numtyp *restrict q_,
const __global int *restrict tag, const __global int *restrict map,
const __global int *restrict sametag) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);
int i = BLOCK_ID_X*(BLOCK_SIZE_X)+tid;
if (i<nall) {
int iO, iH1, iH2;
iO = i;
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
int itype = ix.w;
if (itype == typeO){
int iH1, iH2, iO;
iH1 = hneigh[i*4 ];
iH2 = hneigh[i*4+1];
iO = i;
numtyp qO; fetch(qO,iO,q_tex);
compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
}
}
}
__kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
const __global numtyp4 *restrict lj1,
const __global numtyp4 *restrict lj3,
@ -269,11 +304,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
iO = i;
iH1 = hneigh[i*4 ];
iH2 = hneigh[i*4+1];
if(fabs(m[iO].w) <= eq_zero) {
compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
__threadfence();
m[iO].w = qtmp;
}
x1 = m[iO];
} else {
iO = hneigh[i *4 ];
@ -281,12 +311,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
iH2 = hneigh[iO*4+1];
if (iO >= inum) {
non_local_oxy = 1;
if(fabs(m[iO].w) <= eq_zero) {
compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
numtyp qO; fetch(qO,iO,q_tex);
__threadfence();
m[iO].w = qO;
}
}
}
@ -341,11 +365,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
jO = j;
jH1 = hneigh[j*4 ];
jH2 = hneigh[j*4+1];
if (fabs(m[j].w) <= eq_zero) {
compute_newsite(j, jH1, jH2, &m[j], alpha, x_);
__threadfence();
m[j].w = qj;
}
x2 = m[j];
}
delx = x1.x-x2.x;

View File

@ -116,7 +116,7 @@ public:
UCL_D_Vec<int> map_array;
UCL_D_Vec<int> atom_sametag;
UCL_Kernel k_pair_distrib, k_pair_reneigh;
UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite;
private:
bool _allocated;