Separate the computation of newsite into another kernel
This eliminates the need for thread fence and makes the calculation stable on GTX1070 (CUDA and OpenCL) and TitanV
This commit is contained in:
@ -66,6 +66,7 @@ int LJTIP4PLongT::init(const int ntypes,
|
||||
return success;
|
||||
k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib");
|
||||
k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh");
|
||||
k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite");
|
||||
|
||||
TypeH = tH;
|
||||
TypeO = tO;
|
||||
@ -163,6 +164,7 @@ void LJTIP4PLongT::clear() {
|
||||
|
||||
k_pair_distrib.clear();
|
||||
k_pair_reneigh.clear();
|
||||
k_pair_newsite.clear();
|
||||
|
||||
this->clear_atomic();
|
||||
}
|
||||
@ -195,9 +197,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
int GX;
|
||||
|
||||
GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
|
||||
if (t_ago == 0) {
|
||||
GX=static_cast<int>(ceil(static_cast<double>(nall)/BX));
|
||||
this->k_pair_reneigh.set_size(GX,BX);
|
||||
this->k_pair_reneigh.run(&this->atom->x,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
@ -205,6 +206,14 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
&hneight, &m, &TypeO, &TypeH,
|
||||
&tag, &map_array, &atom_sametag);
|
||||
}
|
||||
this->k_pair_newsite.set_size(GX,BX);
|
||||
this->k_pair_newsite.run(&this->atom->x,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&nall, &ainum,
|
||||
&nbor_pitch, &this->_threads_per_atom,
|
||||
&hneight, &m, &TypeO, &TypeH, &alpha,
|
||||
&this->atom->q, &tag, &map_array,
|
||||
&atom_sametag);
|
||||
|
||||
GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
@ -65,11 +65,12 @@ ucl_inline int closest_image(int i, int j, const __global int* sametag,
|
||||
}
|
||||
|
||||
ucl_inline void compute_newsite(int iO, int iH1, int iH2,
|
||||
__global numtyp4 *xM,
|
||||
__global numtyp4 *xM, numtyp q,
|
||||
numtyp alpha, const __global numtyp4 *restrict x_){
|
||||
numtyp4 xO; fetch4(xO,iO,pos_tex);
|
||||
numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
|
||||
numtyp4 xH2; fetch4(xH2,iH2,pos_tex);
|
||||
numtyp4 M;
|
||||
|
||||
numtyp delx1 = xH1.x - xO.x;
|
||||
numtyp dely1 = xH1.y - xO.y;
|
||||
@ -81,9 +82,12 @@ ucl_inline void compute_newsite(int iO, int iH1, int iH2,
|
||||
|
||||
numtyp ap = alpha * (numtyp)0.5;
|
||||
|
||||
(*xM).x = xO.x + ap * (delx1 + delx2);
|
||||
(*xM).y = xO.y + ap * (dely1 + dely2);
|
||||
(*xM).z = xO.z + ap * (delz1 + delz2);
|
||||
M.x = xO.x + ap * (delx1 + delx2);
|
||||
M.y = xO.y + ap * (dely1 + dely2);
|
||||
M.z = xO.z + ap * (delz1 + delz2);
|
||||
M.w = q;
|
||||
|
||||
*xM = M;
|
||||
}
|
||||
|
||||
__kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
|
||||
@ -214,6 +218,37 @@ __kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
|
||||
__kernel void k_lj_tip4p_newsite(const __global numtyp4 *restrict x_,
|
||||
const __global int * dev_nbor,
|
||||
const __global int * dev_packed,
|
||||
const int nall, const int inum,
|
||||
const int nbor_pitch, const int t_per_atom,
|
||||
__global int *restrict hneigh,
|
||||
__global numtyp4 *restrict m,
|
||||
const int typeO, const int typeH,
|
||||
const numtyp alpha, const __global numtyp *restrict q_,
|
||||
const __global int *restrict tag, const __global int *restrict map,
|
||||
const __global int *restrict sametag) {
|
||||
int tid, ii, offset;
|
||||
atom_info(t_per_atom,ii,tid,offset);
|
||||
int i = BLOCK_ID_X*(BLOCK_SIZE_X)+tid;
|
||||
|
||||
if (i<nall) {
|
||||
int iO, iH1, iH2;
|
||||
iO = i;
|
||||
numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
|
||||
int itype = ix.w;
|
||||
if (itype == typeO){
|
||||
int iH1, iH2, iO;
|
||||
iH1 = hneigh[i*4 ];
|
||||
iH2 = hneigh[i*4+1];
|
||||
iO = i;
|
||||
numtyp qO; fetch(qO,iO,q_tex);
|
||||
compute_newsite(iO,iH1,iH2, &m[iO], qO, alpha, x_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
const __global numtyp4 *restrict lj1,
|
||||
const __global numtyp4 *restrict lj3,
|
||||
@ -269,11 +304,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
iO = i;
|
||||
iH1 = hneigh[i*4 ];
|
||||
iH2 = hneigh[i*4+1];
|
||||
if(fabs(m[iO].w) <= eq_zero) {
|
||||
compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
|
||||
__threadfence();
|
||||
m[iO].w = qtmp;
|
||||
}
|
||||
x1 = m[iO];
|
||||
} else {
|
||||
iO = hneigh[i *4 ];
|
||||
@ -281,12 +311,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
iH2 = hneigh[iO*4+1];
|
||||
if (iO >= inum) {
|
||||
non_local_oxy = 1;
|
||||
if(fabs(m[iO].w) <= eq_zero) {
|
||||
compute_newsite(iO,iH1,iH2, &m[iO], alpha, x_);
|
||||
numtyp qO; fetch(qO,iO,q_tex);
|
||||
__threadfence();
|
||||
m[iO].w = qO;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -341,11 +365,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
|
||||
jO = j;
|
||||
jH1 = hneigh[j*4 ];
|
||||
jH2 = hneigh[j*4+1];
|
||||
if (fabs(m[j].w) <= eq_zero) {
|
||||
compute_newsite(j, jH1, jH2, &m[j], alpha, x_);
|
||||
__threadfence();
|
||||
m[j].w = qj;
|
||||
}
|
||||
x2 = m[j];
|
||||
}
|
||||
delx = x1.x-x2.x;
|
||||
|
||||
@ -116,7 +116,7 @@ public:
|
||||
UCL_D_Vec<int> map_array;
|
||||
UCL_D_Vec<int> atom_sametag;
|
||||
|
||||
UCL_Kernel k_pair_distrib, k_pair_reneigh;
|
||||
UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
|
||||
Reference in New Issue
Block a user