Optimization to reduce atomics on CPU
This commit is contained in:
@ -1596,7 +1596,8 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
|
||||
|
||||
const int three = 3;
|
||||
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[three], dBOp_i[three], dln_BOp_pi_i[three], dln_BOp_pi2_i[three];
|
||||
F_FLOAT total_bo = 0.0;
|
||||
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
|
||||
F_FLOAT total_bo_i = 0.0;
|
||||
|
||||
int j_index,i_index;
|
||||
d_bo_first[i] = i*maxbo;
|
||||
@ -1771,7 +1772,7 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) a_dDeltap_self(i,d) += dBOp_i[d];
|
||||
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
|
||||
for (int d = 0; d < 3; d++) a_dDeltap_self(j,d) += -dBOp_i[d];
|
||||
|
||||
d_dln_BOp_pix(i,jj_index) = dln_BOp_pi_i[0];
|
||||
@ -1802,13 +1803,16 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
|
||||
d_BO(j,ii_index) -= bo_cut;
|
||||
d_BO_s(i,jj_index) -= bo_cut;
|
||||
d_BO_s(j,ii_index) -= bo_cut;
|
||||
total_bo += d_BO(i,jj_index);
|
||||
total_bo_i += d_BO(i,jj_index);
|
||||
a_total_bo[j] += d_BO(j,ii_index);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
a_total_bo[i] += total_bo;
|
||||
for (int d = 0; d < 3; d++)
|
||||
a_dDeltap_self(i,d) += dDeltap_self_i[d];
|
||||
|
||||
a_total_bo[i] += total_bo_i;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
@ -1977,106 +1981,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlockingP
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsFull, const int &ii) const {
|
||||
|
||||
const int i = d_ilist[ii];
|
||||
const X_FLOAT xtmp = x(i,0);
|
||||
const X_FLOAT ytmp = x(i,1);
|
||||
const X_FLOAT ztmp = x(i,2);
|
||||
const int itype = type(i);
|
||||
|
||||
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
|
||||
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
|
||||
F_FLOAT total_bo_i = 0.0;
|
||||
|
||||
const int j_start = d_bo_first[i];
|
||||
const int j_end = j_start + d_bo_num[i];
|
||||
for (int jj = j_start; jj < j_end; jj++) {
|
||||
int j = d_bo_list[jj];
|
||||
j &= NEIGHMASK;
|
||||
const int jtype = type(j);
|
||||
const int j_index = jj - j_start;
|
||||
delij[0] = x(j,0) - xtmp;
|
||||
delij[1] = x(j,1) - ytmp;
|
||||
delij[2] = x(j,2) - ztmp;
|
||||
const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
|
||||
const F_FLOAT rsq_inv = 1.0 / rsq;
|
||||
|
||||
// bond_list
|
||||
const F_FLOAT rij = sqrt(rsq);
|
||||
const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
|
||||
const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
|
||||
const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
|
||||
const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
|
||||
const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
|
||||
const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
|
||||
const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
|
||||
const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
|
||||
const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
|
||||
|
||||
if (paramssing(itype).r_s > 0.0 && paramssing(jtype).r_s > 0.0) {
|
||||
C12 = p_bo1 * ((p_bo2 != 0) ? (pow(rij/r_s,p_bo2)) : 1.0);
|
||||
BO_s = (1.0+bo_cut)*exp(C12);
|
||||
} else BO_s = C12 = 0.0;
|
||||
|
||||
if (paramssing(itype).r_pi > 0.0 && paramssing(jtype).r_pi > 0.0) {
|
||||
C34 = p_bo3 * ((p_bo4 != 0) ? (pow(rij/r_pi,p_bo4)) : 1.0);
|
||||
BO_pi = exp(C34);
|
||||
} else BO_pi = C34 = 0.0;
|
||||
|
||||
if (paramssing(itype).r_pi2 > 0.0 && paramssing(jtype).r_pi2 > 0.0) {
|
||||
C56 = p_bo5 * ((p_bo6 != 0) ? (pow(rij/r_pi2,p_bo6)) : 1.0);
|
||||
BO_pi2 = exp(C56);
|
||||
} else BO_pi2 = C56 = 0.0;
|
||||
|
||||
BO = BO_s + BO_pi + BO_pi2;
|
||||
|
||||
// from BondOrder1
|
||||
|
||||
d_BO(i,j_index) = BO;
|
||||
d_BO_s(i,j_index) = BO_s;
|
||||
d_BO_pi(i,j_index) = BO_pi;
|
||||
d_BO_pi2(i,j_index) = BO_pi2;
|
||||
|
||||
F_FLOAT Cln_BOp_s = p_bo2 * C12 * rsq_inv;
|
||||
F_FLOAT Cln_BOp_pi = p_bo4 * C34 * rsq_inv;
|
||||
F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 * rsq_inv;
|
||||
|
||||
if (nlocal == 0)
|
||||
Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
|
||||
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
|
||||
|
||||
d_dln_BOp_pix(i,j_index) = dln_BOp_pi_i[0];
|
||||
d_dln_BOp_piy(i,j_index) = dln_BOp_pi_i[1];
|
||||
d_dln_BOp_piz(i,j_index) = dln_BOp_pi_i[2];
|
||||
|
||||
d_dln_BOp_pi2x(i,j_index) = dln_BOp_pi2_i[0];
|
||||
d_dln_BOp_pi2y(i,j_index) = dln_BOp_pi2_i[1];
|
||||
d_dln_BOp_pi2z(i,j_index) = dln_BOp_pi2_i[2];
|
||||
|
||||
d_dBOpx(i,j_index) = dBOp_i[0];
|
||||
d_dBOpy(i,j_index) = dBOp_i[1];
|
||||
d_dBOpz(i,j_index) = dBOp_i[2];
|
||||
|
||||
d_BO(i,j_index) -= bo_cut;
|
||||
d_BO_s(i,j_index) -= bo_cut;
|
||||
total_bo_i += d_BO(i,j_index);
|
||||
}
|
||||
|
||||
for (int d = 0; d < 3; d++)
|
||||
d_dDeltap_self(i,d) = dDeltap_self_i[d];
|
||||
|
||||
d_total_bo[i] = total_bo_i;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -2209,6 +2113,106 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfPreview<N
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsFull, const int &ii) const {
|
||||
|
||||
const int i = d_ilist[ii];
|
||||
const X_FLOAT xtmp = x(i,0);
|
||||
const X_FLOAT ytmp = x(i,1);
|
||||
const X_FLOAT ztmp = x(i,2);
|
||||
const int itype = type(i);
|
||||
|
||||
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
|
||||
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
|
||||
F_FLOAT total_bo_i = 0.0;
|
||||
|
||||
const int j_start = d_bo_first[i];
|
||||
const int j_end = j_start + d_bo_num[i];
|
||||
for (int jj = j_start; jj < j_end; jj++) {
|
||||
int j = d_bo_list[jj];
|
||||
j &= NEIGHMASK;
|
||||
const int jtype = type(j);
|
||||
const int j_index = jj - j_start;
|
||||
delij[0] = x(j,0) - xtmp;
|
||||
delij[1] = x(j,1) - ytmp;
|
||||
delij[2] = x(j,2) - ztmp;
|
||||
const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
|
||||
const F_FLOAT rsq_inv = 1.0 / rsq;
|
||||
|
||||
// bond_list
|
||||
const F_FLOAT rij = sqrt(rsq);
|
||||
const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
|
||||
const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
|
||||
const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
|
||||
const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
|
||||
const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
|
||||
const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
|
||||
const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
|
||||
const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
|
||||
const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
|
||||
|
||||
if (paramssing(itype).r_s > 0.0 && paramssing(jtype).r_s > 0.0) {
|
||||
C12 = p_bo1 * ((p_bo2 != 0) ? (pow(rij/r_s,p_bo2)) : 1.0);
|
||||
BO_s = (1.0+bo_cut)*exp(C12);
|
||||
} else BO_s = C12 = 0.0;
|
||||
|
||||
if (paramssing(itype).r_pi > 0.0 && paramssing(jtype).r_pi > 0.0) {
|
||||
C34 = p_bo3 * ((p_bo4 != 0) ? (pow(rij/r_pi,p_bo4)) : 1.0);
|
||||
BO_pi = exp(C34);
|
||||
} else BO_pi = C34 = 0.0;
|
||||
|
||||
if (paramssing(itype).r_pi2 > 0.0 && paramssing(jtype).r_pi2 > 0.0) {
|
||||
C56 = p_bo5 * ((p_bo6 != 0) ? (pow(rij/r_pi2,p_bo6)) : 1.0);
|
||||
BO_pi2 = exp(C56);
|
||||
} else BO_pi2 = C56 = 0.0;
|
||||
|
||||
BO = BO_s + BO_pi + BO_pi2;
|
||||
|
||||
// from BondOrder1
|
||||
|
||||
d_BO(i,j_index) = BO;
|
||||
d_BO_s(i,j_index) = BO_s;
|
||||
d_BO_pi(i,j_index) = BO_pi;
|
||||
d_BO_pi2(i,j_index) = BO_pi2;
|
||||
|
||||
F_FLOAT Cln_BOp_s = p_bo2 * C12 * rsq_inv;
|
||||
F_FLOAT Cln_BOp_pi = p_bo4 * C34 * rsq_inv;
|
||||
F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 * rsq_inv;
|
||||
|
||||
if (nlocal == 0)
|
||||
Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
|
||||
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
|
||||
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
|
||||
|
||||
d_dln_BOp_pix(i,j_index) = dln_BOp_pi_i[0];
|
||||
d_dln_BOp_piy(i,j_index) = dln_BOp_pi_i[1];
|
||||
d_dln_BOp_piz(i,j_index) = dln_BOp_pi_i[2];
|
||||
|
||||
d_dln_BOp_pi2x(i,j_index) = dln_BOp_pi2_i[0];
|
||||
d_dln_BOp_pi2y(i,j_index) = dln_BOp_pi2_i[1];
|
||||
d_dln_BOp_pi2z(i,j_index) = dln_BOp_pi2_i[2];
|
||||
|
||||
d_dBOpx(i,j_index) = dBOp_i[0];
|
||||
d_dBOpy(i,j_index) = dBOp_i[1];
|
||||
d_dBOpz(i,j_index) = dBOp_i[2];
|
||||
|
||||
d_BO(i,j_index) -= bo_cut;
|
||||
d_BO_s(i,j_index) -= bo_cut;
|
||||
total_bo_i += d_BO(i,j_index);
|
||||
}
|
||||
|
||||
for (int d = 0; d < 3; d++)
|
||||
d_dDeltap_self(i,d) = dDeltap_self_i[d];
|
||||
|
||||
d_total_bo[i] = total_bo_i;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBondOrder1, const int &ii) const {
|
||||
|
||||
@ -57,15 +57,15 @@ struct TagPairReaxComputeLJCoulombShortNeigh{};
|
||||
// Empty tag types. The tags whose uses are visible in this view appear as
// the first parameter of PairReaxFFKokkos::operator() overloads, so the tag
// type alone picks the overload. Each tag is defined exactly once;
// TagPairReaxBuildListsHalfBlocking sits ahead of its Preview variant, in
// the same order as the matching operator() declarations.
template<int NEIGHFLAG, int EVFLAG>
struct TagPairReaxComputeTabulatedLJCoulomb{};

template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfBlocking{};

template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfBlockingPreview{};

template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfPreview{};

struct TagPairReaxBuildListsFull{};

struct TagPairReaxZero{};
|
||||
@ -165,6 +165,10 @@ class PairReaxFFKokkos : public PairReaxFF {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxComputeLJCoulombShortNeigh, const int&) const;
|
||||
|
||||
// Call-operator overload selected by the TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>
// tag. Takes the tag by value and a const int reference; const member.
// NOTE(review): presumably the int is the work-item index handed in by the
// parallel dispatch -- confirm against the launch site.
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>, const int&) const;
|
||||
|
||||
// Call-operator overload selected by the
// TagPairReaxBuildListsHalfBlockingPreview<NEIGHFLAG> tag. Takes the tag by
// value and a const int reference; const member.
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxBuildListsHalfBlockingPreview<NEIGHFLAG>, const int&) const;
|
||||
@ -173,10 +177,6 @@ class PairReaxFFKokkos : public PairReaxFF {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxBuildListsHalfPreview<NEIGHFLAG>, const int&) const;
|
||||
|
||||
// Call-operator overload selected by the TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>
// tag. Takes the tag by value and a const int reference; const member.
// NOTE(review): an identical declaration also appears in the preceding hunk
// of this view (ahead of the HalfBlockingPreview overload); looks like the
// old position of a moved declaration -- confirm only one remains in the header.
template<int NEIGHFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>, const int&) const;
|
||||
|
||||
// Call-operator overload selected by the non-template TagPairReaxBuildListsFull
// tag. Takes the tag by value and a const int reference; const member.
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagPairReaxBuildListsFull, const int&) const;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user