Optimization to reduce atomics on CPU

This commit is contained in:
Stan Gerald Moore
2022-03-03 07:46:35 -07:00
parent beee3126e0
commit b6b7846c50
2 changed files with 115 additions and 111 deletions

View File

@ -1596,7 +1596,8 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
const int three = 3;
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[three], dBOp_i[three], dln_BOp_pi_i[three], dln_BOp_pi2_i[three];
F_FLOAT total_bo = 0.0;
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
F_FLOAT total_bo_i = 0.0;
int j_index,i_index;
d_bo_first[i] = i*maxbo;
@ -1771,7 +1772,7 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) a_dDeltap_self(i,d) += dBOp_i[d];
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
for (int d = 0; d < 3; d++) a_dDeltap_self(j,d) += -dBOp_i[d];
d_dln_BOp_pix(i,jj_index) = dln_BOp_pi_i[0];
@ -1802,13 +1803,16 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlocking<
d_BO(j,ii_index) -= bo_cut;
d_BO_s(i,jj_index) -= bo_cut;
d_BO_s(j,ii_index) -= bo_cut;
total_bo += d_BO(i,jj_index);
total_bo_i += d_BO(i,jj_index);
a_total_bo[j] += d_BO(j,ii_index);
}
}
}
a_total_bo[i] += total_bo;
for (int d = 0; d < 3; d++)
a_dDeltap_self(i,d) += dDeltap_self_i[d];
a_total_bo[i] += total_bo_i;
}
/* ---------------------------------------------------------------------- */
@ -1977,106 +1981,6 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfBlockingP
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsFull, const int &ii) const {
const int i = d_ilist[ii];
const X_FLOAT xtmp = x(i,0);
const X_FLOAT ytmp = x(i,1);
const X_FLOAT ztmp = x(i,2);
const int itype = type(i);
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
F_FLOAT total_bo_i = 0.0;
const int j_start = d_bo_first[i];
const int j_end = j_start + d_bo_num[i];
for (int jj = j_start; jj < j_end; jj++) {
int j = d_bo_list[jj];
j &= NEIGHMASK;
const int jtype = type(j);
const int j_index = jj - j_start;
delij[0] = x(j,0) - xtmp;
delij[1] = x(j,1) - ytmp;
delij[2] = x(j,2) - ztmp;
const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
const F_FLOAT rsq_inv = 1.0 / rsq;
// bond_list
const F_FLOAT rij = sqrt(rsq);
const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
if (paramssing(itype).r_s > 0.0 && paramssing(jtype).r_s > 0.0) {
C12 = p_bo1 * ((p_bo2 != 0) ? (pow(rij/r_s,p_bo2)) : 1.0);
BO_s = (1.0+bo_cut)*exp(C12);
} else BO_s = C12 = 0.0;
if (paramssing(itype).r_pi > 0.0 && paramssing(jtype).r_pi > 0.0) {
C34 = p_bo3 * ((p_bo4 != 0) ? (pow(rij/r_pi,p_bo4)) : 1.0);
BO_pi = exp(C34);
} else BO_pi = C34 = 0.0;
if (paramssing(itype).r_pi2 > 0.0 && paramssing(jtype).r_pi2 > 0.0) {
C56 = p_bo5 * ((p_bo6 != 0) ? (pow(rij/r_pi2,p_bo6)) : 1.0);
BO_pi2 = exp(C56);
} else BO_pi2 = C56 = 0.0;
BO = BO_s + BO_pi + BO_pi2;
// from BondOrder1
d_BO(i,j_index) = BO;
d_BO_s(i,j_index) = BO_s;
d_BO_pi(i,j_index) = BO_pi;
d_BO_pi2(i,j_index) = BO_pi2;
F_FLOAT Cln_BOp_s = p_bo2 * C12 * rsq_inv;
F_FLOAT Cln_BOp_pi = p_bo4 * C34 * rsq_inv;
F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 * rsq_inv;
if (nlocal == 0)
Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
d_dln_BOp_pix(i,j_index) = dln_BOp_pi_i[0];
d_dln_BOp_piy(i,j_index) = dln_BOp_pi_i[1];
d_dln_BOp_piz(i,j_index) = dln_BOp_pi_i[2];
d_dln_BOp_pi2x(i,j_index) = dln_BOp_pi2_i[0];
d_dln_BOp_pi2y(i,j_index) = dln_BOp_pi2_i[1];
d_dln_BOp_pi2z(i,j_index) = dln_BOp_pi2_i[2];
d_dBOpx(i,j_index) = dBOp_i[0];
d_dBOpy(i,j_index) = dBOp_i[1];
d_dBOpz(i,j_index) = dBOp_i[2];
d_BO(i,j_index) -= bo_cut;
d_BO_s(i,j_index) -= bo_cut;
total_bo_i += d_BO(i,j_index);
}
for (int d = 0; d < 3; d++)
d_dDeltap_self(i,d) = dDeltap_self_i[d];
d_total_bo[i] = total_bo_i;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
@ -2209,6 +2113,106 @@ void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsHalfPreview<N
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBuildListsFull, const int &ii) const {
const int i = d_ilist[ii];
const X_FLOAT xtmp = x(i,0);
const X_FLOAT ytmp = x(i,1);
const X_FLOAT ztmp = x(i,2);
const int itype = type(i);
F_FLOAT C12, C34, C56, BO_s, BO_pi, BO_pi2, BO, delij[3], dBOp_i[3], dln_BOp_pi_i[3], dln_BOp_pi2_i[3];
F_FLOAT dDeltap_self_i[3] = {0.0,0.0,0.0};
F_FLOAT total_bo_i = 0.0;
const int j_start = d_bo_first[i];
const int j_end = j_start + d_bo_num[i];
for (int jj = j_start; jj < j_end; jj++) {
int j = d_bo_list[jj];
j &= NEIGHMASK;
const int jtype = type(j);
const int j_index = jj - j_start;
delij[0] = x(j,0) - xtmp;
delij[1] = x(j,1) - ytmp;
delij[2] = x(j,2) - ztmp;
const F_FLOAT rsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
const F_FLOAT rsq_inv = 1.0 / rsq;
// bond_list
const F_FLOAT rij = sqrt(rsq);
const F_FLOAT p_bo1 = paramstwbp(itype,jtype).p_bo1;
const F_FLOAT p_bo2 = paramstwbp(itype,jtype).p_bo2;
const F_FLOAT p_bo3 = paramstwbp(itype,jtype).p_bo3;
const F_FLOAT p_bo4 = paramstwbp(itype,jtype).p_bo4;
const F_FLOAT p_bo5 = paramstwbp(itype,jtype).p_bo5;
const F_FLOAT p_bo6 = paramstwbp(itype,jtype).p_bo6;
const F_FLOAT r_s = paramstwbp(itype,jtype).r_s;
const F_FLOAT r_pi = paramstwbp(itype,jtype).r_pi;
const F_FLOAT r_pi2 = paramstwbp(itype,jtype).r_pi2;
if (paramssing(itype).r_s > 0.0 && paramssing(jtype).r_s > 0.0) {
C12 = p_bo1 * ((p_bo2 != 0) ? (pow(rij/r_s,p_bo2)) : 1.0);
BO_s = (1.0+bo_cut)*exp(C12);
} else BO_s = C12 = 0.0;
if (paramssing(itype).r_pi > 0.0 && paramssing(jtype).r_pi > 0.0) {
C34 = p_bo3 * ((p_bo4 != 0) ? (pow(rij/r_pi,p_bo4)) : 1.0);
BO_pi = exp(C34);
} else BO_pi = C34 = 0.0;
if (paramssing(itype).r_pi2 > 0.0 && paramssing(jtype).r_pi2 > 0.0) {
C56 = p_bo5 * ((p_bo6 != 0) ? (pow(rij/r_pi2,p_bo6)) : 1.0);
BO_pi2 = exp(C56);
} else BO_pi2 = C56 = 0.0;
BO = BO_s + BO_pi + BO_pi2;
// from BondOrder1
d_BO(i,j_index) = BO;
d_BO_s(i,j_index) = BO_s;
d_BO_pi(i,j_index) = BO_pi;
d_BO_pi2(i,j_index) = BO_pi2;
F_FLOAT Cln_BOp_s = p_bo2 * C12 * rsq_inv;
F_FLOAT Cln_BOp_pi = p_bo4 * C34 * rsq_inv;
F_FLOAT Cln_BOp_pi2 = p_bo6 * C56 * rsq_inv;
if (nlocal == 0)
Cln_BOp_s = Cln_BOp_pi = Cln_BOp_pi2 = 0.0;
for (int d = 0; d < 3; d++) dln_BOp_pi_i[d] = -(BO_pi*Cln_BOp_pi)*delij[d];
for (int d = 0; d < 3; d++) dln_BOp_pi2_i[d] = -(BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) dBOp_i[d] = -(BO_s*Cln_BOp_s+BO_pi*Cln_BOp_pi+BO_pi2*Cln_BOp_pi2)*delij[d];
for (int d = 0; d < 3; d++) dDeltap_self_i[d] += dBOp_i[d];
d_dln_BOp_pix(i,j_index) = dln_BOp_pi_i[0];
d_dln_BOp_piy(i,j_index) = dln_BOp_pi_i[1];
d_dln_BOp_piz(i,j_index) = dln_BOp_pi_i[2];
d_dln_BOp_pi2x(i,j_index) = dln_BOp_pi2_i[0];
d_dln_BOp_pi2y(i,j_index) = dln_BOp_pi2_i[1];
d_dln_BOp_pi2z(i,j_index) = dln_BOp_pi2_i[2];
d_dBOpx(i,j_index) = dBOp_i[0];
d_dBOpy(i,j_index) = dBOp_i[1];
d_dBOpz(i,j_index) = dBOp_i[2];
d_BO(i,j_index) -= bo_cut;
d_BO_s(i,j_index) -= bo_cut;
total_bo_i += d_BO(i,j_index);
}
for (int d = 0; d < 3; d++)
d_dDeltap_self(i,d) = dDeltap_self_i[d];
d_total_bo[i] = total_bo_i;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairReaxFFKokkos<DeviceType>::operator()(TagPairReaxBondOrder1, const int &ii) const {

View File

@ -57,15 +57,15 @@ struct TagPairReaxComputeLJCoulombShortNeigh{};
template<int NEIGHFLAG, int EVFLAG>
struct TagPairReaxComputeTabulatedLJCoulomb{};
template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfBlocking{};
template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfBlockingPreview{};
template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfPreview{};
template<int NEIGHFLAG>
struct TagPairReaxBuildListsHalfBlocking{};
struct TagPairReaxBuildListsFull{};
struct TagPairReaxZero{};
@ -165,6 +165,10 @@ class PairReaxFFKokkos : public PairReaxFF {
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxComputeLJCoulombShortNeigh, const int&) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>, const int&) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxBuildListsHalfBlockingPreview<NEIGHFLAG>, const int&) const;
@ -173,10 +177,6 @@ class PairReaxFFKokkos : public PairReaxFF {
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxBuildListsHalfPreview<NEIGHFLAG>, const int&) const;
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxBuildListsHalfBlocking<NEIGHFLAG>, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator()(TagPairReaxBuildListsFull, const int&) const;