Removed AoSoA from ulisttot_[re_/im_/]_pack

This commit is contained in:
Evan Weinberg
2024-11-19 10:44:30 -08:00
parent e95d1dec3f
commit 120c4600cf
3 changed files with 44 additions and 47 deletions

View File

@ -746,26 +746,25 @@ template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
const int ninside = d_ninside(ii);
const int ninside = d_ninside(iatom);
if (jnbor >= ninside) return;
snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
snaKK.compute_cayley_klein(iatom,jnbor);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
int itype = type(ii);
int itype = type(iatom);
int ielem = d_map[itype];
snaKK.pre_ui(iatom_mod, j, ielem, iatom_div);
snaKK.pre_ui(iatom, j, ielem);
}
template<class DeviceType, typename real_type, int vector_length>
@ -834,8 +833,8 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
const FullHalfMapper mapper = snaKK.idxu_full_half[idxu];
auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
auto utot_re = snaKK.ulisttot_re_gpu(iatom, mapper.idxu_half, ielem);
auto utot_im = snaKK.ulisttot_im_gpu(iatom, mapper.idxu_half, ielem);
if (mapper.flip_sign == 1) {
utot_im = -utot_im;
@ -843,7 +842,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
utot_re = -utot_re;
}
snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
snaKK.ulisttot_gpu(iatom, idxu, ielem) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;

View File

@ -187,9 +187,9 @@ class SNAKokkos {
// functions for bispectrum coefficients, GPU only
KOKKOS_INLINE_FUNCTION
void compute_cayley_klein(const int&, const int&, const int&) const;
void compute_cayley_klein(const int&, const int&) const;
KOKKOS_INLINE_FUNCTION
void pre_ui(const int&, const int&, const int&, const int&) const; // ForceSNAP
void pre_ui(const int&, const int&, const int&) const; // ForceSNAP
// version of the code with parallelism over j_bend
KOKKOS_INLINE_FUNCTION
@ -223,7 +223,7 @@ class SNAKokkos {
// plugged into compute_ui_small, compute_ui_large
KOKKOS_FORCEINLINE_FUNCTION
void evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, const int&,
const int&, const int&, const int&) const;
const int&, const int&) const;
// plugged into compute_zi, compute_yi
KOKKOS_FORCEINLINE_FUNCTION
complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
@ -308,9 +308,9 @@ class SNAKokkos {
t_sna_3c db_gpu; // `db`
t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z}
t_sna_4d_ll ulisttot_re_pack; // split real,
t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened
t_sna_4c_ll ulisttot_pack; // AoSoA layout
t_sna_3d_ll ulisttot_re_gpu; // split real,
t_sna_3d_ll ulisttot_im_gpu; // imag, flattened
t_sna_3c_ll ulisttot_gpu; // packed and de-symmetrized
t_sna_4c_ll zlist_pack; // AoSoA layout
t_sna_4d_ll blist_pack;
t_sna_4d_ll ylist_pack_re; // split real,

View File

@ -322,9 +322,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4);
MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation
MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1);
MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div);
MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div);
MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div);
MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",natom_pad,idxu_half_max,nelements);
MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",natom_pad,idxu_half_max,nelements);
MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_gpu",natom_pad,idxu_max,nelements);
MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1);
MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1);
MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div);
@ -342,9 +342,9 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1);
MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom_pad);
MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom_pad);
MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1);
MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1);
MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1);
MemKK::realloc_kokkos(ulisttot_re_gpu,"sna:ulisttot_re_gpu",1,1,1);
MemKK::realloc_kokkos(ulisttot_im_gpu,"sna:ulisttot_im_gpu",1,1,1);
MemKK::realloc_kokkos(ulisttot_gpu,"sna:ulisttot_pack_gpu",1,1,1);
MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom_pad,nmax);
MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom_pad);
MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1);
@ -371,9 +371,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) const
void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const int& iatom, const int& jnbor) const
{
const int iatom = iatom_mod + vector_length * iatom_div;
const real_type x = rij(iatom,jnbor,0);
const real_type y = rij(iatom,jnbor,1);
const real_type z = rij(iatom,jnbor,2);
@ -460,9 +459,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) const
void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui(const int& iatom, const int& j, const int& ielem) const
{
for (int jelem = 0; jelem < nelements; jelem++) {
int jju_half = idxu_half_block(j);
@ -475,8 +473,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui(const int& iatom_mo
real_type re_part = static_cast<real_type>(0.);
if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; }
ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part;
ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast<real_type>(0.);
ulisttot_re_gpu(iatom, jju_half, jelem) = re_part;
ulisttot_im_gpu(iatom, jju_half, jelem) = static_cast<real_type>(0.);
jju_half++;
}
@ -518,7 +516,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_small(const typ
// this for loop is here for context --- we expose additional
// parallelism over this loop instead
//for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div);
evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend);
}
// Version of the code that loops over all `j_bend` values which reduces integer arithmetic
@ -550,7 +548,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_large(const typ
#pragma unroll
#endif
for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div);
evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend);
}
}
@ -559,9 +557,8 @@ template<class DeviceType, typename real_type, int vector_length>
KOKKOS_FORCEINLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>& ulist_wrapper,
const complex& a, const complex& b, const real_type& sfac, const int& jelem,
const int& iatom_mod, const int& j_bend, const int& iatom_div) const
const int& iatom, const int& j_bend) const
{
// utot(j,ma,mb) = 0 for all j,ma,mb
// utot(j,ma,ma) = 1 for all j,ma
// for j in neighbors of i:
@ -621,8 +618,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
const complex ulist_prev = ulist_wrapper.get(ma);
// atomic add the previous level here
Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac);
Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac);
// ulist_accum += rootpq * b * ulist_prev;
real_type rootpq = rootpqarray(j - ma, mb);
@ -651,8 +648,8 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
const complex ulist_prev = ulist_wrapper.get(ma);
// atomic add the previous level here
Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
Kokkos::atomic_add(&(ulisttot_re_gpu(iatom, jjup + ma, jelem)), ulist_prev.re * sfac);
Kokkos::atomic_add(&(ulisttot_im_gpu(iatom, jjup + ma, jelem)), ulist_prev.im * sfac);
}
}
@ -695,6 +692,7 @@ template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) const
{
const int iatom = iatom_mod + vector_length * iatom_div;
// for j1 = 0,...,twojmax
// for j2 = 0,twojmax
// for j = |j1-j2|,Min(twojmax,j1+j2),2
@ -725,7 +723,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
const int jju_index = jju+mb*(j+1)+ma;
const int jjz_index = jjz+mb*(j+1)+ma;
if (2*mb == j) return; // I think we can remove this?
const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
const complex utot = ulisttot_gpu(iatom, jju_index, elem3);
const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
}
@ -741,7 +739,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
const complex utot = ulisttot_gpu(iatom, jju_index, elem3);
const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
sumzu_temp += utot.re * zloc.re + utot.im * zloc.im;
@ -752,7 +750,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iato
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
const complex utot = ulisttot_gpu(iatom, jju_index, elem3);
const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
sumzu += static_cast<real_type>(0.5) * (utot.re * zloc.re + utot.im * zloc.im);
} // end if jeven
@ -861,7 +859,7 @@ KOKKOS_FORCEINLINE_FUNCTION
typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<DeviceType, real_type, vector_length>::evaluate_zi(const int& j1, const int& j2, const int& j,
const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb,
const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) const {
const int iatom = iatom_mod + vector_length * iatom_div;
complex ztmp = complex::zero();
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
@ -881,8 +879,8 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
#pragma unroll
#endif
for (int ia = 0; ia < na; ia++) {
const complex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div);
const complex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
const complex utot1 = ulisttot_gpu(iatom, jju1+ma1, elem1);
const complex utot2 = ulisttot_gpu(iatom, jju2+ma2, elem2);
const real_type cgcoeff_a = cgblock[icga];
const real_type cgcoeff_b = cgblock[icgb];
ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
@ -2347,9 +2345,9 @@ double SNAKokkos<DeviceType, real_type, vector_length>::memory_usage()
bytes += MemKK::memory_usage(sfac_gpu);
bytes += MemKK::memory_usage(ulisttot_re_pack);
bytes += MemKK::memory_usage(ulisttot_im_pack);
bytes += MemKK::memory_usage(ulisttot_pack);
bytes += MemKK::memory_usage(ulisttot_re_gpu);
bytes += MemKK::memory_usage(ulisttot_im_gpu);
bytes += MemKK::memory_usage(ulisttot_gpu);
bytes += MemKK::memory_usage(zlist_pack);
bytes += MemKK::memory_usage(blist_pack);