From 1216188d4828cf2f09ba64441c000fd3c90fbabf Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Fri, 9 Sep 2022 09:34:22 -0700 Subject: [PATCH] Implementation of an int4-packed Clebsch-Gordan look-up table for SNAP --- src/KOKKOS/pair_snap_kokkos.h | 4 +- src/KOKKOS/sna_kokkos.h | 64 ++++++++++++++++++++++++++++++- src/KOKKOS/sna_kokkos_impl.h | 72 ++++++++++------------------------- 3 files changed, 85 insertions(+), 55 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index b8160c318b..86b9f0d492 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -103,10 +103,10 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_pre_ui = 4; static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; static constexpr int tile_size_transform_ui = 4; - static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_zi = 4; static constexpr int tile_size_compute_bi = 4; static constexpr int tile_size_transform_bi = 4; - static constexpr int tile_size_compute_yi = 8; + static constexpr int tile_size_compute_yi = 4; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; #endif diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index fe70129660..62567c71a7 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -61,6 +61,68 @@ struct alignas(8) FullHalfMapper { int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj }; +// Packed types for Zi, Yi lookup tables +// This is abstracted into a stand-alone struct so different implementations +// could be used for different architectures via various `ifdef` guards. 
+struct alignas(16) idxz_struct { + reax_int4 j1_j2_j_jjuhalf; + reax_int4 mabminmax; + reax_int4 nanb; + + idxz_struct() = default; + + KOKKOS_INLINE_FUNCTION + idxz_struct(int j1, int j2, int j, int ma1min, int ma2max, int mb1min, int mb2max, int na, int nb, int jju_half) + : j1_j2_j_jjuhalf{j1, j2, j, jju_half}, + mabminmax{ma1min, ma2max, mb1min, mb2max}, + nanb{na, nb, 0, 0} + { } + + KOKKOS_INLINE_FUNCTION + void get_zi(int &j1, int &j2, int &j, int &ma1min, int &ma2max, int &mb1min, int &mb2max, int &na, int &nb) { + reax_int4 pack1 = this->j1_j2_j_jjuhalf; + j1 = pack1.i0; + j2 = pack1.i1; + j = pack1.i2; + reax_int4 pack2 = this->mabminmax; + ma1min = pack2.i0; + ma2max = pack2.i1; + mb1min = pack2.i2; + mb2max = pack2.i3; + reax_int4 pack3 = this->nanb; + na = pack3.i0; + nb = pack3.i1; + } + + KOKKOS_INLINE_FUNCTION + void get_yi(int &j1, int &j2, int &j, int &ma1min, int &ma2max, int &mb1min, int &mb2max, int &na, int &nb, int& jju_half) { + reax_int4 pack1 = this->j1_j2_j_jjuhalf; + j1 = pack1.i0; + j2 = pack1.i1; + j = pack1.i2; + jju_half = pack1.i3; + reax_int4 pack2 = this->mabminmax; + ma1min = pack2.i0; + ma2max = pack2.i1; + mb1min = pack2.i2; + mb2max = pack2.i3; + reax_int4 pack3 = this->nanb; + na = pack3.i0; + nb = pack3.i1; + } + + KOKKOS_INLINE_FUNCTION + void get_yi_with_zlist(int &j1, int &j2, int &j, int &jju_half) { + reax_int4 pack1 = this->j1_j2_j_jjuhalf; + j1 = pack1.i0; + j2 = pack1.i1; + j = pack1.i2; + jju_half = pack1.i3; + } + +}; + + template class SNAKokkos { @@ -263,7 +325,7 @@ class SNAKokkos { //use indexlist instead of loops, constructor generates these // Same across all SNAKokkos - Kokkos::View idxz; + Kokkos::View idxz; Kokkos::View idxb; Kokkos::View idxcg_block; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8d81a4a65b..ee2237d31d 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -253,21 +253,20 @@ void SNAKokkos::build_indexlist() for (int mb = 0; 2*mb 
<= j; mb++) for (int ma = 0; ma <= j; ma++) { - h_idxz(idxz_count,0) = j1; - h_idxz(idxz_count,1) = j2; - h_idxz(idxz_count,2) = j; - h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2); - h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2; - h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2); - h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2; - h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1; - h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1; + int ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2); + int ma2max = (2 * ma - j - (2 * ma1min - j1) + j2) / 2; + int mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2); + int mb2max = (2 * mb - j - (2 * mb1min - j1) + j2) / 2; + int na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - ma1min + 1; + int nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - mb1min + 1; // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // ylist is "compressed" via symmetry in its // contraction with dulist const int jju_half = h_idxu_half_block[j] + (j+1)*mb + ma; - h_idxz(idxz_count,9) = jju_half; + + // idxz_struct's constructor handles all of the data packing + h_idxz(idxz_count) = idxz_struct(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half); idxz_count++; } @@ -663,15 +662,8 @@ KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) { - const int j1 = idxz(jjz, 0); - const int j2 = idxz(jjz, 1); - const int j = idxz(jjz, 2); - const int ma1min = idxz(jjz, 3); - const int ma2max = idxz(jjz, 4); - const int mb1min = idxz(jjz, 5); - const int mb2max = idxz(jjz, 6); - const int na = idxz(jjz, 7); - const int nb = idxz(jjz, 8); + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb; + idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb); const real_type* cgblock = cglist.data() + idxcg_block(j1, j2, j); @@ -792,16 +784,8 @@ void 
SNAKokkos::compute_yi(int iatom_mod, const Kokkos::View &beta_pack) { - const int j1 = idxz(jjz, 0); - const int j2 = idxz(jjz, 1); - const int j = idxz(jjz, 2); - const int ma1min = idxz(jjz, 3); - const int ma2max = idxz(jjz, 4); - const int mb1min = idxz(jjz, 5); - const int mb2max = idxz(jjz, 6); - const int na = idxz(jjz, 7); - const int nb = idxz(jjz, 8); - const int jju_half = idxz(jjz, 9); + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half; + idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half); const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; @@ -840,10 +824,9 @@ KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, const Kokkos::View &beta_pack) { - const int j1 = idxz(jjz, 0); - const int j2 = idxz(jjz, 1); - const int j = idxz(jjz, 2); - const int jju_half = idxz(jjz, 9); + int j1, j2, j, jju_half; + idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); + int idouble = 0; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { @@ -1259,15 +1242,8 @@ void SNAKokkos::compute_zi_cpu(const int& const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; - const int j1 = idxz(jjz, 0); - const int j2 = idxz(jjz, 1); - const int j = idxz(jjz, 2); - const int ma1min = idxz(jjz, 3); - const int ma2max = idxz(jjz, 4); - const int mb1min = idxz(jjz, 5); - const int mb2max = idxz(jjz, 6); - const int na = idxz(jjz, 7); - const int nb = idxz(jjz, 8); + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb; + idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb); const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); @@ -1429,16 +1405,8 @@ void SNAKokkos::compute_yi_cpu(int iter, const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; - const int j1 = idxz(jjz, 0); - const int j2 = idxz(jjz, 1); - const int j = idxz(jjz, 2); - const 
int ma1min = idxz(jjz, 3); - const int ma2max = idxz(jjz, 4); - const int mb1min = idxz(jjz, 5); - const int mb2max = idxz(jjz, 6); - const int na = idxz(jjz, 7); - const int nb = idxz(jjz, 8); - const int jju_half = idxz(jjz, 9); + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half; + idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half); const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;