Added less-parallelism-higher-perf paths to recursive polynomial routines. Fixed qSNAP memory coalescing issue. Various modularity improvements.
This commit is contained in:
@ -44,7 +44,8 @@ struct TagPairSNAPComputeForce{};
|
||||
struct TagPairSNAPComputeNeigh{};
|
||||
struct TagPairSNAPComputeCayleyKlein{};
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUi{};
|
||||
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
|
||||
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPBeta{};
|
||||
@ -53,7 +54,9 @@ struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrj{};
|
||||
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
|
||||
|
||||
// CPU backend only
|
||||
struct TagPairSNAPComputeNeighCPU{};
|
||||
@ -143,7 +146,10 @@ public:
|
||||
void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
|
||||
void operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiSmall>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiLarge>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
|
||||
@ -168,7 +174,11 @@ public:
|
||||
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
|
||||
void operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const;
|
||||
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const;
|
||||
|
||||
// CPU backend only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
||||
@ -341,18 +341,32 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
// ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
|
||||
{
|
||||
// team_size_compute_ui is defined in `pair_snap_kokkos.h`
|
||||
|
||||
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
|
||||
const int tile_size = vector_length * (twojmax + 1);
|
||||
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
|
||||
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
|
||||
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
|
||||
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
|
||||
if (chunk_size < 16384)
|
||||
{
|
||||
// Version with parallelism over j_bend
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUi> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
|
||||
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
|
||||
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
|
||||
} else {
|
||||
// Version w/out parallelism over j_bend
|
||||
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs)
|
||||
const int n_teams = chunk_size_div * max_neighs;
|
||||
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
|
||||
}
|
||||
}
|
||||
|
||||
//TransformUi: un-"fold" ulisttot, zero ylist
|
||||
@ -412,25 +426,51 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
|
||||
const int tile_size = vector_length * (twojmax + 1);
|
||||
const int scratch_size = scratch_size_helper<complex>(2 * team_size_compute_fused_deidrj * tile_size);
|
||||
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
|
||||
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
|
||||
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
|
||||
if (chunk_size < 16384)
|
||||
{
|
||||
// Version with parallelism over j_bend
|
||||
|
||||
// x direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrj<0>",policy_fused_deidrj_x,*this);
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
|
||||
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
|
||||
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
|
||||
|
||||
// y direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrj<1>",policy_fused_deidrj_y,*this);
|
||||
// x direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this);
|
||||
|
||||
// z direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrj<2>",policy_fused_deidrj_z,*this);
|
||||
// y direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this);
|
||||
|
||||
// z direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this);
|
||||
} else {
|
||||
// Version w/out parallelism over j_bend
|
||||
|
||||
// total number of teams needed: (natoms / 32) * (max_neighs)
|
||||
const int n_teams = chunk_size_div * max_neighs;
|
||||
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
|
||||
|
||||
// x direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this);
|
||||
|
||||
// y direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this);
|
||||
|
||||
// z direction
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
|
||||
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // LMP_KOKKOS_GPU
|
||||
@ -603,13 +643,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
auto bveci = my_sna.blist(idxb, idx_chem, ii);
|
||||
real_type bveci = my_sna.blist(ii, idx_chem, idxb);
|
||||
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
const auto jdxb = jcoeff % idxb_max;
|
||||
const auto jdx_chem = jcoeff / idxb_max;
|
||||
real_type bvecj = my_sna.blist(jdxb, jdx_chem, ii);
|
||||
real_type bvecj = my_sna.blist(ii, jdx_chem, jdxb);
|
||||
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
|
||||
d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
@ -736,7 +776,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi>::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
@ -756,11 +796,37 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui(team,iatom_mod, jbend, jj, iatom_div);
|
||||
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - iatom_div * max_neighs;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
@ -861,9 +927,9 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
|
||||
for (int itriple = 0; itriple < ntriples; itriple++) {
|
||||
|
||||
const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
|
||||
const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
|
||||
|
||||
my_sna.blist(idxb, itriple, iatom) = blocal;
|
||||
my_sna.blist(iatom, itriple, idxb) = blocal;
|
||||
}
|
||||
|
||||
}
|
||||
@ -871,7 +937,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const {
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
@ -891,12 +957,38 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.template compute_fused_deidrj<dir>(team, iatom_mod, jbend, jj, iatom_div);
|
||||
my_sna.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - max_neighs * iatom_div;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are unique to the CPU codepath. These do not take
|
||||
advantage of AoSoA data layouts, but that could be a good point of
|
||||
@ -925,13 +1017,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
auto bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
real_type bveci = my_sna.blist(ii,idx_chem,idxb);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
const auto jdxb = jcoeff % idxb_max;
|
||||
const auto jdx_chem = jcoeff / idxb_max;
|
||||
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
real_type bvecj = my_sna.blist(ii,jdx_chem,jdxb);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
|
||||
d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
@ -1221,7 +1313,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,idx_chem,idxb);
|
||||
}
|
||||
|
||||
// quadratic contributions
|
||||
@ -1230,12 +1322,12 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
auto bveci = my_sna.blist(idxb,idx_chem,ii);
|
||||
real_type bveci = my_sna.blist(ii,idx_chem,idxb);
|
||||
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
auto jdxb = jcoeff % idxb_max;
|
||||
auto jdx_chem = jcoeff / idxb_max;
|
||||
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
|
||||
auto bvecj = my_sna.blist(ii,jdx_chem,jdxb);
|
||||
evdwl += d_coeffi[k++]*bveci*bvecj;
|
||||
}
|
||||
}
|
||||
|
||||
@ -45,12 +45,12 @@ struct WignerWrapper {
|
||||
{ ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex get(const int& ma) {
|
||||
complex get(const int& ma) const {
|
||||
return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void set(const int& ma, const complex& store) {
|
||||
void set(const int& ma, const complex& store) const {
|
||||
buffer[offset + 2 * vector_length * ma] = store.re;
|
||||
buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
|
||||
}
|
||||
@ -122,8 +122,14 @@ inline
|
||||
void compute_cayley_klein(const int&, const int&, const int&);
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
|
||||
|
||||
// version of the code with parallelism over j_bend
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
|
||||
void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
|
||||
// version of the code without parallelism over j_bend
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi(const int&, const int&, const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -135,6 +141,35 @@ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi(const int&, const int&, const int&); // ForceSNAP
|
||||
|
||||
// functions for derivatives, GPU only
|
||||
// version of the code with parallelism over j_bend
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
|
||||
// version of the code without parallelism over j_bend
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
|
||||
|
||||
// core "evaluation" functions that get plugged into "compute" functions
|
||||
// plugged into compute_ui_small, compute_ui_large
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, const int&,
|
||||
const int&, const int&, const int&);
|
||||
// plugged into compute_zi, compute_yi
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
|
||||
const int&, const int&, const int&, const int&, const real_type*);
|
||||
// plugged into compute_yi, compute_yi_with_zlist
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
|
||||
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &);
|
||||
// plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
real_type evaluate_duidrj_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
|
||||
const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
|
||||
const int&, const int&, const int&, const int&);
|
||||
|
||||
// functions for bispectrum coefficients, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
|
||||
@ -148,11 +183,6 @@ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
|
||||
|
||||
// functions for derivatives, GPU only
|
||||
template<int dir>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
|
||||
|
||||
// functions for derivatives, CPU only
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
|
||||
@ -168,23 +198,6 @@ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein
|
||||
|
||||
static KOKKOS_FORCEINLINE_FUNCTION
|
||||
void sincos_wrapper(double x, double* sin_, double *cos_) {
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
*sin_ = sycl::sincos(x, cos_);
|
||||
#else
|
||||
sincos(x, sin_, cos_);
|
||||
#endif
|
||||
}
|
||||
static KOKKOS_FORCEINLINE_FUNCTION
|
||||
void sincos_wrapper(float x, float* sin_, float *cos_) {
|
||||
#ifdef __SYCL_DEVICE_ONLY__
|
||||
*sin_ = sycl::sincos(x, cos_);
|
||||
#else
|
||||
sincosf(x, sin_, cos_);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef TIMING_INFO
|
||||
double* timers;
|
||||
timespec starttime, endtime;
|
||||
@ -207,7 +220,7 @@ inline
|
||||
|
||||
int twojmax, diagonalstyle;
|
||||
|
||||
t_sna_3d_ll blist;
|
||||
t_sna_3d blist;
|
||||
t_sna_3c_ll ulisttot;
|
||||
t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
|
||||
t_sna_3c_ll zlist;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user