Added less-parallelism-higher-perf paths to recursive polynomial routines. Fixed qSNAP memory coalescing issue. Various modularity improvements.

This commit is contained in:
Evan Weinberg
2021-08-26 22:55:07 -04:00
parent a587f67423
commit a686ce33c6
4 changed files with 642 additions and 458 deletions

View File

@ -44,7 +44,8 @@ struct TagPairSNAPComputeForce{};
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPComputeCayleyKlein{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPBeta{};
@ -53,7 +54,9 @@ struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrj{};
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
template<int dir>
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
// CPU backend only
struct TagPairSNAPComputeNeighCPU{};
@ -143,7 +146,10 @@ public:
void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
void operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
@ -168,7 +174,11 @@ public:
template<int dir>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const;
void operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const;
template<int dir>
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const;
// CPU backend only
KOKKOS_INLINE_FUNCTION

View File

@ -341,18 +341,32 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
// ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `pair_snap_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
if (chunk_size < 16384)
{
// Version with parallelism over j_bend
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUi> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiSmall> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (max_neighs)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagPairSNAPComputeUiLarge> policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
@ -412,25 +426,51 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in,
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(2 * team_size_compute_fused_deidrj * tile_size);
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
if (chunk_size < 16384)
{
// Version with parallelism over j_bend
// x direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrj<0>",policy_fused_deidrj_x,*this);
// total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
// y direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrj<1>",policy_fused_deidrj_y,*this);
// x direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this);
// z direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrj<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrj<2>",policy_fused_deidrj_z,*this);
// y direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this);
// z direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjSmall<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (max_neighs)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj;
// x direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<0> > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this);
// y direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<1> > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this);
// z direction
SnapAoSoATeamPolicy<DeviceType, team_size_compute_fused_deidrj, TagPairSNAPComputeFusedDeidrjLarge<2> > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length);
policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this);
}
}
#endif // LMP_KOKKOS_GPU
@ -603,13 +643,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
auto bveci = my_sna.blist(idxb, idx_chem, ii);
real_type bveci = my_sna.blist(ii, idx_chem, idxb);
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
k++;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
const auto jdxb = jcoeff % idxb_max;
const auto jdx_chem = jcoeff / idxb_max;
real_type bvecj = my_sna.blist(jdxb, jdx_chem, ii);
real_type bvecj = my_sna.blist(ii, jdx_chem, jdxb);
d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
k++;
@ -736,7 +776,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi>::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiSmall>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
@ -756,11 +796,37 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui(team,iatom_mod, jbend, jj, iatom_div);
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiLarge>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * max_neighs;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
@ -861,9 +927,9 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
for (int itriple = 0; itriple < ntriples; itriple++) {
const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
my_sna.blist(idxb, itriple, iatom) = blocal;
my_sna.blist(iatom, itriple, idxb) = blocal;
}
}
@ -871,7 +937,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrj<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj<dir> >::member_type& team) const {
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjSmall<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjSmall<dir> >::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
@ -891,12 +957,38 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.template compute_fused_deidrj<dir>(team, iatom_mod, jbend, jj, iatom_div);
my_sna.template compute_fused_deidrj_small<dir>(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
template<int dir>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrjLarge<dir>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrjLarge<dir> >::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - max_neighs * iatom_div;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.template compute_fused_deidrj_large<dir>(team, iatom_mod, jj, iatom_div);
});
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the CPU codepath. These do not take
advantage of AoSoA data layouts, but that could be a good point of
@ -925,13 +1017,13 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
auto bveci = my_sna.blist(idxb,idx_chem,ii);
real_type bveci = my_sna.blist(ii,idx_chem,idxb);
d_beta(icoeff,ii) += d_coeffi[k]*bveci;
k++;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
const auto jdxb = jcoeff % idxb_max;
const auto jdx_chem = jcoeff / idxb_max;
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
real_type bvecj = my_sna.blist(ii,jdx_chem,jdxb);
d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
k++;
@ -1221,7 +1313,7 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
evdwl += d_coeffi[icoeff+1]*my_sna.blist(idxb,idx_chem,ii);
evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,idx_chem,idxb);
}
// quadratic contributions
@ -1230,12 +1322,12 @@ void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSN
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
auto bveci = my_sna.blist(idxb,idx_chem,ii);
real_type bveci = my_sna.blist(ii,idx_chem,idxb);
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
auto jdxb = jcoeff % idxb_max;
auto jdx_chem = jcoeff / idxb_max;
auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
auto bvecj = my_sna.blist(ii,jdx_chem,jdxb);
evdwl += d_coeffi[k++]*bveci*bvecj;
}
}

View File

@ -45,12 +45,12 @@ struct WignerWrapper {
{ ; }
KOKKOS_INLINE_FUNCTION
complex get(const int& ma) {
complex get(const int& ma) const {
return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
}
KOKKOS_INLINE_FUNCTION
void set(const int& ma, const complex& store) {
void set(const int& ma, const complex& store) const {
buffer[offset + 2 * vector_length * ma] = store.re;
buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
}
@ -122,8 +122,14 @@ inline
void compute_cayley_klein(const int&, const int&, const int&);
KOKKOS_INLINE_FUNCTION
void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
// version of the code with parallelism over j_bend
KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
void compute_ui_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
// version of the code without parallelism over j_bend
KOKKOS_INLINE_FUNCTION
void compute_ui_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi(const int&, const int&, const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
@ -135,6 +141,35 @@ inline
KOKKOS_INLINE_FUNCTION
void compute_bi(const int&, const int&, const int&); // ForceSNAP
// functions for derivatives, GPU only
// version of the code with parallelism over j_bend
template<int dir>
KOKKOS_INLINE_FUNCTION
void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
// version of the code without parallelism over j_bend
template<int dir>
KOKKOS_INLINE_FUNCTION
void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int); //ForceSNAP
// core "evaluation" functions that get plugged into "compute" functions
// plugged into compute_ui_small, compute_ui_large
KOKKOS_FORCEINLINE_FUNCTION
void evaluate_ui_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&, const int&,
const int&, const int&, const int&);
// plugged into compute_zi, compute_yi
KOKKOS_FORCEINLINE_FUNCTION
complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
const int&, const int&, const int&, const int&, const real_type*);
// plugged into compute_yi, compute_yi_with_zlist
KOKKOS_FORCEINLINE_FUNCTION
real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&,
const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &);
// plugged into compute_fused_deidrj_small, compute_fused_deidrj_large
KOKKOS_FORCEINLINE_FUNCTION
real_type evaluate_duidrj_jbend(const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
const WignerWrapper<real_type, vector_length>&, const complex&, const complex&, const real_type&,
const int&, const int&, const int&, const int&);
// functions for bispectrum coefficients, CPU only
KOKKOS_INLINE_FUNCTION
void pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
@ -148,11 +183,6 @@ inline
KOKKOS_INLINE_FUNCTION
void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
// functions for derivatives, GPU only
template<int dir>
KOKKOS_INLINE_FUNCTION
void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
// functions for derivatives, CPU only
KOKKOS_INLINE_FUNCTION
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
@ -168,23 +198,6 @@ inline
KOKKOS_INLINE_FUNCTION
void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein
static KOKKOS_FORCEINLINE_FUNCTION
void sincos_wrapper(double x, double* sin_, double *cos_) {
#ifdef __SYCL_DEVICE_ONLY__
*sin_ = sycl::sincos(x, cos_);
#else
sincos(x, sin_, cos_);
#endif
}
static KOKKOS_FORCEINLINE_FUNCTION
void sincos_wrapper(float x, float* sin_, float *cos_) {
#ifdef __SYCL_DEVICE_ONLY__
*sin_ = sycl::sincos(x, cos_);
#else
sincosf(x, sin_, cos_);
#endif
}
#ifdef TIMING_INFO
double* timers;
timespec starttime, endtime;
@ -207,7 +220,7 @@ inline
int twojmax, diagonalstyle;
t_sna_3d_ll blist;
t_sna_3d blist;
t_sna_3c_ll ulisttot;
t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only
t_sna_3c_ll zlist;

File diff suppressed because it is too large Load Diff