diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 2193e9ff24..8586c4bdab 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -39,6 +39,7 @@ struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; +struct TagPairSNAPZeroYi{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeDuidrj{}; struct TagPairSNAPComputeDeidrj{}; @@ -73,19 +74,22 @@ public: void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPPreUi,const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeZi,const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPZeroYi,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi,const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 95afcc5ec7..02c8554fa5 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -184,22 +184,12 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs(k_list), Kokkos::Experimental::Max(max_neighs)); int vector_length = 1; - int ui_vector_length = 1; int team_size = 1; - int yi_team_size = 1; int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); #ifdef KOKKOS_ENABLE_CUDA team_size = 32;//max_neighs; if (team_size*vector_length > team_size_max) team_size = team_size_max/vector_length; - - yi_team_size = 256; - if (yi_team_size*vector_length > team_size_max) - yi_team_size = team_size_max/vector_length; - - ui_vector_length = 8; - if (team_size*ui_vector_length > team_size_max) - team_size = team_size_max/ui_vector_length; #endif if (beta_max < inum) { @@ -227,17 +217,21 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); //PreUi - typename Kokkos::TeamPolicy policy_preui(chunk_size,team_size,vector_length); + typename Kokkos::RangePolicy policy_preui(0,chunk_size); Kokkos::parallel_for("PreUi",policy_preui,*this); //ComputeUi - typename Kokkos::TeamPolicy policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,ui_vector_length); + typename Kokkos::TeamPolicy policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,vector_length); Kokkos::parallel_for("ComputeUi",policy_ui,*this); + //Ulisttot transpose + snaKK.transpose_ulisttot(); + //Compute bispectrum if (quadraticflag || eflag) { //ComputeZi - typename Kokkos::TeamPolicy policy_zi(chunk_size,team_size,vector_length); + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_zi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeZi",policy_zi,*this); //ComputeBi @@ -250,7 +244,12 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeBeta",policy_beta,*this); //ComputeYi - typename Kokkos::TeamPolicy policy_yi(chunk_size,yi_team_size,vector_length); + typename Kokkos::RangePolicy policy_zero_yi(0,chunk_size); + Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); + + //ComputeYi + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeYi",policy_yi,*this); //ComputeDuidrj @@ -504,10 +503,9 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const int& ii) const { SNAKokkos my_sna = snaKK; - my_sna.pre_ui(team,ii); + my_sna.pre_ui(ii); } template @@ -529,18 +527,23 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const int& ii) const { SNAKokkos my_sna = snaKK; - my_sna.compute_yi(team,ii,d_beta); + my_sna.zero_yi(ii); } template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int& ii) const { SNAKokkos my_sna = snaKK; - my_sna.compute_zi(team,ii); + my_sna.compute_yi(ii,d_beta); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int& ii) const { + SNAKokkos my_sna = snaKK; + my_sna.compute_zi(ii); } template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 2dbfdcb47c..7aa154c3d5 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -26,15 +26,28 @@ namespace LAMMPS_NS { typedef double SNAreal; -typedef struct { SNAreal re, im; } SNAcomplex; -struct SNAKK_ZINDICES { - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; +//typedef struct { SNAreal re, im; } SNAcomplex; +struct alignas(2*sizeof(SNAreal)) SNAcomplex{ + SNAreal re, im; + + KOKKOS_INLINE_FUNCTION + SNAcomplex() : re(0),im(0) + {} + + KOKKOS_INLINE_FUNCTION + SNAcomplex(SNAreal real_in, SNAreal imag_in) + :re(real_in),im(imag_in) + {} }; -struct SNAKK_BINDICES { - int j1, j2, j; -}; +//struct SNAKK_ZINDICES { +// int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; +//}; +// +//struct SNAKK_BINDICES { +// int j1, j2, j; +//}; template class SNAKokkos { @@ -53,12 +66,32 @@ public: typedef Kokkos::View t_sna_1c; typedef Kokkos::View > t_sna_1c_atomic; typedef Kokkos::View t_sna_2c; - typedef Kokkos::View t_sna_2c_cpu; + typedef Kokkos::View t_sna_2c_lr; typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_4c; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; +// Helper class to get ulisttot_r + +template +class UlisttotHelper { +public: + inline + static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { + Kokkos::deep_copy(ulisttot_lr,ulisttot); + } +}; + +template +class UlisttotHelper { +public: + inline + static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { + ulisttot_lr = ulisttot; + } +}; + inline SNAKokkos() {}; KOKKOS_INLINE_FUNCTION @@ -80,17 +113,22 @@ inline int ncoeff; +inline + void transpose_ulisttot(); + // functions for bispectrum coefficients KOKKOS_INLINE_FUNCTION - void pre_ui(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + void pre_ui(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + void compute_zi(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(const typename Kokkos::TeamPolicy::member_type& team, int, + void zero_yi(const int&); + KOKKOS_INLINE_FUNCTION + void compute_yi(int, const Kokkos::View &beta); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP @@ -129,23 +167,25 @@ inline int twojmax, diagonalstyle; t_sna_2d blist; - t_sna_2c_cpu ulisttot; + t_sna_2c ulisttot; + t_sna_2c_lr ulisttot_lr; t_sna_2c zlist; t_sna_3c ulist; - t_sna_2c ylist; + t_sna_2c_lr ylist; // derivatives of data t_sna_4c dulist; + int idxcg_max, idxu_max, idxz_max, idxb_max; + private: double rmin0, rfac0; //use indexlist instead of loops, constructor generates these // Same across all SNAKokkos - Kokkos::View idxz; - Kokkos::View idxb; - int idxcg_max, idxu_max, idxz_max, idxb_max; + Kokkos::View idxz; + Kokkos::View idxb; Kokkos::View idxcg_block; Kokkos::View idxu_block; Kokkos::View idxz_block; @@ -173,9 +213,9 @@ inline inline void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void zero_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int); // compute_ui + void zero_uarraytot(const int&); // compute_ui KOKKOS_INLINE_FUNCTION - void addself_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, double); // compute_ui + void addself_uarraytot(const int&, const double&); // compute_ui KOKKOS_INLINE_FUNCTION void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double); // compute_ui diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 36765e9cd6..3e4ebc2e42 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -117,7 +117,7 @@ void SNAKokkos::build_indexlist() if (j >= j1) idxb_count++; idxb_max = idxb_count; - idxb = Kokkos::View("SNAKokkos::idxb",idxb_max); + idxb = Kokkos::View("SNAKokkos::idxb",idxb_max); auto h_idxb = Kokkos::create_mirror_view(idxb); idxb_count = 0; @@ -125,9 +125,9 @@ void SNAKokkos::build_indexlist() for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) if (j >= j1) { - h_idxb[idxb_count].j1 = j1; - h_idxb[idxb_count].j2 = j2; - h_idxb[idxb_count].j = j; + h_idxb(idxb_count,0) = j1; + h_idxb(idxb_count,1) = j2; + h_idxb(idxb_count,2) = j; idxb_count++; } Kokkos::deep_copy(idxb,h_idxb); @@ -160,7 +160,7 @@ void SNAKokkos::build_indexlist() idxz_count++; idxz_max = idxz_count; - idxz = Kokkos::View("SNAKokkos::idxz",idxz_max); + idxz = Kokkos::View("SNAKokkos::idxz",idxz_max); auto h_idxz = Kokkos::create_mirror_view(idxz); idxz_block = Kokkos::View("SNAKokkos::idxz_block", jdim,jdim,jdim); @@ -178,20 +178,20 @@ void SNAKokkos::build_indexlist() for (int mb = 0; 2*mb <= j; mb++) for (int ma = 0; ma <= j; ma++) { - h_idxz[idxz_count].j1 = j1; - h_idxz[idxz_count].j2 = j2; - h_idxz[idxz_count].j = j; - h_idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2); - h_idxz[idxz_count].ma2max = (2 * ma - j - (2 * h_idxz[idxz_count].ma1min - j1) + j2) / 2; - h_idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz[idxz_count].ma1min + 1; - h_idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2); - h_idxz[idxz_count].mb2max = (2 * mb - j - (2 * h_idxz[idxz_count].mb1min - j1) + j2) / 2; - h_idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz[idxz_count].mb1min + 1; + h_idxz(idxz_count,0) = j1; + h_idxz(idxz_count,1) = j2; + h_idxz(idxz_count,2) = j; + h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2); + h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2; + h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2); + h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2; + h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1; + h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1; // apply to z(j1,j2,j,ma,mb) to unique element of y(j) const int jju = h_idxu_block[j] + (j+1)*mb + ma; - h_idxz[idxz_count].jju = jju; + h_idxz(idxz_count,9) = jju; idxz_count++; } @@ -225,11 +225,13 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) dedr = t_sna_3d("sna:dedr",natom,nmax,3); blist = t_sna_2d("sna:blist",natom,idxb_max); - ulisttot = t_sna_2c_cpu("sna:ulisttot",natom,idxu_max); + ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); + if (!Kokkos::Impl::is_same::value) + ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max); zlist = t_sna_2c("sna:zlist",natom,idxz_max); ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); - ylist = t_sna_2c("sna:ylist",natom,idxu_max); + ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); } @@ -240,15 +242,15 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::pre_ui(const int& iatom) { - if(team.team_rank() == 0) { - zero_uarraytot(team,iatom); + //if(team.team_rank() == 0) { + zero_uarraytot(iatom); //Kokkos::single(Kokkos::PerThread(team), [&] (){ - addself_uarraytot(team,iatom,wself); + addself_uarraytot(iatom,wself); //}); - } - team.team_barrier(); + //} + //team.team_barrier(); } /* ---------------------------------------------------------------------- @@ -278,50 +280,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnum) -{ - double rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - if(team.team_rank() == 0) { - zero_uarraytot(team,iatom); - //Kokkos::single(Kokkos::PerThread(team), [&] (){ - addself_uarraytot(team,iatom,wself); - //}); - } - team.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum), - [&] (const int& j) { - //for(int j = 0; j < jnum; j++) { - x = rij(iatom,j,0); - y = rij(iatom,j,1); - z = rij(iatom,j,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,j) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray(team, iatom, j, x, y, z, z0, r); - //Kokkos::single(Kokkos::PerThread(team), [&] (){ - add_uarraytot(team, iatom, j, r, wj(iatom,j), rcutij(iatom,j)); - //}); - }); - } /* ---------------------------------------------------------------------- @@ -330,54 +289,52 @@ void SNAKokkos::compute_ui_orig(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_zi(const int& iter) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), - [&] (const int& jjz) { - //for(int jjz = 0; jjz < idxz_max; jjz++) { - const int j1 = idxz[jjz].j1; - const int j2 = idxz[jjz].j2; - const int j = idxz[jjz].j; - const int ma1min = idxz[jjz].ma1min; - const int ma2max = idxz[jjz].ma2max; - const int na = idxz[jjz].na; - const int mb1min = idxz[jjz].mb1min; - const int mb2max = idxz[jjz].mb2max; - const int nb = idxz[jjz].nb; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; - const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); + const int j1 = idxz(jjz,0); + const int j2 = idxz(jjz,1); + const int j = idxz(jjz,2); + const int ma1min = idxz(jjz,3); + const int ma2max = idxz(jjz,4); + const int mb1min = idxz(jjz,5); + const int mb2max = idxz(jjz,6); + const int na = idxz(jjz,7); + const int nb = idxz(jjz,8); - zlist(iatom,jjz).re = 0.0; - zlist(iatom,jjz).im = 0.0; + const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for(int ib = 0; ib < nb; ib++) { + zlist(iatom,jjz).re = 0.0; + zlist(iatom,jjz).im = 0.0; - double suma1_r = 0.0; - double suma1_i = 0.0; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - for(int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + double suma1_r = 0.0; + double suma1_i = 0.0; - zlist(iatom,jjz).re += cgblock[icgb] * suma1_r; - zlist(iatom,jjz).im += cgblock[icgb] * suma1_i; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - jju1 += j1+1; - jju2 -= j2+1; - icgb += j2; - } // end loop over ib + zlist(iatom,jjz).re += cgblock[icgb] * suma1_r; + zlist(iatom,jjz).im += cgblock[icgb] * suma1_i; - }); // end loop over jjz + jju1 += j1+1; + jju2 -= j2+1; + icgb += j2; + } // end loop over ib } /* ---------------------------------------------------------------------- @@ -386,102 +343,94 @@ void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(const typename Kokkos::TeamPolicy::member_type& team, int iatom, +void SNAKokkos::zero_yi(const int& iatom) +{ + for (int j = 0; j < idxu_max; j++) + ylist(iatom,j) = {0.0,0.0}; +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi(int iter, const Kokkos::View &beta) { double betaj; - const int ii = iatom; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; - { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)), - [&] (const int& i) { - ylist(iatom,i).re = 0.0; - ylist(iatom,i).im = 0.0; - }); - } + const int j1 = idxz(jjz,0); + const int j2 = idxz(jjz,1); + const int j = idxz(jjz,2); + const int ma1min = idxz(jjz,3); + const int ma2max = idxz(jjz,4); + const int mb1min = idxz(jjz,5); + const int mb2max = idxz(jjz,6); + const int na = idxz(jjz,7); + const int nb = idxz(jjz,8); + const int jju = idxz(jjz,9); - //int flopsum = 0; + const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), - [&] (const int& jjz) { - //for(int jjz = 0; jjz < idxz_max; jjz++) { - const int j1 = idxz[jjz].j1; - const int j2 = idxz[jjz].j2; - const int j = idxz[jjz].j; - const int ma1min = idxz[jjz].ma1min; - const int ma2max = idxz[jjz].ma2max; - const int na = idxz[jjz].na; - const int mb1min = idxz[jjz].mb1min; - const int mb2max = idxz[jjz].mb2max; - const int nb = idxz[jjz].nb; + double ztmp_r = 0.0; + double ztmp_i = 0.0; - const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); - //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; - //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + double suma1_r = 0.0; + double suma1_i = 0.0; - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for(int ib = 0; ib < nb; ib++) { + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; - double suma1_r = 0.0; - double suma1_i = 0.0; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).re - ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).im + ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; + ztmp_r += cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1+1; + jju2 -= j2+1; + icgb += j2; + } // end loop over ib - for(int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); - //flopsum += 10; - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1+1; - jju2 -= j2+1; - icgb += j2; - } // end loop over ib - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(ii,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - const int jju = idxz[jjz].jju; + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 // pick out right beta value - if (j >= j1) { - const int jjb = idxb_block(j1,j2,j); - if (j1 == j) { - if (j2 == j) betaj = 3*beta(ii,jjb); - else betaj = 2*beta(ii,jjb); - } else betaj = beta(ii,jjb); - } else if (j >= j2) { - const int jjb = idxb_block(j,j2,j1); - if (j2 == j) betaj = 2*beta(ii,jjb)*(j1+1)/(j+1.0); - else betaj = beta(ii,jjb)*(j1+1)/(j+1.0); - } else { - const int jjb = idxb_block(j2,j,j1); - betaj = beta(ii,jjb)*(j1+1)/(j+1.0); - } + if (j >= j1) { + const int jjb = idxb_block(j1,j2,j); + if (j1 == j) { + if (j2 == j) betaj = 3*beta(iatom,jjb); + else betaj = 2*beta(iatom,jjb); + } else betaj = beta(iatom,jjb); + } else if (j >= j2) { + const int jjb = idxb_block(j,j2,j1); + if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0); + else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); + } else { + const int jjb = idxb_block(j2,j,j1); + betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); + } - Kokkos::single(Kokkos::PerThread(team), [&] () { - Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i); - }); - - }); // end loop over jjz - - //printf("sum %i\n",flopsum); + Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i); } /* ---------------------------------------------------------------------- @@ -556,9 +505,9 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::compute_duidrj(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::zero_uarraytot(const int& iatom) { { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)), - [&] (const int& i) { + //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)), + // [&] (const int& i) { + for (int i = 0; i < ulisttot.extent(1); i++) { ulisttot(iatom,i).re = 0.0; ulisttot(iatom,i).im = 0.0; - }); + } + //}); } } @@ -663,18 +614,18 @@ void SNAKokkos::zero_uarraytot(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::addself_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, double wself_in) +void SNAKokkos::addself_uarraytot(const int& iatom, const double& wself_in) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j) { - //for (int j = 0; j <= twojmax; j++) + //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), + // [&] (const int& j) { + for (int j = 0; j <= twojmax; j++) { int jju = idxu_block[j]; for (int ma = 0; ma <= j; ma++) { ulisttot(iatom,jju).re = wself_in; ulisttot(iatom,jju).im = 0.0; jju += j+2; } - }); + }//}); } /* ---------------------------------------------------------------------- @@ -786,6 +737,12 @@ void SNAKokkos::compute_uarray(const typename Kokkos::TeamPolicy +void SNAKokkos::transpose_ulisttot() +{ + UlisttotHelper::transpose(ulisttot_lr,ulisttot); +} + /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor see comments in compute_uarray() @@ -1318,6 +1275,8 @@ double SNAKokkos::memory_usage() bytes += natom * idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot + if (!Kokkos::Impl::is_same::value) + bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxz_max * sizeof(double) * 2; // zlist @@ -1329,8 +1288,8 @@ double SNAKokkos::memory_usage() bytes += jdim * jdim * jdim * sizeof(int); // idxz_block bytes += jdim * jdim * jdim * sizeof(int); // idxb_block - bytes += idxz_max * sizeof(SNAKK_ZINDICES); // idxz - bytes += idxb_max * sizeof(SNAKK_BINDICES); // idxb + bytes += idxz_max * 10 * sizeof(int); // idxz + bytes += idxb_max * 3 * sizeof(int); // idxb bytes += jdim * sizeof(double); // bzero