From bb1cb2e5a833943178b6e5752a8ea066669f1e0e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 5 Sep 2019 10:23:52 -0600 Subject: [PATCH 1/5] Add transpose for ulisttot --- src/KOKKOS/sna_kokkos.h | 3 ++- src/KOKKOS/sna_kokkos_impl.h | 17 ++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 2dbfdcb47c..1e83a621f0 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -129,7 +129,8 @@ inline int twojmax, diagonalstyle; t_sna_2d blist; - t_sna_2c_cpu ulisttot; + t_sna_2c ulisttot; + t_sna_2c_cpu ulisttot_lr; t_sna_2c zlist; t_sna_3c ulist; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 36765e9cd6..2f63a8353b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -225,7 +225,8 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) dedr = t_sna_3d("sna:dedr",natom,nmax,3); blist = t_sna_2d("sna:blist",natom,idxb_max); - ulisttot = t_sna_2c_cpu("sna:ulisttot",natom,idxu_max); + ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); + ulisttot_lr = t_sna_2c_cpu("sna:ulisttot_lr",natom,idxu_max); zlist = t_sna_2c("sna:zlist",natom,idxz_max); ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); @@ -281,6 +282,11 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::value) + ulisttot_lr = ulisttot; + else + Kokkos::deep_copy(ulisttot_lr,ulisttot); } template @@ -400,8 +406,6 @@ void SNAKokkos::compute_yi(const typename Kokkos::TeamPolicy::compute_yi(const typename Kokkos::TeamPolicy::compute_yi(const typename Kokkos::TeamPolicy::memory_usage() bytes += natom * idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot + bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxz_max * sizeof(double) * 2; // zlist From 9b955a6c1cfbca201d731abc710d1d36e0e49eda Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 6 Sep 2019 14:49:14 -0600 Subject: [PATCH 2/5] Some tweaks --- src/KOKKOS/pair_snap_kokkos_impl.h | 3 +++ src/KOKKOS/sna_kokkos.h | 29 ++++++++++++++++++++++++++--- src/KOKKOS/sna_kokkos_impl.h | 19 +++++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 95afcc5ec7..c2b56ebf21 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -234,6 +234,9 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) typename Kokkos::TeamPolicy policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,ui_vector_length); Kokkos::parallel_for("ComputeUi",policy_ui,*this); + //Ulisttot transpose + snaKK.transpose_ulisttot(); + //Compute bispectrum if (quadraticflag || eflag) { //ComputeZi diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 1e83a621f0..e1c8d56a18 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -53,12 +53,32 @@ public: typedef Kokkos::View t_sna_1c; typedef Kokkos::View > t_sna_1c_atomic; typedef Kokkos::View t_sna_2c; - typedef Kokkos::View t_sna_2c_cpu; + typedef Kokkos::View t_sna_2c_lr; typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_4c; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; +// Helper class to get ulisttot_r + +template +class UlisttotHelper { +public: + inline + static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { + Kokkos::deep_copy(ulisttot_lr,ulisttot); + } +}; + +template +class UlisttotHelper { +public: + inline + static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { + ulisttot_lr = ulisttot; + } +}; + inline SNAKokkos() {}; KOKKOS_INLINE_FUNCTION @@ -80,6 +100,9 @@ inline int ncoeff; +inline + void transpose_ulisttot(); + // functions for bispectrum coefficients KOKKOS_INLINE_FUNCTION void pre_ui(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP @@ -130,11 +153,11 @@ inline t_sna_2d blist; t_sna_2c ulisttot; - t_sna_2c_cpu ulisttot_lr; + t_sna_2c_lr ulisttot_lr; t_sna_2c zlist; t_sna_3c ulist; - t_sna_2c ylist; + t_sna_2c_lr ylist; // derivatives of data t_sna_4c dulist; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 2f63a8353b..1eb4d0d764 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -226,11 +226,12 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) blist = t_sna_2d("sna:blist",natom,idxb_max); ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); - ulisttot_lr = t_sna_2c_cpu("sna:ulisttot_lr",natom,idxu_max); + if (!Kokkos::Impl::is_same::value) + ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max); zlist = t_sna_2c("sna:zlist",natom,idxz_max); ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); - ylist = t_sna_2c("sna:ylist",natom,idxu_max); + ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); } @@ -282,11 +283,6 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::value) - ulisttot_lr = ulisttot; - else - Kokkos::deep_copy(ulisttot_lr,ulisttot); } template @@ -788,6 +784,12 @@ void SNAKokkos::compute_uarray(const typename Kokkos::TeamPolicy +void SNAKokkos::transpose_ulisttot() +{ + UlisttotHelper::transpose(ulisttot_lr,ulisttot); +} + /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor see comments in compute_uarray() @@ -1320,7 +1322,8 @@ double SNAKokkos::memory_usage() bytes += natom * idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot - bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr + if (!Kokkos::Impl::is_same::value) + bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxz_max * sizeof(double) * 2; // zlist From 2be848e5df604d491e21fb11983c474652428f67 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 25 Sep 2019 15:41:45 -0600 Subject: [PATCH 3/5] Remove team from compute_yi --- src/KOKKOS/pair_snap_kokkos.h | 2 +- src/KOKKOS/pair_snap_kokkos_impl.h | 8 ++++---- src/KOKKOS/sna_kokkos.h | 2 +- src/KOKKOS/sna_kokkos_impl.h | 26 ++++++++++++++------------ 4 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 2193e9ff24..c37236df36 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -85,7 +85,7 @@ public: void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeYi,const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 95afcc5ec7..df16309fc5 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -250,7 +250,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeBeta",policy_beta,*this); //ComputeYi - typename Kokkos::TeamPolicy policy_yi(chunk_size,yi_team_size,vector_length); + //typename Kokkos::TeamPolicy policy_yi(chunk_size,yi_team_size,vector_length); + typename Kokkos::RangePolicy policy_yi(0,chunk_size); Kokkos::parallel_for("ComputeYi",policy_yi,*this); //ComputeDuidrj @@ -529,10 +530,9 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int &ii) const { SNAKokkos my_sna = snaKK; - my_sna.compute_yi(team,ii,d_beta); + my_sna.compute_yi(ii,d_beta); } template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 2dbfdcb47c..589b0ba971 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -90,7 +90,7 @@ inline KOKKOS_INLINE_FUNCTION void compute_zi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(const typename Kokkos::TeamPolicy::member_type& team, int, + void compute_yi(int, const Kokkos::View &beta); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 36765e9cd6..4eb3ae9a4e 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -386,25 +386,27 @@ void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(const typename Kokkos::TeamPolicy::member_type& team, int iatom, +void SNAKokkos::compute_yi(int iatom, const Kokkos::View &beta) { double betaj; const int ii = iatom; - { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)), - [&] (const int& i) { + //{ + //Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)), + // [&] (const int& i) { + for (int i = 0; i < ylist.extent(1); i++) { ylist(iatom,i).re = 0.0; ylist(iatom,i).im = 0.0; - }); - } + } + // }); + //} //int flopsum = 0; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), - [&] (const int& jjz) { - //for(int jjz = 0; jjz < idxz_max; jjz++) { + //Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), + // [&] (const int& jjz) { + for (int jjz = 0; jjz < idxz_max; jjz++) { const int j1 = idxz[jjz].j1; const int j2 = idxz[jjz].j2; const int j = idxz[jjz].j; @@ -474,12 +476,12 @@ void SNAKokkos::compute_yi(const typename Kokkos::TeamPolicy Date: Wed, 25 Sep 2019 16:15:25 -0600 Subject: [PATCH 4/5] WIP --- src/KOKKOS/pair_snap_kokkos_impl.h | 3 ++- src/KOKKOS/sna_kokkos.h | 3 ++- src/KOKKOS/sna_kokkos_impl.h | 35 ++++++++++++++++-------------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index df16309fc5..ed701393e4 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -250,8 +250,9 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeBeta",policy_beta,*this); //ComputeYi + int idxz_max = snaKK.idxz_max; //typename Kokkos::TeamPolicy policy_yi(chunk_size,yi_team_size,vector_length); - typename Kokkos::RangePolicy policy_yi(0,chunk_size); + typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeYi",policy_yi,*this); //ComputeDuidrj diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 589b0ba971..88cf963ee8 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -138,6 +138,8 @@ inline // derivatives of data t_sna_4c dulist; + int idxcg_max, idxu_max, idxz_max, idxb_max; + private: double rmin0, rfac0; @@ -145,7 +147,6 @@ private: // Same across all SNAKokkos Kokkos::View idxz; Kokkos::View idxb; - int idxcg_max, idxu_max, idxz_max, idxb_max; Kokkos::View idxcg_block; Kokkos::View idxu_block; Kokkos::View idxz_block; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 4eb3ae9a4e..60928a6515 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -386,27 +386,30 @@ void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iatom, +void SNAKokkos::compute_yi(int iter, const Kokkos::View &beta) { double betaj; - const int ii = iatom; + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; + const int jju = idxz[jjz].jju; //{ //Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)), // [&] (const int& i) { - for (int i = 0; i < ylist.extent(1); i++) { - ylist(iatom,i).re = 0.0; - ylist(iatom,i).im = 0.0; - } + //for (int i = 0; i < ylist.extent(1); i++) { + ylist(iatom,jju) = {0.0,0.0}; + // } // }); //} + + //int flopsum = 0; //Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), // [&] (const int& jjz) { - for (int jjz = 0; jjz < idxz_max; jjz++) { + //for (int jjz = 0; jjz < idxz_max; jjz++) { const int j1 = idxz[jjz].j1; const int j2 = idxz[jjz].j2; const int j = idxz[jjz].j; @@ -453,27 +456,27 @@ void SNAKokkos::compute_yi(int iatom, } // end loop over ib // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(ii,jjb) entries + // find right y_list[jju] and beta(iatom,jjb) entries // multiply and divide by j+1 factors // account for multiplicity of 1, 2, or 3 - const int jju = idxz[jjz].jju; + //const int jju = idxz[jjz].jju; // pick out right beta value if (j >= j1) { const int jjb = idxb_block(j1,j2,j); if (j1 == j) { - if (j2 == j) betaj = 3*beta(ii,jjb); - else betaj = 2*beta(ii,jjb); - } else betaj = beta(ii,jjb); + if (j2 == j) betaj = 3*beta(iatom,jjb); + else betaj = 2*beta(iatom,jjb); + } else betaj = beta(iatom,jjb); } else if (j >= j2) { const int jjb = idxb_block(j,j2,j1); - if (j2 == j) betaj = 2*beta(ii,jjb)*(j1+1)/(j+1.0); - else betaj = beta(ii,jjb)*(j1+1)/(j+1.0); + if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0); + else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); } else { const int jjb = idxb_block(j2,j,j1); - betaj = beta(ii,jjb)*(j1+1)/(j+1.0); + betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); } //Kokkos::single(Kokkos::PerThread(team), [&] () { @@ -481,7 +484,7 @@ void SNAKokkos::compute_yi(int iatom, Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i); //}); - }//); // end loop over jjz + //}//); // end loop over jjz //printf("sum %i\n",flopsum); } From 67a284c28ada485fbd99dad05d96ea3c2478cef6 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 26 Sep 2019 11:44:12 -0600 Subject: [PATCH 5/5] Collapse loop in compute_zi --- src/KOKKOS/pair_snap_kokkos.h | 2 +- src/KOKKOS/pair_snap_kokkos_impl.h | 8 ++-- src/KOKKOS/sna_kokkos.h | 32 ++++--------- src/KOKKOS/sna_kokkos_impl.h | 76 +++++++++++++++--------------- 4 files changed, 52 insertions(+), 66 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 7f2a6a14d5..8586c4bdab 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -80,7 +80,7 @@ public: void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeZi,const int& ii) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 0c4bc79c54..02c8554fa5 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -230,7 +230,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) //Compute bispectrum if (quadraticflag || eflag) { //ComputeZi - typename Kokkos::TeamPolicy policy_zi(chunk_size,team_size,vector_length); + int idxz_max = snaKK.idxz_max; + typename Kokkos::RangePolicy policy_zi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeZi",policy_zi,*this); //ComputeBi @@ -540,10 +541,9 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int& ii) template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int& ii) const { SNAKokkos my_sna = snaKK; - my_sna.compute_zi(team,ii); + my_sna.compute_zi(ii); } template diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index af16daf970..7aa154c3d5 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -29,27 +29,17 @@ typedef double SNAreal; //typedef struct { SNAreal re, im; } SNAcomplex; struct alignas(2*sizeof(SNAreal)) SNAcomplex{ - SNAreal re, im; + SNAreal re, im; - KOKKOS_INLINE_FUNCTION - SNAcomplex() : re(0),im(0) - {;} + KOKKOS_INLINE_FUNCTION + SNAcomplex() : re(0),im(0) + {} - KOKKOS_INLINE_FUNCTION - SNAcomplex(SNAreal real_in, SNAreal imag_in) - :re(real_in),im(imag_in) - {;} - -#ifdef __CUDACC__ - KOKKOS_INLINE_FUNCTION - SNAcomplex(double2 ref) - :re(ref.x),im(ref.y) - {;} - - operator double2() { return make_double2(re,im); } -#endif - -} ; + KOKKOS_INLINE_FUNCTION + SNAcomplex(SNAreal real_in, SNAreal imag_in) + :re(real_in),im(imag_in) + {} +}; //struct SNAKK_ZINDICES { // int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; @@ -125,8 +115,6 @@ inline inline void transpose_ulisttot(); -inline - void zero_yi(); // functions for bispectrum coefficients KOKKOS_INLINE_FUNCTION @@ -136,7 +124,7 @@ inline KOKKOS_INLINE_FUNCTION void compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_zi(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + void compute_zi(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void zero_yi(const int&); KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 38f7b16ed4..3e4ebc2e42 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -289,54 +289,52 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_zi(const int& iter) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), - [&] (const int& jjz) { - //for(int jjz = 0; jjz < idxz_max; jjz++) { - const int j1 = idxz(jjz,0); - const int j2 = idxz(jjz,1); - const int j = idxz(jjz,2); - const int ma1min = idxz(jjz,3); - const int ma2max = idxz(jjz,4); - const int mb1min = idxz(jjz,5); - const int mb2max = idxz(jjz,6); - const int na = idxz(jjz,7); - const int nb = idxz(jjz,8); + const int iatom = iter / idxz_max; + const int jjz = iter % idxz_max; - const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); + const int j1 = idxz(jjz,0); + const int j2 = idxz(jjz,1); + const int j = idxz(jjz,2); + const int ma1min = idxz(jjz,3); + const int ma2max = idxz(jjz,4); + const int mb1min = idxz(jjz,5); + const int mb2max = idxz(jjz,6); + const int na = idxz(jjz,7); + const int nb = idxz(jjz,8); - zlist(iatom,jjz).re = 0.0; - zlist(iatom,jjz).im = 0.0; + const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for(int ib = 0; ib < nb; ib++) { + zlist(iatom,jjz).re = 0.0; + zlist(iatom,jjz).im = 0.0; - double suma1_r = 0.0; - double suma1_i = 0.0; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - for(int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia + double suma1_r = 0.0; + double suma1_i = 0.0; - zlist(iatom,jjz).re += cgblock[icgb] * suma1_r; - zlist(iatom,jjz).im += cgblock[icgb] * suma1_i; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); + suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - jju1 += j1+1; - jju2 -= j2+1; - icgb += j2; - } // end loop over ib + zlist(iatom,jjz).re += cgblock[icgb] * suma1_r; + zlist(iatom,jjz).im += cgblock[icgb] * suma1_i; - }); // end loop over jjz + jju1 += j1+1; + jju2 -= j2+1; + icgb += j2; + } // end loop over ib } /* ----------------------------------------------------------------------