From f1dfcaf514948935724df4b0b754adecfca3cf04 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 25 Jun 2019 14:47:21 -0600 Subject: [PATCH] WIP --- src/KOKKOS/pair_snap_kokkos.h | 25 +- src/KOKKOS/pair_snap_kokkos_impl.h | 381 +++++--- src/KOKKOS/sna_kokkos.h | 51 +- src/KOKKOS/sna_kokkos_impl.h | 1461 +++++++++++++++++----------- 4 files changed, 1183 insertions(+), 735 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index b2019879ed..8be0bf9afb 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -31,7 +31,10 @@ PairStyle(snap/kk/host,PairSNAPKokkos) namespace LAMMPS_NS { template -struct TagPairSNAP{}; +struct TagPairSNAPCompute{}; + +struct TagPairSNAPBeta{}; +struct TagPairSNAPBispectrum{}; template class PairSNAPKokkos : public PairSNAP { @@ -53,11 +56,17 @@ public: template KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAP,const typename Kokkos::TeamPolicy >::member_type& team) const; + void operator() (TagPairSNAPCompute,const typename Kokkos::TeamPolicy >::member_type& team) const; template KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAP,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT&) const; + void operator() (TagPairSNAPCompute,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPBispectrum,const typename Kokkos::TeamPolicy::member_type& team) const; template KOKKOS_INLINE_FUNCTION @@ -82,10 +91,14 @@ protected: SNAKokkos snaKK; // How much parallelism to use within an interaction - int vector_length; + int vector_length,team_size; + int team_scratch_size; + int thread_scratch_size; int eflag,vflag; + void compute_beta(); + void compute_bispectrum(); void allocate(); //void read_files(char *, char *); /*template @@ -117,7 +130,9 @@ inline double dist2(double* x,double* y); Kokkos::View d_radelem; // element radii Kokkos::View d_wjelem; // elements weights Kokkos::View d_coeffelem; // element bispectrum coefficients - Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_beta; // betas for all atoms in list + Kokkos::View d_bispectrum; // bispectrum components for all atoms in list typedef Kokkos::DualView tdual_fparams; tdual_fparams k_cutsq; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 0ec4ed0995..687c9dc7cb 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -186,31 +186,45 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) snaKK.nmax = max_neighs; - T_INT team_scratch_size = snaKK.size_team_scratch_arrays(); - T_INT thread_scratch_size = snaKK.size_thread_scratch_arrays(); + team_scratch_size = snaKK.size_team_scratch_arrays(); + thread_scratch_size = snaKK.size_thread_scratch_arrays(); //printf("Sizes: %i %i\n",team_scratch_size/1024,thread_scratch_size/1024); int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); - int vector_length = 8; + vector_length = 8; #ifdef KOKKOS_ENABLE_CUDA - int team_size = 32;//max_neighs; + team_size = 32;//max_neighs; if (team_size*vector_length > team_size_max) team_size = team_size_max/vector_length; #else - int team_size = 1; + team_size = 1; #endif + if (beta_max < list->inum) { // TODO: no init + d_beta = Kokkos::View("PairSNAPKokkos:beta", + list->inum,ncoeff); + d_bispectrum = Kokkos::View("PairSNAPKokkos:bispectrum", + list->inum,ncoeff); + beta_max = list->inum; + } + + // compute dE_i/dB_i = beta_i for all i in list + + if (quadraticflag || eflag) + compute_bispectrum(); + compute_beta(); + EV_FLOAT ev; if (eflag) { if (neighflag == HALF) { - typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); + typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); Kokkos::parallel_reduce(policy .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) ,*this,ev); } else if (neighflag == HALFTHREAD) { - typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); + typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); Kokkos::parallel_reduce(policy .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) @@ -218,13 +232,13 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } else { if (neighflag == HALF) { - typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); + typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); Kokkos::parallel_for(policy .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) ,*this); } else if (neighflag == HALFTHREAD) { - typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); + typename Kokkos::TeamPolicy > policy(inum,team_size,vector_length); Kokkos::parallel_for(policy .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) @@ -232,11 +246,6 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } -//static int step =0; -//step++; -//if (step%10==0) -// printf(" %e %e %e %e %e (%e %e): %e\n",t1,t2,t3,t4,t5,t6,t7,t1+t2+t3+t4+t5); - if (need_dup) Kokkos::Experimental::contribute(f, dup_f); @@ -275,6 +284,153 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } } +/* ---------------------------------------------------------------------- + compute beta +------------------------------------------------------------------------- */ + +template +void PairSNAPKokkos::compute_beta() +{ + // TODO: use RangePolicy instead, or thread over ncoeff? + int inum = list->inum; + typename Kokkos::TeamPolicy policy(inum,team_size,vector_length); + Kokkos::parallel_for(policy + .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) + .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) + ,*this); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy::member_type& team) const { + + const int ii = team.league_rank(); + const int i = d_ilist[ii]; + const int itype = type[i]; + const int ielem = map[itype]; + Kokkos::View> + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_beta(ii,icoeff) = d_coeffi[icoeff+1]; + + if (quadraticflag) { + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + double bveci = d_bispectrum(ii,icoeff); + d_beta(ii,icoeff) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + double bvecj = d_bispectrum(ii,jcoeff); + d_beta(ii,icoeff) += d_coeffi[k]*bvecj; + d_beta(ii,jcoeff) += d_coeffi[k]*bveci; + k++; + } + } + } +} + +/* ---------------------------------------------------------------------- + compute bispectrum +------------------------------------------------------------------------- */ + +template +void PairSNAPKokkos::compute_bispectrum() +{ + int inum = list->inum; + typename Kokkos::TeamPolicy policy(inum,team_size,vector_length); + Kokkos::parallel_for(policy + .set_scratch_size(1,Kokkos::PerThread(thread_scratch_size)) + .set_scratch_size(1,Kokkos::PerTeam(team_scratch_size)) + ,*this); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBispectrum,const typename Kokkos::TeamPolicy::member_type& team) const { + + const int ii = team.league_rank(); + const int i = d_ilist[ii]; + SNAKokkos my_sna(snaKK,team); + const double xtmp = x(i,0); + const double ytmp = x(i,1); + const double ztmp = x(i,2); + const int itype = type[i]; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + const int num_neighs = d_numneigh[i]; + + // rij[][3] = displacements between atom I and those neighbors + // inside = indices of neighbors of I within cutoff + // wj = weights for neighbors of I within cutoff + // rcutij = cutoffs for neighbors of I within cutoff + // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi + + int ninside = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,num_neighs), + [&] (const int jj, int& count) { + Kokkos::single(Kokkos::PerThread(team), [&] (){ + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + const int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const int elem_j = d_map[jtype]; + + if ( rsq < rnd_cutsq(itype,jtype) ) + count++; + }); + },ninside); + + if (team.team_rank() == 0) + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& offset, bool final) { + //for (int jj = 0; jj < num_neighs; jj++) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + const int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + const int elem_j = d_map[jtype]; + + if ( rsq < rnd_cutsq(itype,jtype) ) { + if (final) { + my_sna.rij(offset,0) = dx; + my_sna.rij(offset,1) = dy; + my_sna.rij(offset,2) = dz; + my_sna.inside[offset] = j; + my_sna.wj[offset] = d_wjelem[elem_j]; + my_sna.rcutij[offset] = (radi + d_radelem[elem_j])*rcutfac; + } + offset++; + } + }); + team.team_barrier(); + + // compute Ui, Zi, and Bi for atom I + + my_sna.compute_ui(team,ninside); + team.team_barrier(); + + my_sna.compute_zi(team); + team.team_barrier(); + + my_sna.compute_bi(team); + team.team_barrier(); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_bispectrum(ii,icoeff) = my_sna.blist[icoeff]; +} + /* ---------------------------------------------------------------------- allocate all arrays ------------------------------------------------------------------------- */ @@ -354,7 +510,7 @@ void PairSNAPKokkos::coeff(int narg, char **arg) template template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAP,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT& ev) const { +void PairSNAPKokkos::operator() (TagPairSNAPCompute,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT& ev) const { // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial @@ -364,12 +520,12 @@ void PairSNAPKokkos::operator() (TagPairSNAP,const const int ii = team.league_rank(); const int i = d_ilist[ii]; SNAKokkos my_sna(snaKK,team); - const double x_i = x(i,0); - const double y_i = x(i,1); - const double z_i = x(i,2); - const int type_i = type[i]; - const int elem_i = d_map[type_i]; - const double radi = d_radelem[elem_i]; + const double xtmp = x(i,0); + const double ytmp = x(i,1); + const double ztmp = x(i,2); + const int itype = type[i]; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; const int num_neighs = d_numneigh[i]; @@ -379,41 +535,38 @@ void PairSNAPKokkos::operator() (TagPairSNAP,const // rcutij = cutoffs for neighbors of I within cutoff // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi - //Kokkos::Timer timer; int ninside = 0; Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,num_neighs), [&] (const int jj, int& count) { Kokkos::single(Kokkos::PerThread(team), [&] (){ T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - x_i; - const F_FLOAT dy = x(j,1) - y_i; - const F_FLOAT dz = x(j,2) - z_i; + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; - const int type_j = type(j); + const int jtype = type(j); const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - const int elem_j = d_map[type_j]; + const int elem_j = d_map[jtype]; - if ( rsq < rnd_cutsq(type_i,type_j) ) + if ( rsq < rnd_cutsq(itype,jtype) ) count++; }); },ninside); - //t1 += timer.seconds(); timer.reset(); - if (team.team_rank() == 0) Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), - [&] (const int jj, int& offset, bool final){ + [&] (const int jj, int& offset, bool final) { //for (int jj = 0; jj < num_neighs; jj++) { T_INT j = d_neighbors(i,jj); - const F_FLOAT dx = x(j,0) - x_i; - const F_FLOAT dy = x(j,1) - y_i; - const F_FLOAT dz = x(j,2) - z_i; + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; - const int type_j = type(j); + const int jtype = type(j); const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - const int elem_j = d_map[type_j]; + const int elem_j = d_map[jtype]; - if ( rsq < rnd_cutsq(type_i,type_j) ) { + if ( rsq < rnd_cutsq(itype,jtype) ) { if (final) { my_sna.rij(offset,0) = dx; my_sna.rij(offset,1) = dy; @@ -425,157 +578,85 @@ void PairSNAPKokkos::operator() (TagPairSNAP,const offset++; } }); - - //t2 += timer.seconds(); timer.reset(); - team.team_barrier(); - // compute Ui, Zi, and Bi for atom I + + // compute Ui, Yi for atom I + my_sna.compute_ui(team,ninside); - //t3 += timer.seconds(); timer.reset(); team.team_barrier(); - my_sna.compute_zi(team); - //t4 += timer.seconds(); timer.reset(); - team.team_barrier(); - - if (quadraticflag) { - my_sna.compute_bi(team); - team.team_barrier(); - my_sna.copy_bi2bvec(team); - team.team_barrier(); - } // for neighbors of I within cutoff: - // compute dUi/drj and dBi/drj - // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj + // compute Fij = dEi/dRj = -dEi/dRi + // add to Fi, subtract from Fj + + my_sna.compute_yi(team,d_beta,ii); + team.team_barrier(); Kokkos::View> - d_coeffi(d_coeffelem,elem_i,Kokkos::ALL); + d_coeffi(d_coeffelem,ielem,Kokkos::ALL); Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside), [&] (const int jj) { //for (int jj = 0; jj < ninside; jj++) { int j = my_sna.inside[jj]; - //Kokkos::Timer timer2; my_sna.compute_duidrj(team,&my_sna.rij(jj,0), my_sna.wj[jj],my_sna.rcutij[jj]); - //t6 += timer2.seconds(); timer2.reset(); - my_sna.compute_dbidrj(team); - //t7 += timer2.seconds(); timer2.reset(); - my_sna.copy_dbi2dbvec(team); Kokkos::single(Kokkos::PerThread(team), [&] (){ - F_FLOAT fij[3]; - fij[0] = 0.0; - fij[1] = 0.0; - fij[2] = 0.0; - - // linear contributions - - for (int k = 1; k <= ncoeff; k++) { - double bgb = d_coeffi[k]; - fij[0] += bgb*my_sna.dbvec(k-1,0); - fij[1] += bgb*my_sna.dbvec(k-1,1); - fij[2] += bgb*my_sna.dbvec(k-1,2); - } - - if (quadraticflag) { - - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.bvec[icoeff]; - double fack = d_coeffi[k]*bveci; - double dbvecix = my_sna.dbvec(icoeff,0); - double dbveciy = my_sna.dbvec(icoeff,1); - double dbveciz = my_sna.dbvec(icoeff,2); - fij[0] += fack*dbvecix; - fij[1] += fack*dbveciy; - fij[2] += fack*dbveciz; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double facki = d_coeffi[k]*bveci; - double fackj = d_coeffi[k]*my_sna.bvec[jcoeff]; - fij[0] += facki*my_sna.dbvec(jcoeff,0)+fackj*dbvecix; - fij[1] += facki*my_sna.dbvec(jcoeff,1)+fackj*dbveciy; - fij[2] += facki*my_sna.dbvec(jcoeff,2)+fackj*dbveciz; - k++; + F_FLOAT fij[3]; + my_sna.compute_deidrj(team,fij); + + a_f(i,0) += fij[0]; + a_f(i,1) += fij[1]; + a_f(i,2) += fij[2]; + a_f(j,0) -= fij[0]; + a_f(j,1) -= fij[1]; + a_f(j,2) -= fij[2]; + + // tally global and per-atom virial contribution + + if (EVFLAG) { + if (vflag_either) { + v_tally_xyz(ev,i,j, + fij[0],fij[1],fij[2], + -my_sna.rij(jj,0),-my_sna.rij(jj,1), + -my_sna.rij(jj,2)); } } - } - - // Hard-coded ZBL potential - //const double dx = my_sna.rij(jj,0); - //const double dy = my_sna.rij(jj,1); - //const double dz = my_sna.rij(jj,2); - //const double fdivr = -1.5e6/pow(dx*dx + dy*dy + dz*dz,7.0); - //fij[0] += dx*fdivr; - //fij[1] += dy*fdivr; - //fij[2] += dz*fdivr; - - //OK - //printf("%lf %lf %lf %lf %lf %lf %lf %lf %lf SNAP-COMPARE: FIJ\n" - // ,x(i,0),x(i,1),x(i,2),x(j,0),x(j,1),x(j,2),fij[0],fij[1],fij[2] ); - a_f(i,0) += fij[0]; - a_f(i,1) += fij[1]; - a_f(i,2) += fij[2]; - a_f(j,0) -= fij[0]; - a_f(j,1) -= fij[1]; - a_f(j,2) -= fij[2]; - - // tally global and per-atom virial contribution - - if (EVFLAG) { - if (vflag_either) { - v_tally_xyz(ev,i,j, - fij[0],fij[1],fij[2], - -my_sna.rij(jj,0),-my_sna.rij(jj,1), - -my_sna.rij(jj,2)); - } - } - + }); }); - //t5 += timer.seconds(); timer.reset(); // tally energy contribution if (EVFLAG) { if (eflag_either) { - if (!quadraticflag) { - my_sna.compute_bi(team); - team.team_barrier(); - my_sna.copy_bi2bvec(team); - team.team_barrier(); - } - - // E = beta.B + 0.5*B^t.alpha.B - // coeff[k] = beta[k-1] or - // coeff[k] = alpha_ii or - // coeff[k] = alpha_ij = alpha_ji, j != i - Kokkos::single(Kokkos::PerTeam(team), [&] () { // evdwl = energy of atom I, sum over coeffs_k * Bi_k double evdwl = d_coeffi[0]; - + + // E = beta.B + 0.5*B^t.alpha.B + // linear contributions - // could use thread vector range on this loop - - for (int k = 1; k <= ncoeff; k++) - evdwl += d_coeffi[k]*my_sna.bvec[k-1]; - + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + evdwl += d_coeffi[icoeff+1]*d_bispectrum(ii,icoeff); + // quadratic contributions - + if (quadraticflag) { int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.bvec[icoeff]; + double bveci = d_bispectrum(ii,icoeff); evdwl += 0.5*d_coeffi[k++]*bveci*bveci; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - evdwl += d_coeffi[k++]*bveci*my_sna.bvec[jcoeff]; + double bvecj = d_bispectrum(ii,jcoeff); + evdwl += d_coeffi[k++]*bveci*bvecj; } } } @@ -591,9 +672,9 @@ void PairSNAPKokkos::operator() (TagPairSNAP,const template template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAP,const typename Kokkos::TeamPolicy >::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPCompute,const typename Kokkos::TeamPolicy >::member_type& team) const { EV_FLOAT ev; - this->template operator()(TagPairSNAP(), team, ev); + this->template operator()(TagPairSNAPCompute(), team, ev); } /* ---------------------------------------------------------------------- */ diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 40e5fe0ad4..ff2541dca3 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -25,7 +25,11 @@ namespace LAMMPS_NS { -struct SNAKK_LOOPINDICES { +struct SNAKK_ZINDICES { + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; +}; + +struct SNAKK_BINDICES { int j1, j2, j; }; @@ -35,9 +39,9 @@ class SNAKokkos { public: typedef Kokkos::View t_sna_1i; typedef Kokkos::View t_sna_1d; + typedef Kokkos::View > t_sna_1d_atomic; typedef Kokkos::View t_sna_2d; typedef Kokkos::View t_sna_3d; - typedef Kokkos::View > t_sna_3d_atomic; typedef Kokkos::View t_sna_4d; typedef Kokkos::View t_sna_3d3; typedef Kokkos::View t_sna_5d; @@ -76,9 +80,10 @@ inline KOKKOS_INLINE_FUNCTION void compute_zi(const typename Kokkos::TeamPolicy::member_type& team); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_bi(const typename Kokkos::TeamPolicy::member_type& team); // ForceSNAP + void compute_yi(const typename Kokkos::TeamPolicy::member_type& team, + const Kokkos::View &beta, const int ii); // ForceSNAP KOKKOS_INLINE_FUNCTION - void copy_bi2bvec(const typename Kokkos::TeamPolicy::member_type& team); //ForceSNAP + void compute_bi(const typename Kokkos::TeamPolicy::member_type& team); // ForceSNAP // functions for derivatives @@ -87,7 +92,7 @@ inline KOKKOS_INLINE_FUNCTION void compute_dbidrj(const typename Kokkos::TeamPolicy::member_type& team); //ForceSNAP KOKKOS_INLINE_FUNCTION - void copy_dbi2dbvec(const typename Kokkos::TeamPolicy::member_type& team); //ForceSNAP + void compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, double *); // ForceSNAP KOKKOS_INLINE_FUNCTION double compute_sfac(double, double); // add_uarraytot, compute_duarray KOKKOS_INLINE_FUNCTION @@ -114,37 +119,41 @@ inline int twojmax, diagonalstyle; // Per InFlight Particle - t_sna_3d barray; - t_sna_3d uarraytot_r, uarraytot_i; - t_sna_3d_atomic uarraytot_r_a, uarraytot_i_a; - t_sna_5d zarray_r, zarray_i; + t_sna_1d blist; + t_sna_1d ulisttot_r, ulisttot_i; + t_sna_1d_atomic ulisttot_r_a, ulisttot_i_a; + t_sna_1d zlist_r, zlist_i; // Per InFlight Interaction - t_sna_3d uarray_r, uarray_i; - - Kokkos::View bvec; + t_sna_1d ulist_r, ulist_i; + t_sna_1d ylist_r, ylist_i; // derivatives of data - Kokkos::View dbvec; - t_sna_4d duarray_r, duarray_i; - t_sna_4d dbarray; + t_sna_2d dulist_r, dulist_i; + t_sna_2d dblist; private: double rmin0, rfac0; //use indexlist instead of loops, constructor generates these - // Same accross all SNAKokkos - Kokkos::View idxj,idxj_full; - int idxj_max,idxj_full_max; + // Same across all SNAKokkos + Kokkos::View idxz; + Kokkos::View idxb; + int idxcg_max, idxu_max, idxz_max, idxb_max; + Kokkos::View idxcg_block; + Kokkos::View idxu_block; + Kokkos::View idxz_block; + Kokkos::View idxb_block; + // data for bispectrum coefficients // Same accross all SNAKokkos - t_sna_5d cgarray; + t_sna_1d cglist; t_sna_2d rootpqarray; - static const int nmaxfactorial = 167; - KOKKOS_INLINE_FUNCTION + static const double nfac_table[]; + inline double factorial(int); KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index c43003af97..4ca8ae4471 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -47,13 +47,13 @@ SNAKokkos::SNAKokkos(double rfac0_in, build_indexlist(); - int jdim = twojmax + 1; + int jdimpq = twojmax + 2; + rootpqarray = t_sna_2d("SNAKokkos::rootpqarray",jdimpq,jdimpq); - cgarray = t_sna_5d("SNAKokkos::cgarray",jdim,jdim,jdim,jdim,jdim); - rootpqarray = t_sna_2d("SNAKokkos::rootpqarray",jdim+1,jdim+1); + cglist = t_sna_1d("SNAKokkos::cglist",idxcg_max); if (bzero_flag) { - bzero = Kokkos::View("sna:bzero",jdim); + bzero = Kokkos::View("sna:bzero",twojmax+1); auto h_bzero = Kokkos::create_mirror_view(bzero); double www = wself*wself*wself; @@ -77,11 +77,17 @@ SNAKokkos::SNAKokkos(const SNAKokkos& sna, const typenam ncoeff = sna.ncoeff; nmax = sna.nmax; - idxj = sna.idxj; - idxj_max = sna.idxj_max; - idxj_full = sna.idxj_full; - idxj_full_max = sna.idxj_full_max; - cgarray = sna.cgarray; + idxz = sna.idxz; + idxb = sna.idxb; + idxcg_max = sna.idxcg_max; + idxu_max = sna.idxu_max; + idxz_max = sna.idxz_max; + idxb_max = sna.idxb_max; + idxcg_block = sna.idxcg_block; + idxu_block = sna.idxu_block; + idxz_block = sna.idxz_block; + idxb_block = sna.idxb_block; + cglist = sna.cglist; rootpqarray = sna.rootpqarray; bzero = sna.bzero; create_team_scratch_arrays(team); @@ -100,47 +106,133 @@ template inline void SNAKokkos::build_indexlist() { - int idxj_count = 0; - int idxj_full_count = 0; + // index list for cglist + + int jdim = twojmax + 1; + idxcg_block = Kokkos::View("SNAKokkos::idxcg_block",jdim,jdim,jdim); + auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block); + + int idxcg_count = 0; + for(int j1 = 0; j1 <= twojmax; j1++) + for(int j2 = 0; j2 <= j1; j2++) + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + h_idxcg_block(j1,j2,j) = idxcg_count; + for (int m1 = 0; m1 <= j1; m1++) + for (int m2 = 0; m2 <= j2; m2++) + idxcg_count++; + } + idxcg_max = idxcg_count; + Kokkos::deep_copy(idxcg_block,h_idxcg_block); + + // index list for uarray + // need to include both halves + + idxu_block = Kokkos::View("SNAKokkos::idxu_block",jdim); + auto h_idxu_block = Kokkos::create_mirror_view(idxu_block); + + int idxu_count = 0; + + for(int j = 0; j <= twojmax; j++) { + h_idxu_block[j] = idxu_count; + for(int mb = 0; mb <= j; mb++) + for(int ma = 0; ma <= j; ma++) + idxu_count++; + } + idxu_max = idxu_count; + Kokkos::deep_copy(idxu_block,h_idxu_block); + + // index list for beta and B + + int idxb_count = 0; + for(int j1 = 0; j1 <= twojmax; j1++) + for(int j2 = 0; j2 <= j1; j2++) + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) idxb_count++; + + idxb_max = idxb_count; + idxb = Kokkos::View("SNAKokkos::idxb",idxb_max); + auto h_idxb = Kokkos::create_mirror_view(idxb); + + idxb_count = 0; + for(int j1 = 0; j1 <= twojmax; j1++) + for(int j2 = 0; j2 <= j1; j2++) + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) { + h_idxb[idxb_count].j1 = j1; + h_idxb[idxb_count].j2 = j2; + h_idxb[idxb_count].j = j; + idxb_count++; + } + Kokkos::deep_copy(idxb,h_idxb); + + // reverse index list for beta and b + + idxb_block = Kokkos::View("SNAKokkos::idxb_block",jdim,jdim,jdim); + auto h_idxb_block = Kokkos::create_mirror_view(idxb_block); + + idxb_count = 0; + for(int j1 = 0; j1 <= twojmax; j1++) + for(int j2 = 0; j2 <= j1; j2++) + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + if (j >= j1) { + h_idxb_block(j1,j2,j) = idxb_count; + idxb_count++; + } + } + Kokkos::deep_copy(idxb_block,h_idxb_block); + + // index list for zlist + + int idxz_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) - for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) { - if (j >= j1) idxj_count++; - idxj_full_count++; - } - - // indexList can be changed here - - idxj = Kokkos::View("SNAKokkos::idxj",idxj_count); - idxj_full = Kokkos::View("SNAKokkos::idxj_full",idxj_full_count); - auto h_idxj = Kokkos::create_mirror_view(idxj); - auto h_idxj_full = Kokkos::create_mirror_view(idxj_full); - - idxj_max = idxj_count; - idxj_full_max = idxj_full_count; - - idxj_count = 0; - idxj_full_count = 0; + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxz_count++; + + idxz_max = idxz_count; + idxz = Kokkos::View("SNAKokkos::idxz",idxz_max); + auto h_idxz = Kokkos::create_mirror_view(idxz); + idxz_block = Kokkos::View("SNAKokkos::idxz_block", jdim,jdim,jdim); + auto h_idxz_block = Kokkos::create_mirror_view(idxz_block); + + idxz_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) - for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) { - if (j >= j1) { - h_idxj[idxj_count].j1 = j1; - h_idxj[idxj_count].j2 = j2; - h_idxj[idxj_count].j = j; - idxj_count++; - } - h_idxj_full[idxj_full_count].j1 = j1; - h_idxj_full[idxj_full_count].j2 = j2; - h_idxj_full[idxj_full_count].j = j; - idxj_full_count++; - } - Kokkos::deep_copy(idxj,h_idxj); - Kokkos::deep_copy(idxj_full,h_idxj_full); + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + h_idxz_block(j1,j2,j) = idxz_count; + // find right beta(ii,jjb) entry + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + h_idxz[idxz_count].j1 = j1; + h_idxz[idxz_count].j2 = j2; + h_idxz[idxz_count].j = j; + h_idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2); + h_idxz[idxz_count].ma2max = (2 * ma - j - (2 * h_idxz[idxz_count].ma1min - j1) + j2) / 2; + h_idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz[idxz_count].ma1min + 1; + h_idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2); + h_idxz[idxz_count].mb2max = (2 * mb - j - (2 * h_idxz[idxz_count].mb1min - j1) + j2) / 2; + h_idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz[idxz_count].mb1min + 1; + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + + const int jju = idxu_block[j] + (j+1)*mb + ma; + h_idxz[idxz_count].jju = jju; + + idxz_count++; + } + } + Kokkos::deep_copy(idxz,h_idxz); + Kokkos::deep_copy(idxz_block,h_idxz_block); } + /* ---------------------------------------------------------------------- */ template @@ -166,6 +258,7 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int jnum) { + //printf("jnum %i\n",jnum); double rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma @@ -211,93 +304,234 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_zi(const typename Kokkos::TeamPolicy::member_type& team) { - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // for ma = 0,...,j - // for mb = 0,...,jmid - // z(j1,j2,j,ma,mb) = 0 - // for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2) - // sumb1 = 0 - // ma2 = ma-ma1+(j1+j2-j)/2; - // for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2) - // mb2 = mb-mb1+(j1+j2-j)/2; - // sumb1 += cg(j1,mb1,j2,mb2,j) * - // u(j1,ma1,mb1) * u(j2,ma2,mb2) - // z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j) + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), + [&] (const int& jjz) { + //for(int jjz = 0; jjz < idxz_max; jjz++) { + const int j1 = idxz[jjz].j1; + const int j2 = idxz[jjz].j2; + const int j = idxz[jjz].j; + const int ma1min = idxz[jjz].ma1min; + const int ma2max = idxz[jjz].ma2max; + const int na = idxz[jjz].na; + const int mb1min = idxz[jjz].mb1min; + const int mb2max = idxz[jjz].mb2max; + const int nb = idxz[jjz].nb; -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &starttime); -#endif + const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); - // compute_dbidrj() requires full j1/j2/j chunk of z elements - // use zarray j1/j2 symmetry + zlist_r[jjz] = 0.0; + zlist_i[jjz] = 0.0; - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxj_full_max), - [&] (const int& idx) { - const int j1 = idxj_full(idx).j1; - const int j2 = idxj_full(idx).j2; - const int j = idxj_full(idx).j; + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { - const int bound = (j+2)/2; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma ) { - //for(int mb = 0; 2*mb <= j; mb++) - //for(int ma = 0; ma <= j; ma++) { - const int ma = mbma%(j+1); - const int mb = mbma/(j+1); + double suma1_r = 0.0; + double suma1_i = 0.0; - //zarray_r(j1,j2,j,ma,mb) = 0.0; - //zarray_i(j1,j2,j,ma,mb) = 0.0; - double z_r = 0.0; - double z_i = 0.0; + const double* u1_r = ulisttot_r.data() + jju1; + const double* u1_i = ulisttot_i.data() + jju1; + const double* u2_r = ulisttot_r.data() + jju2; + const double* u2_i = ulisttot_i.data() + jju2; - for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2); - ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) { - double sumb1_r = 0.0; - double sumb1_i = 0.0; + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]); + suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]); + ma1++; + ma2--; + icga += j2; + } // end loop over ia - const int ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2; + zlist_r[jjz] += cgblock[icgb] * suma1_r; + zlist_i[jjz] += cgblock[icgb] * suma1_i; + //printf("%i %i %i %g %g\n",j1,j2,j,cgblock[icgb],suma1_r); + jju1 += j1+1; + jju2 -= j2+1; + icgb += j2; + } // end loop over ib - for(int mb1 = MAX( 0, (2 * mb - j - j2 + j1) / 2); - mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) { - - const int mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2; - const double cga = cgarray(j1,j2,j,mb1,mb2); - const double uat1_r = uarraytot_r(j1,ma1,mb1); - const double uat1_i = uarraytot_i(j1,ma1,mb1); - const double uat2_r = uarraytot_r(j2,ma2,mb2); - const double uat2_i = uarraytot_i(j2,ma2,mb2); - sumb1_r += cga * (uat1_r * uat2_r - uat1_i * uat2_i); - sumb1_i += cga * (uat1_r * uat2_i + uat1_i * uat2_r); - /*sumb1_r += cgarray(j1,j2,j,mb1,mb2) * - (uarraytot_r(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2) - - uarraytot_i(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2)); - sumb1_i += cgarray(j1,j2,j,mb1,mb2) * - (uarraytot_r(j1,ma1,mb1) * uarraytot_i(j2,ma2,mb2) + - uarraytot_i(j1,ma1,mb1) * uarraytot_r(j2,ma2,mb2));*/ - } // end loop over mb1 - - const double cga = cgarray(j1,j2,j,ma1,ma2); - z_r += sumb1_r * cga;//rray(j1,j2,j,ma1,ma2); - z_i += sumb1_i * cga;//rray(j1,j2,j,ma1,ma2); - } // end loop over ma1 - zarray_r(j1,j2,j,mb,ma) = z_r; - zarray_i(j1,j2,j,mb,ma) = z_i; - }); // end loop over ma, mb - // } - //} - }); - //} // end loop over j - //} // end loop over j1, j2 - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif + }); // end loop over jjz } +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi(const typename Kokkos::TeamPolicy::member_type& team, + const Kokkos::View &beta, const int ii) +{ + int j; + int jjz; + int jju; + double betaj; + + { + double* const ptr = ylist_r.data(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist_r.span()), + [&] (const int& i) { + ptr[i] = 0.0; + }); + } + { + double* const ptr = ylist_i.data(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist_i.span()), + [&] (const int& i) { + ptr[i] = 0.0; + }); + } + + Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max), + [&] (const int& jjz) { + //for(int jjz = 0; jjz < idxz_max; jjz++) { + const int j1 = idxz[jjz].j1; + const int j2 = idxz[jjz].j2; + const int j = idxz[jjz].j; + const int ma1min = idxz[jjz].ma1min; + const int ma2max = idxz[jjz].ma2max; + const int na = idxz[jjz].na; + const int mb1min = idxz[jjz].mb1min; + const int mb2max = idxz[jjz].mb2max; + const int nb = idxz[jjz].nb; + + const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); + int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + double ztmp_r = 0.0; + double ztmp_i = 0.0; + + int jju1 = idxu_block[j1] + (j1+1)*mb1min; + int jju2 = idxu_block[j2] + (j2+1)*mb2max; + int icgb = mb1min*(j2+1) + mb2max; + for(int ib = 0; ib < nb; ib++) { + + double suma1_r = 0.0; + double suma1_i = 0.0; + + const double* u1_r = ulisttot_r.data() + jju1; + const double* u1_i = ulisttot_i.data() + jju1; + const double* u2_r = ulisttot_r.data() + jju2; + const double* u2_i = ulisttot_i.data() + jju2; + + int ma1 = ma1min; + int ma2 = ma2max; + int icga = ma1min*(j2+1) + ma2max; + + for(int ia = 0; ia < na; ia++) { + suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]); + suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]); ma1++; + ma2--; + icga += j2; + } // end loop over ia + + ztmp_r += cgblock[icgb] * suma1_r; + ztmp_i += cgblock[icgb] * suma1_i; + jju1 += j1+1; + jju2 -= j2+1; + icgb += j2; + } // end loop over ib + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(ii,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + const int jju = idxz[jjz].jju; + + // pick out right beta value + + if (j >= j1) { + const int jjb = idxb_block(j1,j2,j); + if (j1 == j) { + if (j2 == j) betaj = 3*beta(ii,jjb); + else betaj = 2*beta(ii,jjb); + } else betaj = beta(ii,jjb); + } else if (j >= j2) { + const int jjb = idxb_block(j,j2,j1); + if (j2 == j) betaj = 2*beta(ii,jjb)*(j1+1)/(j+1.0); + else betaj = beta(ii,jjb)*(j1+1)/(j+1.0); + } else { + const int jjb = idxb_block(j2,j,j1); + betaj = beta(ii,jjb)*(j1+1)/(j+1.0); + } + + ylist_r[jju] += betaj*ztmp_r; + ylist_i[jju] += betaj*ztmp_i; + //printf("yi %i %g %g\n",jju,ylist_r[jju],ylist_i[jju]); + + }); // end loop over jjz +} + +/* ---------------------------------------------------------------------- + compute dEidRj +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, double* dedr) +{ + + for(int k = 0; k < 3; k++) + dedr[k] = 0.0; + + // TODO: which loop is faster to parallelize? + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), + [&] (const int& j) { + //for(int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j]; + + for(int mb = 0; 2*mb < j; mb++) + for(int ma = 0; ma <= j; ma++) { + + double jjjmambyarray_r = ylist_r[jju]; + double jjjmambyarray_i = ylist_i[jju]; + + for(int k = 0; k < 3; k++) + dedr[k] += + dulist_r(jju,k) * jjjmambyarray_r + + dulist_i(jju,k) * jjjmambyarray_i; + jju++; + } //end loop over ma mb + + // For j even, handle middle column + + if (j%2 == 0) { + + int mb = j/2; + for(int ma = 0; ma < mb; ma++) { + double jjjmambyarray_r = ylist_r[jju]; + double jjjmambyarray_i = ylist_i[jju]; + + for(int k = 0; k < 3; k++) + dedr[k] += + dulist_r(jju,k) * jjjmambyarray_r + + dulist_i(jju,k) * jjjmambyarray_i; + jju++; + } + + int ma = mb; + double jjjmambyarray_r = ylist_r[jju]; + double jjjmambyarray_i = ylist_i[jju]; + + for(int k = 0; k < 3; k++) + dedr[k] += + (dulist_r(jju,k) * jjjmambyarray_r + + dulist_i(jju,k) * jjjmambyarray_i)*0.5; + } // end if jeven + + }); // end loop over j + + for(int k = 0; k < 3; k++) + dedr[k] *= 2.0; + + //printf("dedr %g %g %g\n",dedr[0],dedr[1],dedr[2]); +} /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi @@ -316,31 +550,33 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::copy_bi2bvec(const typename Kokkos::TeamPolicy::member_type& team) -{ - /* int ncount, j1, j2, j; - - ncount = 0; - - for(j1 = 0; j1 <= twojmax; j1++) { - for(j2 = 0; j2 <= j1; j2++) - for(j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - if (j >= j1) {*/ - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max), - [&] (const int& JJ) { - //for(int JJ = 0; JJ < idxj_max; JJ++) { - const int j1 = idxj[JJ].j1; - const int j2 = idxj[JJ].j2; - const int j = idxj[JJ].j; - bvec(JJ) = barray(j1,j2,j); - //ncount++; - }); -} - -/* ---------------------------------------------------------------------- - calculate derivative of Ui w.r.t. atom j -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, - double* rij, double wj, double rcut) -{ - double rsq, r, x, y, z, z0, theta0, cs, sn; - double dz0dr; - - x = rij[0]; - y = rij[1]; - z = rij[2]; - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - double rscale0 = rfac0 * MY_PI / (rcut - rmin0); - theta0 = (r - rmin0) * rscale0; - cs = cos(theta0); - sn = sin(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &starttime); -#endif - - compute_duarray(team, x, y, z, z0, r, dz0dr, wj, rcut); - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif - } /* ---------------------------------------------------------------------- @@ -478,21 +649,18 @@ void SNAKokkos::compute_dbidrj(const typename Kokkos::TeamPolicy::compute_dbidrj(const typename Kokkos::TeamPolicy dbdr,sumzdu_r; // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) - // use zarray j1/j2 symmetry (optional) - - int j_,j1_,j2_; - if (j1 >= j2) { - //jjjzarray_r = &zarray_r(j1,j2,j); - //jjjzarray_i = &zarray_i(j1,j2,j); - j1_ = j1; - j2_ = j2; - j_ = j; - } else { - j1_ = j2; - j2_ = j1; - j_ = j; - //jjjzarray_r = &zarray_r(j2,j1,j); - //jjjzarray_i = &zarray_i(j2,j1,j); - } + int jjz = idxz_block(j1,j2,j); + int jju = idxu_block[j]; for(int mb = 0; 2*mb < j; mb++) for(int ma = 0; ma <= j; ma++) { - - dudr_r = &duarray_r(j,mb,ma,0); - dudr_i = &duarray_i(j,mb,ma,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb,ma); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb,ma); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); - + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]); + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]); + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]); } //end loop over ma mb // For j even, handle middle column @@ -535,14 +685,19 @@ void SNAKokkos::compute_dbidrj(const typename Kokkos::TeamPolicy::compute_dbidrj(const typename Kokkos::TeamPolicy= j2) { - j1_ = j; - j2_ = j2; - j_ = j1; - - //jjjzarray_r = zarray_r(j,j2,j1); - //jjjzarray_i = zarray_i(j,j2,j1); - } else { - j1_ = j2; - j2_ = j; - j_ = j1; - //jjjzarray_r = zarray_r(j2,j,j1); - //jjjzarray_i = zarray_i(j2,j,j1); - } - - for(int mb1 = 0; 2*mb1 < j1; mb1++) - for(int ma1 = 0; ma1 <= j1; ma1++) { - - dudr_r = &duarray_r(j1,mb1,ma1,0); - dudr_i = &duarray_i(j1,mb1,ma1,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); + for(int mb = 0; 2*mb < j1; mb++) + for(int ma = 0; ma <= j1; ma++) { + const int jju_index = jju+mb*(j1+1)+ma; + const int jjz_index = jjz+mb*(j1+1)+ma; + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]); + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]); + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]); } //end loop over ma1 mb1 // For j1 even, handle middle column if (j1%2 == 0) { - const int mb1 = j1/2; - for(int ma1 = 0; ma1 <= mb1; ma1++) { - dudr_r = &duarray_r(j1,mb1,ma1,0); - dudr_i = &duarray_i(j1,mb1,ma1,0); - const double factor = ma1==mb1?0.5:1.0; - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb1,ma1) * factor; - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb1,ma1) * factor; - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); + const int mb = j1/2; + for(int ma = 0; ma <= mb; ma++) { + const int jju_index = jju+(mb-1)*(j1+1)+(j1+1)+ma; + const int jjz_index = jjz+(mb-1)*(j1+1)+(j1+1)+ma; + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]); + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]); + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]); + } + int ma = mb; + const int jju_index = jju+(mb-1)*(j1+1)+(j1+1)+ma; + const int jjz_index = jjz+(mb-1)*(j1+1)+(j1+1)+ma; + for(int k = 0; k < 3; k++) { + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz] + dulist_i(jju_index,0) * zlist_i[jjz_index])*0.5; + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz] + dulist_i(jju_index,1) * zlist_i[jjz_index])*0.5; + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz] + dulist_i(jju_index,2) * zlist_i[jjz_index])*0.5; } } // end if j1even @@ -605,94 +748,74 @@ void SNAKokkos::compute_dbidrj(const typename Kokkos::TeamPolicy= j) { - j1_ = j1; - j2_ = j; - j_ = j2; - //jjjzarray_r = zarray_r(j1,j,j2); - //jjjzarray_i = zarray_i(j1,j,j2); - } else { - j1_ = j; - j2_ = j1; - j_ = j2; - //jjjzarray_r = zarray_r(j,j1,j2); - //jjjzarray_i = zarray_i(j,j1,j2); - } - - for(int mb2 = 0; 2*mb2 < j2; mb2++) - for(int ma2 = 0; ma2 <= j2; ma2++) { - - dudr_r = &duarray_r(j2,mb2,ma2,0); - dudr_i = &duarray_i(j2,mb2,ma2,0); - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2); - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2); - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); + for(int mb = 0; 2*mb < j2; mb++) + for(int ma = 0; ma <= j2; ma++) { + const int jju_index = jju+mb*(j2+1)+ma; + const int jjz_index = jjz+mb*(j2+1)+ma; + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]); + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]); + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]); } //end loop over ma2 mb2 // For j2 even, handle middle column if (j2%2 == 0) { - const int mb2 = j2/2; - for(int ma2 = 0; ma2 <= mb2; ma2++) { - dudr_r = &duarray_r(j2,mb2,ma2,0); - dudr_i = &duarray_i(j2,mb2,ma2,0); - const double factor = ma2==mb2?0.5:1.0; - jjjmambzarray_r = zarray_r(j1_,j2_,j_,mb2,ma2) * factor; - jjjmambzarray_i = zarray_i(j1_,j2_,j_,mb2,ma2) * factor; - sumzdu_r.x += (dudr_r[0] * jjjmambzarray_r + dudr_i[0] * jjjmambzarray_i); - sumzdu_r.y += (dudr_r[1] * jjjmambzarray_r + dudr_i[1] * jjjmambzarray_i); - sumzdu_r.z += (dudr_r[2] * jjjmambzarray_r + dudr_i[2] * jjjmambzarray_i); + const int mb = j2/2; + for(int ma = 0; ma <= mb; ma++) { + const int jju_index = jju+(mb-1)*(j2+1)+(j2+1)+ma; + const int jjz_index = jjz+(mb-1)*(j2+1)+(j2+1)+ma; + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz_index] + dulist_i(jju_index,0) * zlist_i[jjz_index]); + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz_index] + dulist_i(jju_index,1) * zlist_i[jjz_index]); + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz_index] + dulist_i(jju_index,2) * zlist_i[jjz_index]); + } + int ma = mb; + const int jju_index = jju+(mb-1)*(j2+1)+(j2+1)+ma; + const int jjz_index = jjz+(mb-1)*(j2+1)+(j2+1)+ma; + for(int k = 0; k < 3; k++) { + sumzdu_r.x += (dulist_r(jju_index,0) * zlist_r[jjz] + dulist_i(jju_index,0) * zlist_i[jjz_index])*0.5; + sumzdu_r.y += (dulist_r(jju_index,1) * zlist_r[jjz] + dulist_i(jju_index,1) * zlist_i[jjz_index])*0.5; + sumzdu_r.z += (dulist_r(jju_index,2) * zlist_r[jjz] + dulist_i(jju_index,2) * zlist_i[jjz_index])*0.5; } } // end if j2even dbdr += 2.0*sumzdu_r*j2fac; - dbarray(j1,j2,j,0) = dbdr.x; - dbarray(j1,j2,j,1) = dbdr.y; - dbarray(j1,j2,j,2) = dbdr.z; + dblist(jjb,0) = dbdr.x; + dblist(jjb,1) = dbdr.y; + dblist(jjb,2) = dbdr.z; + }); //end loop over j1 j2 j - -#ifdef TIMING_INFO - clock_gettime(CLOCK_REALTIME, &endtime); - timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 * - (endtime.tv_nsec - starttime.tv_nsec) / 1000000000); -#endif - } /* ---------------------------------------------------------------------- - copy Bi derivatives into a vector + calculate derivative of Ui w.r.t. atom j ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::copy_dbi2dbvec(const typename Kokkos::TeamPolicy::member_type& team) +void SNAKokkos::compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, + double* rij, double wj, double rcut) { - /* int ncount, j1, j2, j; + double rsq, r, x, y, z, z0, theta0, cs, sn; + double dz0dr; - ncount = 0; + x = rij[0]; + y = rij[1]; + z = rij[2]; + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + double rscale0 = rfac0 * MY_PI / (rcut - rmin0); + theta0 = (r - rmin0) * rscale0; + cs = cos(theta0); + sn = sin(theta0); + z0 = r * cs / sn; + dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - for(j1 = 0; j1 <= twojmax; j1++) { - for(j2 = 0; j2 <= j1; j2++) - for(j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - if (j >= j1) {*/ - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,idxj_max), - [&] (const int& JJ) { - //for(int JJ = 0; JJ < idxj_max; JJ++) { - const int j1 = idxj[JJ].j1; - const int j2 = idxj[JJ].j2; - const int j = idxj[JJ].j; - dbvec(JJ,0) = dbarray(j1,j2,j,0); - dbvec(JJ,1) = dbarray(j1,j2,j,1); - dbvec(JJ,2) = dbarray(j1,j2,j,2); - //ncount++; - }); + compute_duarray(team, x, y, z, z0, r, dz0dr, wj, rcut); } /* ---------------------------------------------------------------------- */ @@ -702,15 +825,15 @@ KOKKOS_INLINE_FUNCTION void SNAKokkos::zero_uarraytot(const typename Kokkos::TeamPolicy::member_type& team) { { - double* const ptr = uarraytot_r.data(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()), + double* const ptr = ulisttot_r.data(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot_r.span()), [&] (const int& i) { ptr[i] = 0.0; }); } { - double* const ptr = uarraytot_i.data(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,uarraytot_r.span()), + double* const ptr = ulisttot_i.data(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot_i.span()), [&] (const int& i) { ptr[i] = 0.0; }); @@ -723,12 +846,14 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::addself_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, double wself_in) { - //for (int j = 0; j <= twojmax; j++) Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), [&] (const int& j) { + //for (int j = 0; j <= twojmax; j++) + int jju = idxu_block[j]; for (int ma = 0; ma <= j; ma++) { - uarraytot_r(j,ma,ma) = wself_in; - uarraytot_i(j,ma,ma) = 0.0; + ulisttot_r[jju] = wself_in; + ulisttot_i[jju] = 0.0; + jju += j+2; } }); } @@ -743,20 +868,11 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::compute_uarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION void SNAKokkos::create_team_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team) { - int jdim = twojmax + 1; - uarraytot_r_a = uarraytot_r = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim); - uarraytot_i_a = uarraytot_i = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim); - zarray_r = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim); - zarray_i = t_sna_5d(team.team_scratch(1),jdim,jdim,jdim,jdim,jdim); - bvec = Kokkos::View(team.team_scratch(1),ncoeff); - barray = t_sna_3d(team.team_scratch(1),jdim,jdim,jdim); + ulisttot_r_a = ulisttot_r = t_sna_1d(team.team_scratch(1),idxu_max); + ulisttot_i_a = ulisttot_i = t_sna_1d(team.team_scratch(1),idxu_max); + ylist_r = t_sna_1d(team.team_scratch(1),idxu_max); + ylist_i = t_sna_1d(team.team_scratch(1),idxu_max); + zlist_r = t_sna_1d(team.team_scratch(1),idxz_max); + zlist_i = t_sna_1d(team.team_scratch(1),idxz_max); + blist = t_sna_1d(team.team_scratch(1),idxb_max); + dblist = t_sna_2d(team.team_scratch(1),idxb_max,3); rij = t_sna_2d(team.team_scratch(1),nmax,3); rcutij = t_sna_1d(team.team_scratch(1),nmax); @@ -1043,19 +1184,16 @@ void SNAKokkos::create_team_scratch_arrays(const typename Kokkos::Te inside = t_sna_1i(team.team_scratch(1),nmax); } - template inline T_INT SNAKokkos::size_team_scratch_arrays() { T_INT size = 0; - int jdim = twojmax + 1; - size += t_sna_3d::shmem_size(jdim,jdim,jdim); // uarraytot_r_a - size += t_sna_3d::shmem_size(jdim,jdim,jdim); // uarraytot_i_a - size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); // zarray_r - size += t_sna_5d::shmem_size(jdim,jdim,jdim,jdim,jdim); // zarray_i - size += Kokkos::View::shmem_size(ncoeff); // bvec - size += t_sna_3d::shmem_size(jdim,jdim,jdim); // barray + size += t_sna_1d::shmem_size(idxu_max)*2; // ulisttot + size += t_sna_1d::shmem_size(idxu_max)*2; // ylist + size += t_sna_1d::shmem_size(idxz_max)*2; // zlist + size += t_sna_1d::shmem_size(idxb_max); // blist + size += t_sna_2d::shmem_size(idxb_max,3); // dblist size += t_sna_2d::shmem_size(nmax,3); // rij size += t_sna_1d::shmem_size(nmax); // rcutij @@ -1071,53 +1209,225 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::create_thread_scratch_arrays(const typename Kokkos::TeamPolicy::member_type& team) { - int jdim = twojmax + 1; + dblist = t_sna_2d(team.thread_scratch(1),idxb_max,3); - dbvec = Kokkos::View(team.thread_scratch(1),ncoeff); - dbarray = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); - - uarray_r = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim); - uarray_i = t_sna_3d(team.thread_scratch(1),jdim,jdim,jdim); - duarray_r = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); - duarray_i = t_sna_4d(team.thread_scratch(1),jdim,jdim,jdim); + ulist_r = t_sna_1d(team.thread_scratch(1),idxu_max); + ulist_i = t_sna_1d(team.thread_scratch(1),idxu_max); + dulist_r = t_sna_2d(team.thread_scratch(1),idxu_max,3); + dulist_i = t_sna_2d(team.thread_scratch(1),idxu_max,3); } template inline T_INT SNAKokkos::size_thread_scratch_arrays() { T_INT size = 0; - int jdim = twojmax + 1; - size += Kokkos::View::shmem_size(ncoeff); // dbvec - size += t_sna_4d::shmem_size(jdim,jdim,jdim); // dbarray + size += t_sna_2d::shmem_size(idxb_max,3); // dblist - size += t_sna_3d::shmem_size(jdim,jdim,jdim); // uarray_r - size += t_sna_3d::shmem_size(jdim,jdim,jdim); // uarray_i - size += t_sna_4d::shmem_size(jdim,jdim,jdim); // duarray_r - size += t_sna_4d::shmem_size(jdim,jdim,jdim); // duarray_i + size += t_sna_1d::shmem_size(idxu_max)*2; // ulist + size += t_sna_2d::shmem_size(idxu_max,3)*2; // dulist return size; } /* ---------------------------------------------------------------------- - factorial n + factorial n, wrapper for precomputed table ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION +inline double SNAKokkos::factorial(int n) { - double result = 1.0; - for(int i=1; i<=n; i++) - result *= 1.0*i; - return result; + //if (n < 0 || n > nmaxfactorial) { + // char str[128]; + // sprintf(str, "Invalid argument to factorial %d", n); + // error->all(FLERR, str); + //} + + return nfac_table[n]; } +/* ---------------------------------------------------------------------- + factorial n table, size SNA::nmaxfactorial+1 +------------------------------------------------------------------------- */ + +template +const double SNAKokkos::nfac_table[] = { + 1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880, + 3628800, + 39916800, + 479001600, + 6227020800, + 87178291200, + 1307674368000, + 20922789888000, + 355687428096000, + 6.402373705728e+15, + 1.21645100408832e+17, + 2.43290200817664e+18, + 5.10909421717094e+19, + 1.12400072777761e+21, + 2.5852016738885e+22, + 6.20448401733239e+23, + 1.5511210043331e+25, + 4.03291461126606e+26, + 1.08888694504184e+28, + 3.04888344611714e+29, + 8.8417619937397e+30, + 2.65252859812191e+32, + 8.22283865417792e+33, + 2.63130836933694e+35, + 8.68331761881189e+36, + 2.95232799039604e+38, + 1.03331479663861e+40, + 3.71993326789901e+41, + 1.37637530912263e+43, + 5.23022617466601e+44, + 2.03978820811974e+46, + 8.15915283247898e+47, + 3.34525266131638e+49, + 1.40500611775288e+51, + 6.04152630633738e+52, + 2.65827157478845e+54, + 1.1962222086548e+56, + 5.50262215981209e+57, + 2.58623241511168e+59, + 1.24139155925361e+61, + 6.08281864034268e+62, + 3.04140932017134e+64, + 1.55111875328738e+66, + 8.06581751709439e+67, + 4.27488328406003e+69, + 2.30843697339241e+71, + 1.26964033536583e+73, + 7.10998587804863e+74, + 4.05269195048772e+76, + 2.35056133128288e+78, + 1.3868311854569e+80, + 8.32098711274139e+81, + 5.07580213877225e+83, + 3.14699732603879e+85, + 1.98260831540444e+87, + 1.26886932185884e+89, + 8.24765059208247e+90, + 5.44344939077443e+92, + 3.64711109181887e+94, + 2.48003554243683e+96, + 1.71122452428141e+98, + 1.19785716699699e+100, + 8.50478588567862e+101, + 6.12344583768861e+103, + 4.47011546151268e+105, + 3.30788544151939e+107, + 2.48091408113954e+109, + 1.88549470166605e+111, + 1.45183092028286e+113, + 1.13242811782063e+115, + 8.94618213078297e+116, + 7.15694570462638e+118, + 5.79712602074737e+120, + 4.75364333701284e+122, + 3.94552396972066e+124, + 3.31424013456535e+126, + 2.81710411438055e+128, + 2.42270953836727e+130, + 2.10775729837953e+132, + 1.85482642257398e+134, + 1.65079551609085e+136, + 1.48571596448176e+138, + 1.3520015276784e+140, + 1.24384140546413e+142, + 1.15677250708164e+144, + 1.08736615665674e+146, + 1.03299784882391e+148, + 9.91677934870949e+149, + 9.61927596824821e+151, + 9.42689044888324e+153, + 9.33262154439441e+155, + 9.33262154439441e+157, + 9.42594775983835e+159, + 9.61446671503512e+161, + 9.90290071648618e+163, + 1.02990167451456e+166, + 1.08139675824029e+168, + 1.14628056373471e+170, + 1.22652020319614e+172, + 1.32464181945183e+174, + 1.44385958320249e+176, + 1.58824554152274e+178, + 1.76295255109024e+180, + 1.97450685722107e+182, + 2.23119274865981e+184, + 2.54355973347219e+186, + 2.92509369349301e+188, + 3.3931086844519e+190, + 3.96993716080872e+192, + 4.68452584975429e+194, + 5.5745857612076e+196, + 6.68950291344912e+198, + 8.09429852527344e+200, + 9.8750442008336e+202, + 1.21463043670253e+205, + 1.50614174151114e+207, + 1.88267717688893e+209, + 2.37217324288005e+211, + 3.01266001845766e+213, + 3.8562048236258e+215, + 4.97450422247729e+217, + 6.46685548922047e+219, + 8.47158069087882e+221, + 1.118248651196e+224, + 1.48727070609069e+226, + 1.99294274616152e+228, + 2.69047270731805e+230, + 3.65904288195255e+232, + 5.01288874827499e+234, + 6.91778647261949e+236, + 9.61572319694109e+238, + 1.34620124757175e+241, + 1.89814375907617e+243, + 2.69536413788816e+245, + 3.85437071718007e+247, + 5.5502938327393e+249, + 8.04792605747199e+251, + 1.17499720439091e+254, + 1.72724589045464e+256, + 2.55632391787286e+258, + 3.80892263763057e+260, + 5.71338395644585e+262, + 8.62720977423323e+264, + 1.31133588568345e+267, + 2.00634390509568e+269, + 3.08976961384735e+271, + 4.78914290146339e+273, + 7.47106292628289e+275, + 1.17295687942641e+278, + 1.85327186949373e+280, + 2.94670227249504e+282, + 4.71472363599206e+284, + 7.59070505394721e+286, + 1.22969421873945e+289, + 2.0044015765453e+291, + 3.28721858553429e+293, + 5.42391066613159e+295, + 9.00369170577843e+297, + 1.503616514865e+300, // nmaxfactorial = 167 +}; + /* ---------------------------------------------------------------------- the function delta given by VMK Eq. 8.2(1) ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION +inline double SNAKokkos::deltacg(int j1, int j2, int j) { double sfaccg = factorial((j1 + j2 + j) / 2 + 1); @@ -1135,33 +1445,39 @@ template inline void SNAKokkos::init_clebsch_gordan() { + auto h_cglist = Kokkos::create_mirror_view(cglist); + double sum,dcg,sfaccg; int m, aa2, bb2, cc2; int ifac; - auto h_cgarray = Kokkos::create_mirror_view(cgarray); - for (int j1 = 0; j1 <= twojmax; j1++) - for (int j2 = 0; j2 <= twojmax; j2++) - for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) - for (int m1 = 0; m1 <= j1; m1 += 1) { + int idxcg_count = 0; + for(int j1 = 0; j1 <= twojmax; j1++) + for(int j2 = 0; j2 <= j1; j2++) + for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + for (int m1 = 0; m1 <= j1; m1++) { aa2 = 2 * m1 - j1; - for (int m2 = 0; m2 <= j2; m2 += 1) { + for (int m2 = 0; m2 <= j2; m2++) { // -c <= cc <= c bb2 = 2 * m2 - j2; m = (aa2 + bb2 + j) / 2; - if(m < 0 || m > j) continue; + if(m < 0 || m > j) { + h_cglist[idxcg_count] = 0.0; + idxcg_count++; + continue; + } sum = 0.0; for (int z = MAX(0, MAX(-(j - j2 + aa2) - / 2, -(j - j1 - bb2) / 2)); - z <= MIN((j1 + j2 - j) / 2, - MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); - z++) { + / 2, -(j - j1 - bb2) / 2)); + z <= MIN((j1 + j2 - j) / 2, + MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); + z++) { ifac = z % 2 ? -1 : 1; sum += ifac / (factorial(z) * @@ -1175,18 +1491,19 @@ void SNAKokkos::init_clebsch_gordan() cc2 = 2 * m - j; dcg = deltacg(j1, j2, j); sfaccg = sqrt(factorial((j1 + aa2) / 2) * - factorial((j1 - aa2) / 2) * - factorial((j2 + bb2) / 2) * - factorial((j2 - bb2) / 2) * - factorial((j + cc2) / 2) * - factorial((j - cc2) / 2) * - (j + 1)); - - h_cgarray(j1,j2,j,m1,m2) = sum * dcg * sfaccg; - //printf("SNAP-COMPARE: CG: %i %i %i %i %i %e\n",j1,j2,j,m1,m2,cgarray(j1,j2,j,m1,m2)); + factorial((j1 - aa2) / 2) * + factorial((j2 + bb2) / 2) * + factorial((j2 - bb2) / 2) * + factorial((j + cc2) / 2) * + factorial((j - cc2) / 2) * + (j + 1)); + + h_cglist[idxcg_count] = sum * dcg * sfaccg; + idxcg_count++; } } - Kokkos::deep_copy(cgarray,h_cgarray); + } + Kokkos::deep_copy(cglist,h_cglist); } /* ---------------------------------------------------------------------- @@ -1207,6 +1524,7 @@ void SNAKokkos::init_rootpqarray() /* ---------------------------------------------------------------------- */ + template inline int SNAKokkos::compute_ncoeff() @@ -1217,9 +1535,10 @@ int SNAKokkos::compute_ncoeff() for (int j1 = 0; j1 <= twojmax; j1++) for (int j2 = 0; j2 <= j1; j2++) - for (int j = abs(j1 - j2); - j <= MIN(twojmax, j1 + j2); j += 2) - if (j >= j1) ncount++; + for (int j = j1 - j2; + j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) ncount++; + return ncount; } @@ -1266,15 +1585,39 @@ double SNAKokkos::compute_dsfac(double r, double rcut) template double SNAKokkos::memory_usage() { + int jdimpq = twojmax + 2; int jdim = twojmax + 1; double bytes; - bytes = jdim * jdim * jdim * jdim * jdim * sizeof(double); - bytes += 2 * jdim * jdim * jdim * sizeof(std::complex); - bytes += 2 * jdim * jdim * jdim * sizeof(double); - bytes += jdim * jdim * jdim * 3 * sizeof(std::complex); - bytes += jdim * jdim * jdim * 3 * sizeof(double); - bytes += ncoeff * sizeof(double); - bytes += jdim * jdim * jdim * jdim * jdim * sizeof(std::complex); + + bytes = 0; + + bytes += jdimpq*jdimpq * sizeof(double); // pqarray + bytes += idxcg_max * sizeof(double); // cglist + + bytes += idxu_max * sizeof(double) * 2; // ulist + bytes += idxu_max * sizeof(double) * 2; // ulisttot + bytes += idxu_max * 3 * sizeof(double) * 2; // dulist + + bytes += idxz_max * sizeof(double) * 2; // zlist + bytes += idxb_max * sizeof(double); // blist + bytes += idxb_max * 3 * sizeof(double); // dblist + bytes += idxu_max * sizeof(double) * 2; // ylist + + bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block + bytes += jdim * sizeof(int); // idxu_block + bytes += jdim * jdim * jdim * sizeof(int); // idxz_block + bytes += jdim * jdim * jdim * sizeof(int); // idxb_block + + bytes += idxz_max * sizeof(SNAKK_ZINDICES); // idxz + bytes += idxb_max * sizeof(SNAKK_BINDICES); // idxb + + bytes += jdim * sizeof(double); // bzero + + bytes += nmax * 3 * sizeof(double); // rij + bytes += nmax * sizeof(int); // inside + bytes += nmax * sizeof(double); // wj + bytes += nmax * sizeof(double); // rcutij + return bytes; }