/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Christian Trott (SNL), Stan Moore (SNL) ------------------------------------------------------------------------- */ #include "sna_kokkos.h" #include #include #include #include namespace LAMMPS_NS { static const double MY_PI = 3.14159265358979323846; // pi template inline SNAKokkos::SNAKokkos(double rfac0_in, int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in) { wself = 1.0; rfac0 = rfac0_in; rmin0 = rmin0_in; switch_flag = switch_flag_in; bzero_flag = bzero_flag_in; twojmax = twojmax_in; ncoeff = compute_ncoeff(); nmax = 0; build_indexlist(); int jdimpq = twojmax + 2; rootpqarray = t_sna_2d("SNAKokkos::rootpqarray",jdimpq,jdimpq); cglist = t_sna_1d("SNAKokkos::cglist",idxcg_max); if (bzero_flag) { bzero = Kokkos::View("sna:bzero",twojmax+1); auto h_bzero = Kokkos::create_mirror_view(bzero); double www = wself*wself*wself; for(int j = 0; j <= twojmax; j++) h_bzero[j] = www*(j+1); Kokkos::deep_copy(bzero,h_bzero); } } /* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION SNAKokkos::~SNAKokkos() { } template inline void SNAKokkos::build_indexlist() { // index list for cglist int jdim = twojmax + 1; idxcg_block = Kokkos::View("SNAKokkos::idxcg_block",jdim,jdim,jdim); auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block); int idxcg_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { h_idxcg_block(j1,j2,j) = idxcg_count; for (int m1 = 0; m1 <= j1; m1++) for (int m2 = 0; m2 <= j2; m2++) idxcg_count++; } idxcg_max = idxcg_count; Kokkos::deep_copy(idxcg_block,h_idxcg_block); // index list for uarray // need to include both halves idxu_block = Kokkos::View("SNAKokkos::idxu_block",jdim); auto h_idxu_block = Kokkos::create_mirror_view(idxu_block); int idxu_count = 0; for(int j = 0; j <= twojmax; j++) { h_idxu_block[j] = idxu_count; for(int mb = 0; mb <= j; mb++) for(int ma = 0; ma <= j; ma++) idxu_count++; } idxu_max = idxu_count; Kokkos::deep_copy(idxu_block,h_idxu_block); // index list for beta and B int idxb_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) if (j >= j1) idxb_count++; idxb_max = idxb_count; idxb = Kokkos::View("SNAKokkos::idxb",idxb_max); auto h_idxb = Kokkos::create_mirror_view(idxb); idxb_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) if (j >= j1) { h_idxb(idxb_count,0) = j1; h_idxb(idxb_count,1) = j2; h_idxb(idxb_count,2) = j; idxb_count++; } Kokkos::deep_copy(idxb,h_idxb); // reverse index list for beta and b idxb_block = Kokkos::View("SNAKokkos::idxb_block",jdim,jdim,jdim); auto h_idxb_block = Kokkos::create_mirror_view(idxb_block); idxb_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { if (j >= j1) { h_idxb_block(j1,j2,j) = idxb_count; idxb_count++; } } Kokkos::deep_copy(idxb_block,h_idxb_block); // index list for zlist int idxz_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) for (int mb = 0; 2*mb <= j; mb++) for (int ma = 0; ma <= j; ma++) idxz_count++; idxz_max = idxz_count; idxz = Kokkos::View("SNAKokkos::idxz",idxz_max); auto h_idxz = Kokkos::create_mirror_view(idxz); idxz_block = Kokkos::View("SNAKokkos::idxz_block", jdim,jdim,jdim); auto h_idxz_block = Kokkos::create_mirror_view(idxz_block); idxz_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { h_idxz_block(j1,j2,j) = idxz_count; // find right beta(ii,jjb) entry // multiply and divide by j+1 factors // account for multiplicity of 1, 2, or 3 for (int mb = 0; 2*mb <= j; mb++) for (int ma = 0; ma <= j; ma++) { h_idxz(idxz_count,0) = j1; h_idxz(idxz_count,1) = j2; h_idxz(idxz_count,2) = j; h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2); h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2; h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2); h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2; h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1; h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1; // apply to z(j1,j2,j,ma,mb) to unique element of y(j) const int jju = h_idxu_block[j] + (j+1)*mb + ma; h_idxz(idxz_count,9) = jju; idxz_count++; } } Kokkos::deep_copy(idxz,h_idxz); Kokkos::deep_copy(idxz_block,h_idxz_block); } /* ---------------------------------------------------------------------- */ template inline void SNAKokkos::init() { init_clebsch_gordan(); init_rootpqarray(); } template inline void SNAKokkos::grow_rij(int newnatom, int newnmax) { if(newnatom <= natom && newnmax <= nmax) return; natom = newnatom; nmax = newnmax; rij = t_sna_3d("sna:rij",natom,nmax,3); inside = t_sna_2i("sna:inside",natom,nmax); wj = t_sna_2d("sna:wj",natom,nmax); rcutij = t_sna_2d("sna:rcutij",natom,nmax); dedr = t_sna_3d("sna:dedr",natom,nmax,3); blist = t_sna_2d_ll("sna:blist",idxb_max,natom); //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max,natom); zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom); //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); #ifdef KOKKOS_ENABLE_CUDA if (std::is_same::value) { // dummy allocation ulist = t_sna_3c_ll("sna:ulist",1,1,1); dulist = t_sna_4c_ll("sna:dulist",1,1,1); } else { #endif ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); #ifdef KOKKOS_ENABLE_CUDA } #endif //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom); dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); } /* ---------------------------------------------------------------------- * compute Ui by summing over neighbors j * ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom) { for (int j = 0; j <= twojmax; j++) { const int jju = idxu_block(j); // Only diagonal elements get initialized // for (int m = 0; m < (j+1)*(j+1); m++) Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), [&] (const int m) { const int jjup = jju + m; // if m is on the "diagonal", initialize it with the self energy. // Otherwise zero it out SNAcomplex init = {0., 0.}; if (m % (j+2) == 0) { init = {wself, 0.0}; } ulisttot(jjup, iatom) = init; }); } } /* ---------------------------------------------------------------------- compute Ui by computing Wigner U-functions for one neighbor and accumulating to the total. GPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma // for j in neighbors of i: // compute r0 = (x,y,z,z0) // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb // get shared memory offset const int max_m_tile = (twojmax+1)*(twojmax+1); const int team_rank = team.team_rank(); const int scratch_shift = team_rank * max_m_tile; // double buffer SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; const double x = rij(iatom,jnbor,0); const double y = rij(iatom,jnbor,1); const double z = rij(iatom,jnbor,2); const double wj_local = wj(iatom, jnbor); const double rcut = rcutij(iatom, jnbor); const double rsq = x * x + y * y + z * z; const double r = sqrt(rsq); const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); // theta0 = (r - rmin0) * rscale0; const double cs = cos(theta0); const double sn = sin(theta0); const double z0 = r * cs / sn; // r / tan(theta0) // Compute cutoff function const double sfac = compute_sfac(r, rcut) * wj_local; // compute Cayley-Klein parameters for unit quaternion, // pack into complex number const double r0inv = 1.0 / sqrt(r * r + z0 * z0); const SNAcomplex a = { r0inv * z0, -r0inv * z }; const SNAcomplex b = { r0inv * y, -r0inv * x }; // VMK Section 4.8.2 // All writes go to global memory and shared memory // so we can avoid all global memory reads Kokkos::single(Kokkos::PerThread(team), [=]() { //ulist(0,iatom,jnbor) = { 1.0, 0.0 }; buf1[0] = {1.,0.}; Kokkos::atomic_add(&(ulisttot(0,iatom).re), sfac); }); for (int j = 1; j <= twojmax; j++) { const int jju = idxu_block[j]; const int jjup = idxu_block[j-1]; // fill in left side of matrix layer from previous layer // Flatten loop over ma, mb, need to figure out total // number of iterations // for (int ma = 0; ma <= j; ma++) const int n_ma = j+1; // for (int mb = 0; 2*mb <= j; mb++) const int n_mb = j/2+1; // the last (j / 2) can be avoided due to symmetry const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0); //for (int m = 0; m < total_iters; m++) { Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), [&] (const int m) { // ma fast, mb slow int ma = m % n_ma; int mb = m / n_ma; // index into global memory array const int jju_index = jju+m; //const int jjup_index = jjup+mb*j+ma; // index into shared memory buffer for this level const int jju_shared_idx = m; // index into shared memory buffer for next level const int jjup_shared_idx = jju_shared_idx - mb; SNAcomplex u_accum = {0., 0.}; // VMK recursion relation: grab contribution which is multiplied by b* const double rootpq2 = -rootpqarray(ma, j - mb); const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); caconjxpy(b, u_up2, u_accum); // VMK recursion relation: grab contribution which is multiplied by a* const double rootpq1 = rootpqarray(j - ma, j - mb); const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); caconjxpy(a, u_up1, u_accum); //ulist(jju_index,iatom,jnbor) = u_accum; // back up into shared memory for next iter buf2[jju_shared_idx] = u_accum; Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) // if j is even (-> physical j integer), last element maps to self, skip //if (!(m == total_iters - 1 && j % 2 == 0)) { if (m < total_iters - 1 || j % 2 == 1) { const int sign_factor = (((ma+mb)%2==0)?1:-1); const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1); if (sign_factor == 1) { u_accum.im = -u_accum.im; } else { u_accum.re = -u_accum.re; } //ulist(jjup_flip,iatom,jnbor) = u_accum; buf2[jju_shared_flip] = u_accum; Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re); Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im); } }); // In CUDA backend, // ThreadVectorRange has a __syncwarp (appropriately masked for // vector lengths < 32) implict at the end // swap double buffers auto tmp = buf1; buf1 = buf2; buf2 = tmp; } } /* ---------------------------------------------------------------------- compute Ui by summing over bispectrum components. CPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { double rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma // for j in neighbors of i: // compute r0 = (x,y,z,z0) // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb x = rij(iatom,jnbor,0); y = rij(iatom,jnbor,1); z = rij(iatom,jnbor,2); rsq = x * x + y * y + z * z; r = sqrt(rsq); theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); // theta0 = (r - rmin0) * rscale0; z0 = r / tan(theta0); compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor)); } /* ---------------------------------------------------------------------- compute Zi by summing over products of Ui ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_zi(const int& iter) { const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; const int j1 = idxz(jjz,0); const int j2 = idxz(jjz,1); const int j = idxz(jjz,2); const int ma1min = idxz(jjz,3); const int ma2max = idxz(jjz,4); const int mb1min = idxz(jjz,5); const int mb2max = idxz(jjz,6); const int na = idxz(jjz,7); const int nb = idxz(jjz,8); const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); zlist(jjz,iatom).re = 0.0; zlist(jjz,iatom).im = 0.0; int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju2 = idxu_block[j2] + (j2+1)*mb2max; int icgb = mb1min*(j2+1) + mb2max; for(int ib = 0; ib < nb; ib++) { double suma1_r = 0.0; double suma1_i = 0.0; int ma1 = ma1min; int ma2 = ma2max; int icga = ma1min*(j2+1) + ma2max; for(int ia = 0; ia < na; ia++) { suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im); suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re); ma1++; ma2--; icga += j2; } // end loop over ia zlist(jjz,iatom).re += cgblock[icgb] * suma1_r; zlist(jjz,iatom).im += cgblock[icgb] * suma1_i; jju1 += j1+1; jju2 -= j2+1; icgb += j2; } // end loop over ib } /* ---------------------------------------------------------------------- compute Yi from Ui without storing Zi, looping over zlist indices ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::zero_yi(const int& idx, const int& iatom) { ylist(idx,iatom) = {0.0, 0.0}; } /* ---------------------------------------------------------------------- compute Yi from Ui without storing Zi, looping over zlist indices ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_yi(int iter, const Kokkos::View &beta) { double betaj; const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; const int j1 = idxz(jjz,0); const int j2 = idxz(jjz,1); const int j = idxz(jjz,2); const int ma1min = idxz(jjz,3); const int ma2max = idxz(jjz,4); const int mb1min = idxz(jjz,5); const int mb2max = idxz(jjz,6); const int na = idxz(jjz,7); const int nb = idxz(jjz,8); const int jju = idxz(jjz,9); const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; double ztmp_r = 0.0; double ztmp_i = 0.0; int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju2 = idxu_block[j2] + (j2+1)*mb2max; int icgb = mb1min*(j2+1) + mb2max; for(int ib = 0; ib < nb; ib++) { double suma1_r = 0.0; double suma1_i = 0.0; int ma1 = ma1min; int ma2 = ma2max; int icga = ma1min*(j2+1) + ma2max; for(int ia = 0; ia < na; ia++) { suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im); suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re); ma1++; ma2--; icga += j2; } // end loop over ia ztmp_r += cgblock[icgb] * suma1_r; ztmp_i += cgblock[icgb] * suma1_i; jju1 += j1+1; jju2 -= j2+1; icgb += j2; } // end loop over ib // apply to z(j1,j2,j,ma,mb) to unique element of y(j) // find right y_list[jju] and beta(iatom,jjb) entries // multiply and divide by j+1 factors // account for multiplicity of 1, 2, or 3 // pick out right beta value if (j >= j1) { const int jjb = idxb_block(j1,j2,j); if (j1 == j) { if (j2 == j) betaj = 3*beta(jjb,iatom); else betaj = 2*beta(jjb,iatom); } else betaj = beta(jjb,iatom); } else if (j >= j2) { const int jjb = idxb_block(j,j2,j1); if (j2 == j) betaj = 2*beta(jjb,iatom)*(j1+1)/(j+1.0); else betaj = beta(jjb,iatom)*(j1+1)/(j+1.0); } else { const int jjb = idxb_block(j2,j,j1); betaj = beta(jjb,iatom)*(j1+1)/(j+1.0); } Kokkos::atomic_add(&(ylist(jju,iatom).re), betaj*ztmp_r); Kokkos::atomic_add(&(ylist(jju,iatom).im), betaj*ztmp_i); } /* ---------------------------------------------------------------------- Fused calculation of the derivative of Ui w.r.t. atom j and of dEidRj. GPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { // get shared memory offset const int max_m_tile = (twojmax+1)*(twojmax/2+1); const int team_rank = team.team_rank(); const int scratch_shift = team_rank * max_m_tile; // double buffer for ulist SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; // double buffer for dulist SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; const double x = rij(iatom,jnbor,0); const double y = rij(iatom,jnbor,1); const double z = rij(iatom,jnbor,2); const double rsq = x * x + y * y + z * z; const double r = sqrt(rsq); const double rcut = rcutij(iatom, jnbor); const double rscale0 = rfac0 * MY_PI / (rcut - rmin0); const double theta0 = (r - rmin0) * rscale0; const double cs = cos(theta0); const double sn = sin(theta0); const double z0 = r * cs / sn; const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; const double wj_local = wj(iatom, jnbor); const double sfac = wj_local * compute_sfac(r, rcut); const double dsfac = wj_local * compute_dsfac(r, rcut); const double rinv = 1.0 / r; // extract a single unit vector const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv); // Compute Cayley-Klein parameters for unit quaternion const double r0inv = 1.0 / sqrt(r * r + z0 * z0); const SNAcomplex a = { r0inv * z0, -r0inv * z }; const SNAcomplex b = { r0inv * y, -r0inv * x }; const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); const double dr0inv = dr0invdr * u; const double dz0 = dz0dr * u; const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv, - z * dr0inv + (dir == 2 ? - r0inv : 0.) }; const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.), -x * dr0inv + (dir==0?-r0inv:0.) }; // Accumulate the full contribution to dedr on the fly const double du_prod = dsfac * u; // chain rule const SNAcomplex y_local = ylist(0, iatom); // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 double dedr_full_sum = 0.5 * du_prod * y_local.re; // single has a warp barrier at the end Kokkos::single(Kokkos::PerThread(team), [=]() { //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here ulist_buf1[0] = {1., 0.}; dulist_buf1[0] = {0., 0.}; }); for (int j = 1; j <= twojmax; j++) { int jju = idxu_block[j]; int jjup = idxu_block[j-1]; // flatten the loop over ma,mb // for (int ma = 0; ma <= j; ma++) const int n_ma = j+1; // for (int mb = 0; 2*mb <= j; mb++) const int n_mb = j/2+1; const int total_iters = n_ma * n_mb; double dedr_sum = 0.; // j-local sum //for (int m = 0; m < total_iters; m++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), [&] (const int m, double& sum_tmp) { // ma fast, mb slow int ma = m % n_ma; int mb = m / n_ma; const int jju_index = jju+m; // Load y_local, apply the symmetry scaling factor // The "secret" of the shared memory optimization is it eliminates // all global memory reads to duidrj in lieu of caching values in // shared memory and otherwise always writing, making the kernel // ultimately compute bound. We take advantage of that by adding // some reads back in. auto y_local = ylist(jju_index,iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright // else the ma < mb gets "double counted", cancelling the 0.5. } // index into shared memory const int jju_shared_idx = m; const int jjup_shared_idx = jju_shared_idx - mb; // Need to compute and accumulate both u and du (mayhaps, we could probably // balance some read and compute by reading u each time). SNAcomplex u_accum = { 0., 0. }; SNAcomplex du_accum = { 0., 0. }; const double rootpq2 = -rootpqarray(ma, j - mb); const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); caconjxpy(b, u_up2, u_accum); const double rootpq1 = rootpqarray(j - ma, j - mb); const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); caconjxpy(a, u_up1, u_accum); // Next, spin up du_accum const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); caconjxpy(da, u_up1, du_accum); caconjxpy(a, du_up1, du_accum); const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); caconjxpy(db, u_up2, du_accum); caconjxpy(b, du_up2, du_accum); // No need to save u_accum to global memory // Cache u_accum, du_accum to scratch memory. ulist_buf2[jju_shared_idx] = u_accum; dulist_buf2[jju_shared_idx] = du_accum; // Directly accumulate deidrj into sum_tmp //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) if (j%2==1 && mb+1==n_mb) { int sign_factor = (((ma+mb)%2==0)?1:-1); //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); if (sign_factor == 1) { u_accum.im = -u_accum.im; du_accum.im = -du_accum.im; } else { u_accum.re = -u_accum.re; du_accum.re = -du_accum.re; } // We don't need the second half of the tile for the deidrj accumulation. // That's taken care of by the symmetry factor above. //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); // We do need it for ortho polynomial generation, though ulist_buf2[jju_shared_flip] = u_accum; dulist_buf2[jju_shared_flip] = du_accum; } }, dedr_sum); // swap buffers auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; // Accumulate dedr. This "should" be in a single, but // a Kokkos::single call implies a warp sync, and we may // as well avoid that. This does no harm as long as the // final assignment is in a single block. //Kokkos::single(Kokkos::PerThread(team), [=]() { dedr_full_sum += dedr_sum; //}); } // Store the accumulated dedr. Kokkos::single(Kokkos::PerThread(team), [&] () { dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; }); } /* ---------------------------------------------------------------------- compute dEidRj, CPU path only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { t_scalar3 final_sum; //for(int j = 0; j <= twojmax; j++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), [&] (const int& j, t_scalar3& sum_tmp) { int jju = idxu_block[j]; for(int mb = 0; 2*mb < j; mb++) for(int ma = 0; ma <= j; ma++) { sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im; sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im; sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im; jju++; } //end loop over ma mb // For j even, handle middle column if (j%2 == 0) { int mb = j/2; for(int ma = 0; ma < mb; ma++) { sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im; sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im; sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im; jju++; } //int ma = mb; sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im)*0.5; sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im)*0.5; sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im)*0.5; } // end if jeven },final_sum); // end loop over j Kokkos::single(Kokkos::PerThread(team), [&] () { dedr(iatom,jnbor,0) = final_sum.x*2.0; dedr(iatom,jnbor,1) = final_sum.y*2.0; dedr(iatom,jnbor,2) = final_sum.z*2.0; }); } /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax // for j = |j1-j2|,Min(twojmax,j1+j2),2 // b(j1,j2,j) = 0 // for mb = 0,...,jmid // for ma = 0,...,j // b(j1,j2,j) += // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max), [&] (const int& jjb) { //for(int jjb = 0; jjb < idxb_max; jjb++) { const int j1 = idxb(jjb,0); const int j2 = idxb(jjb,1); const int j = idxb(jjb,2); int jjz = idxz_block(j1,j2,j); int jju = idxu_block[j]; double sumzu = 0.0; double sumzu_temp = 0.0; const int bound = (j+2)/2; Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), [&] (const int mbma, double& sum) { //for(int mb = 0; 2*mb < j; mb++) //for(int ma = 0; ma <= j; ma++) { const int ma = mbma%(j+1); const int mb = mbma/(j+1); const int jju_index = jju+mb*(j+1)+ma; const int jjz_index = jjz+mb*(j+1)+ma; if (2*mb == j) return; sum += ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re + ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im; },sumzu_temp); // end loop over ma, mb sumzu += sumzu_temp; // For j even, special treatment for middle column if (j%2 == 0) { const int mb = j/2; Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb), [&] (const int ma, double& sum) { //for(int ma = 0; ma < mb; ma++) { const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; sum += ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re + ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im; },sumzu_temp); // end loop over ma sumzu += sumzu_temp; const int ma = mb; const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; sumzu += 0.5* (ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re + ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im); } // end if jeven Kokkos::single(Kokkos::PerThread(team), [&] () { sumzu *= 2.0; // apply bzero shift if (bzero_flag) sumzu -= bzero[j]; blist(jjb,iatom) = sumzu; }); }); //} // end loop over j //} // end loop over j1, j2 } /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { double rsq, r, x, y, z, z0, theta0, cs, sn; double dz0dr; x = rij(iatom,jnbor,0); y = rij(iatom,jnbor,1); z = rij(iatom,jnbor,2); rsq = x * x + y * y + z * z; r = sqrt(rsq); double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); theta0 = (r - rmin0) * rscale0; cs = cos(theta0); sn = sin(theta0); z0 = r * cs / sn; dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); } /* ---------------------------------------------------------------------- add Wigner U-functions for one neighbor to the total ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, double r, double wj, double rcut) { const double sfac = compute_sfac(r, rcut) * wj; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)), [&] (const int& i) { Kokkos::atomic_add(&(ulisttot(i,iatom).re), sfac * ulist(i,iatom,jnbor).re); Kokkos::atomic_add(&(ulisttot(i,iatom).im), sfac * ulist(i,iatom,jnbor).im); }); } /* ---------------------------------------------------------------------- compute Wigner U-functions for one neighbor ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, double x, double y, double z, double z0, double r) { double r0inv; double a_r, b_r, a_i, b_i; double rootpq; // compute Cayley-Klein parameters for unit quaternion r0inv = 1.0 / sqrt(r * r + z0 * z0); a_r = r0inv * z0; a_i = -r0inv * z; b_r = r0inv * y; b_i = -r0inv * x; // VMK Section 4.8.2 ulist(0,iatom,jnbor).re = 1.0; ulist(0,iatom,jnbor).im = 0.0; for (int j = 1; j <= twojmax; j++) { int jju = idxu_block[j]; int jjup = idxu_block[j-1]; // fill in left side of matrix layer from previous layer Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { //for (int mb = 0; 2*mb <= j; mb++) { const int jju_index = jju+mb+mb*j; ulist(jju_index,iatom,jnbor).re = 0.0; ulist(jju_index,iatom,jnbor).im = 0.0; for (int ma = 0; ma < j; ma++) { const int jju_index = jju+mb+mb*j+ma; const int jjup_index = jjup+mb*j+ma; rootpq = rootpqarray(j - ma,j - mb); ulist(jju_index,iatom,jnbor,jju).re += rootpq * (a_r * ulist(jjup_index,iatom,jnbor).re + a_i * ulist(jjup_index,iatom,jnbor).im); ulist(jju_index,iatom,jnbor).im += rootpq * (a_r * ulist(jjup_index,iatom,jnbor).im - a_i * ulist(jjup_index,iatom,jnbor).re); rootpq = rootpqarray(ma + 1,j - mb); ulist(jju_index+1,iatom,jnbor).re = -rootpq * (b_r * ulist(jjup_index,iatom,jnbor).re + b_i * ulist(jjup_index,iatom,jnbor).im); ulist(jju_index+1,iatom,jnbor).im = -rootpq * (b_r * ulist(jjup_index,iatom,jnbor).im - b_i * ulist(jjup_index,iatom,jnbor).re); } }); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) jju = idxu_block[j]; jjup = jju+(j+1)*(j+1)-1; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { // for (int mb = 0; 2*mb <= j; mb++) { int mbpar = (mb)%2==0?1:-1; int mapar = mbpar; for (int ma = 0; ma <= j; ma++) { const int jju_index = jju+mb*(j+1)+ma; const int jjup_index = jjup-mb*(j+1)-ma; if (mapar == 1) { ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re; ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im; } else { ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re; ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im; } mapar = -mapar; } }); } } /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor see comments in compute_uarray_cpu() ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, double x, double y, double z, double z0, double r, double dz0dr, double wj, double rcut) { double r0inv; double a_r, a_i, b_r, b_i; double da_r[3], da_i[3], db_r[3], db_i[3]; double dz0[3], dr0inv[3], dr0invdr; double rootpq; double rinv = 1.0 / r; double ux = x * rinv; double uy = y * rinv; double uz = z * rinv; r0inv = 1.0 / sqrt(r * r + z0 * z0); a_r = z0 * r0inv; a_i = -z * r0inv; b_r = y * r0inv; b_i = -x * r0inv; dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); dr0inv[0] = dr0invdr * ux; dr0inv[1] = dr0invdr * uy; dr0inv[2] = dr0invdr * uz; dz0[0] = dz0dr * ux; dz0[1] = dz0dr * uy; dz0[2] = dz0dr * uz; for (int k = 0; k < 3; k++) { da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k]; da_i[k] = -z * dr0inv[k]; } da_i[2] += -r0inv; for (int k = 0; k < 3; k++) { db_r[k] = y * dr0inv[k]; db_i[k] = -x * dr0inv[k]; } db_i[0] += -r0inv; db_r[1] += r0inv; dulist(0,iatom,jnbor,0).re = 0.0; dulist(0,iatom,jnbor,1).re = 0.0; dulist(0,iatom,jnbor,2).re = 0.0; dulist(0,iatom,jnbor,0).im = 0.0; dulist(0,iatom,jnbor,1).im = 0.0; dulist(0,iatom,jnbor,2).im = 0.0; for (int j = 1; j <= twojmax; j++) { int jju = idxu_block[j]; int jjup = idxu_block[j-1]; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { //for (int mb = 0; 2*mb <= j; mb++) { const int jju_index = jju+mb+mb*j; dulist(jju_index,iatom,jnbor,0).re = 0.0; dulist(jju_index,iatom,jnbor,1).re = 0.0; dulist(jju_index,iatom,jnbor,2).re = 0.0; dulist(jju_index,iatom,jnbor,0).im = 0.0; dulist(jju_index,iatom,jnbor,1).im = 0.0; dulist(jju_index,iatom,jnbor,2).im = 0.0; for (int ma = 0; ma < j; ma++) { const int jju_index = jju+mb+mb*j+ma; const int jjup_index = jjup+mb*j+ma; rootpq = rootpqarray(j - ma,j - mb); for (int k = 0; k < 3; k++) { dulist(jju_index,iatom,jnbor,k).re += rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re + da_i[k] * ulist(jjup_index,iatom,jnbor).im + a_r * dulist(jjup_index,iatom,jnbor,k).re + a_i * dulist(jjup_index,iatom,jnbor,k).im); dulist(jju_index,iatom,jnbor,k).im += rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im - da_i[k] * ulist(jjup_index,iatom,jnbor).re + a_r * dulist(jjup_index,iatom,jnbor,k).im - a_i * dulist(jjup_index,iatom,jnbor,k).re); } rootpq = rootpqarray(ma + 1,j - mb); for (int k = 0; k < 3; k++) { dulist(jju_index+1,iatom,jnbor,k).re = -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re + db_i[k] * ulist(jjup_index,iatom,jnbor).im + b_r * dulist(jjup_index,iatom,jnbor,k).re + b_i * dulist(jjup_index,iatom,jnbor,k).im); dulist(jju_index+1,iatom,jnbor,k).im = -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im - db_i[k] * ulist(jjup_index,iatom,jnbor).re + b_r * dulist(jjup_index,iatom,jnbor,k).im - b_i * dulist(jjup_index,iatom,jnbor,k).re); } } }); // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) jju = idxu_block[j]; jjup = jju+(j+1)*(j+1)-1; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), [&] (const int& mb) { // for (int mb = 0; 2*mb <= j; mb++) { int mbpar = (mb)%2==0?1:-1; int mapar = mbpar; for (int ma = 0; ma <= j; ma++) { const int jju_index = jju+mb*(j+1)+ma; const int jjup_index = jjup-mb*(j+1)-ma; if (mapar == 1) { for (int k = 0; k < 3; k++) { dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re; dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im; } } else { for (int k = 0; k < 3; k++) { dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re; dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im; } } mapar = -mapar; } }); } double sfac = compute_sfac(r, rcut); double dsfac = compute_dsfac(r, rcut); sfac *= wj; dsfac *= wj; for (int j = 0; j <= twojmax; j++) { int jju = idxu_block[j]; for (int mb = 0; 2*mb <= j; mb++) for (int ma = 0; ma <= j; ma++) { dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux + sfac * dulist(jju,iatom,jnbor,0).re; dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux + sfac * dulist(jju,iatom,jnbor,0).im; dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy + sfac * dulist(jju,iatom,jnbor,1).re; dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy + sfac * dulist(jju,iatom,jnbor,1).im; dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz + sfac * dulist(jju,iatom,jnbor,2).re; dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz + sfac * dulist(jju,iatom,jnbor,2).im; jju++; } } } /* ---------------------------------------------------------------------- factorial n, wrapper for precomputed table ------------------------------------------------------------------------- */ template inline double SNAKokkos::factorial(int n) { //if (n < 0 || n > nmaxfactorial) { // char str[128]; // sprintf(str, "Invalid argument to factorial %d", n); // error->all(FLERR, str); //} return nfac_table[n]; } /* ---------------------------------------------------------------------- factorial n table, size SNA::nmaxfactorial+1 ------------------------------------------------------------------------- */ template const double SNAKokkos::nfac_table[] = { 1, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880, 3628800, 39916800, 479001600, 6227020800, 87178291200, 1307674368000, 20922789888000, 355687428096000, 6.402373705728e+15, 1.21645100408832e+17, 2.43290200817664e+18, 5.10909421717094e+19, 1.12400072777761e+21, 2.5852016738885e+22, 6.20448401733239e+23, 1.5511210043331e+25, 4.03291461126606e+26, 1.08888694504184e+28, 3.04888344611714e+29, 8.8417619937397e+30, 2.65252859812191e+32, 8.22283865417792e+33, 2.63130836933694e+35, 8.68331761881189e+36, 2.95232799039604e+38, 1.03331479663861e+40, 3.71993326789901e+41, 1.37637530912263e+43, 5.23022617466601e+44, 2.03978820811974e+46, 8.15915283247898e+47, 3.34525266131638e+49, 1.40500611775288e+51, 6.04152630633738e+52, 2.65827157478845e+54, 1.1962222086548e+56, 5.50262215981209e+57, 2.58623241511168e+59, 1.24139155925361e+61, 6.08281864034268e+62, 3.04140932017134e+64, 1.55111875328738e+66, 8.06581751709439e+67, 4.27488328406003e+69, 2.30843697339241e+71, 1.26964033536583e+73, 7.10998587804863e+74, 4.05269195048772e+76, 2.35056133128288e+78, 1.3868311854569e+80, 8.32098711274139e+81, 5.07580213877225e+83, 3.14699732603879e+85, 1.98260831540444e+87, 1.26886932185884e+89, 8.24765059208247e+90, 5.44344939077443e+92, 3.64711109181887e+94, 2.48003554243683e+96, 1.71122452428141e+98, 1.19785716699699e+100, 8.50478588567862e+101, 6.12344583768861e+103, 4.47011546151268e+105, 3.30788544151939e+107, 2.48091408113954e+109, 1.88549470166605e+111, 1.45183092028286e+113, 1.13242811782063e+115, 8.94618213078297e+116, 7.15694570462638e+118, 5.79712602074737e+120, 4.75364333701284e+122, 3.94552396972066e+124, 3.31424013456535e+126, 2.81710411438055e+128, 2.42270953836727e+130, 2.10775729837953e+132, 1.85482642257398e+134, 1.65079551609085e+136, 1.48571596448176e+138, 1.3520015276784e+140, 1.24384140546413e+142, 1.15677250708164e+144, 1.08736615665674e+146, 1.03299784882391e+148, 9.91677934870949e+149, 9.61927596824821e+151, 9.42689044888324e+153, 9.33262154439441e+155, 9.33262154439441e+157, 9.42594775983835e+159, 9.61446671503512e+161, 9.90290071648618e+163, 1.02990167451456e+166, 1.08139675824029e+168, 1.14628056373471e+170, 1.22652020319614e+172, 1.32464181945183e+174, 1.44385958320249e+176, 1.58824554152274e+178, 1.76295255109024e+180, 1.97450685722107e+182, 2.23119274865981e+184, 2.54355973347219e+186, 2.92509369349301e+188, 3.3931086844519e+190, 3.96993716080872e+192, 4.68452584975429e+194, 5.5745857612076e+196, 6.68950291344912e+198, 8.09429852527344e+200, 9.8750442008336e+202, 1.21463043670253e+205, 1.50614174151114e+207, 1.88267717688893e+209, 2.37217324288005e+211, 3.01266001845766e+213, 3.8562048236258e+215, 4.97450422247729e+217, 6.46685548922047e+219, 8.47158069087882e+221, 1.118248651196e+224, 1.48727070609069e+226, 1.99294274616152e+228, 2.69047270731805e+230, 3.65904288195255e+232, 5.01288874827499e+234, 6.91778647261949e+236, 9.61572319694109e+238, 1.34620124757175e+241, 1.89814375907617e+243, 2.69536413788816e+245, 3.85437071718007e+247, 5.5502938327393e+249, 8.04792605747199e+251, 1.17499720439091e+254, 1.72724589045464e+256, 2.55632391787286e+258, 3.80892263763057e+260, 5.71338395644585e+262, 8.62720977423323e+264, 1.31133588568345e+267, 2.00634390509568e+269, 3.08976961384735e+271, 4.78914290146339e+273, 7.47106292628289e+275, 1.17295687942641e+278, 1.85327186949373e+280, 2.94670227249504e+282, 4.71472363599206e+284, 7.59070505394721e+286, 1.22969421873945e+289, 2.0044015765453e+291, 3.28721858553429e+293, 5.42391066613159e+295, 9.00369170577843e+297, 1.503616514865e+300, // nmaxfactorial = 167 }; /* ---------------------------------------------------------------------- the function delta given by VMK Eq. 8.2(1) ------------------------------------------------------------------------- */ template inline double SNAKokkos::deltacg(int j1, int j2, int j) { double sfaccg = factorial((j1 + j2 + j) / 2 + 1); return sqrt(factorial((j1 + j2 - j) / 2) * factorial((j1 - j2 + j) / 2) * factorial((-j1 + j2 + j) / 2) / sfaccg); } /* ---------------------------------------------------------------------- assign Clebsch-Gordan coefficients using the quasi-binomial formula VMK 8.2.1(3) ------------------------------------------------------------------------- */ template inline void SNAKokkos::init_clebsch_gordan() { auto h_cglist = Kokkos::create_mirror_view(cglist); double sum,dcg,sfaccg; int m, aa2, bb2, cc2; int ifac; int idxcg_count = 0; for(int j1 = 0; j1 <= twojmax; j1++) for(int j2 = 0; j2 <= j1; j2++) for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { for (int m1 = 0; m1 <= j1; m1++) { aa2 = 2 * m1 - j1; for (int m2 = 0; m2 <= j2; m2++) { // -c <= cc <= c bb2 = 2 * m2 - j2; m = (aa2 + bb2 + j) / 2; if(m < 0 || m > j) { h_cglist[idxcg_count] = 0.0; idxcg_count++; continue; } sum = 0.0; for (int z = MAX(0, MAX(-(j - j2 + aa2) / 2, -(j - j1 - bb2) / 2)); z <= MIN((j1 + j2 - j) / 2, MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); z++) { ifac = z % 2 ? -1 : 1; sum += ifac / (factorial(z) * factorial((j1 + j2 - j) / 2 - z) * factorial((j1 - aa2) / 2 - z) * factorial((j2 + bb2) / 2 - z) * factorial((j - j2 + aa2) / 2 + z) * factorial((j - j1 - bb2) / 2 + z)); } cc2 = 2 * m - j; dcg = deltacg(j1, j2, j); sfaccg = sqrt(factorial((j1 + aa2) / 2) * factorial((j1 - aa2) / 2) * factorial((j2 + bb2) / 2) * factorial((j2 - bb2) / 2) * factorial((j + cc2) / 2) * factorial((j - cc2) / 2) * (j + 1)); h_cglist[idxcg_count] = sum * dcg * sfaccg; idxcg_count++; } } } Kokkos::deep_copy(cglist,h_cglist); } /* ---------------------------------------------------------------------- pre-compute table of sqrt[p/m2], p, q = 1,twojmax the p = 0, q = 0 entries are allocated and skipped for convenience. ------------------------------------------------------------------------- */ template inline void SNAKokkos::init_rootpqarray() { auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray); for (int p = 1; p <= twojmax; p++) for (int q = 1; q <= twojmax; q++) h_rootpqarray(p,q) = sqrt(static_cast(p)/q); Kokkos::deep_copy(rootpqarray,h_rootpqarray); } /* ---------------------------------------------------------------------- */ template inline int SNAKokkos::compute_ncoeff() { int ncount; ncount = 0; for (int j1 = 0; j1 <= twojmax; j1++) for (int j2 = 0; j2 <= j1; j2++) for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) if (j >= j1) ncount++; return ncount; } /* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION double SNAKokkos::compute_sfac(double r, double rcut) { if (switch_flag == 0) return 1.0; if (switch_flag == 1) { if(r <= rmin0) return 1.0; else if(r > rcut) return 0.0; else { double rcutfac = MY_PI / (rcut - rmin0); return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0); } } return 0.0; } /* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION double SNAKokkos::compute_dsfac(double r, double rcut) { if (switch_flag == 0) return 0.0; if (switch_flag == 1) { if(r <= rmin0) return 0.0; else if(r > rcut) return 0.0; else { double rcutfac = MY_PI / (rcut - rmin0); return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac; } } return 0.0; } /* ---------------------------------------------------------------------- */ // efficient complex FMA (i.e., y += a x) template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { y.re += a.re * x.re; y.re -= a.im * x.im; y.im += a.im * x.re; y.im += a.re * x.im; } /* ---------------------------------------------------------------------- */ // efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x) template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { y.re += a.re * x.re; y.re += a.im * x.im; y.im -= a.im * x.re; y.im += a.re * x.im; } /* ---------------------------------------------------------------------- */ // set direction of batched Duidrj template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::set_dir(int dir_) { dir = dir_; } /* ---------------------------------------------------------------------- memory usage of arrays ------------------------------------------------------------------------- */ template double SNAKokkos::memory_usage() { int jdimpq = twojmax + 2; int jdim = twojmax + 1; double bytes; bytes = 0; bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist #ifdef KOKKOS_ENABLE_CUDA if (!std::is_same::value) { #endif bytes += natom * idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist #ifdef KOKKOS_ENABLE_CUDA } #endif bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot if (!Kokkos::Impl::is_same::value) bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr bytes += natom * idxz_max * sizeof(double) * 2; // zlist bytes += natom * idxb_max * sizeof(double); // blist bytes += natom * idxu_max * sizeof(double) * 2; // ylist bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * sizeof(int); // idxu_block bytes += jdim * jdim * jdim * sizeof(int); // idxz_block bytes += jdim * jdim * jdim * sizeof(int); // idxb_block bytes += idxz_max * 10 * sizeof(int); // idxz bytes += idxb_max * 3 * sizeof(int); // idxb bytes += jdim * sizeof(double); // bzero bytes += natom * nmax * 3 * sizeof(double); // rij bytes += natom * nmax * sizeof(int); // inside bytes += natom * nmax * sizeof(double); // wj bytes += natom * nmax * sizeof(double); // rcutij bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij return bytes; } } // namespace LAMMPS_NS