1673 lines
55 KiB
C++
1673 lines
55 KiB
C++
/* ----------------------------------------------------------------------
|
|
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
|
http://lammps.sandia.gov, Sandia National Laboratories
|
|
Steve Plimpton, sjplimp@sandia.gov
|
|
|
|
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
|
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
|
certain rights in this software. This software is distributed under
|
|
the GNU General Public License.
|
|
|
|
See the README file in the top-level LAMMPS directory.
|
|
------------------------------------------------------------------------- */
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Contributing authors: Christian Trott (SNL), Stan Moore (SNL)
|
|
------------------------------------------------------------------------- */
|
|
|
|
#include "sna_kokkos.h"
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <cstdlib>
|
|
#include <type_traits>
|
|
|
|
namespace LAMMPS_NS {
|
|
|
|
static const double MY_PI = 3.14159265358979323846; // pi
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
|
|
int twojmax_in,
|
|
double rmin0_in, int switch_flag_in, int bzero_flag_in)
|
|
{
|
|
wself = 1.0;
|
|
|
|
rfac0 = rfac0_in;
|
|
rmin0 = rmin0_in;
|
|
switch_flag = switch_flag_in;
|
|
bzero_flag = bzero_flag_in;
|
|
|
|
twojmax = twojmax_in;
|
|
|
|
ncoeff = compute_ncoeff();
|
|
|
|
nmax = 0;
|
|
|
|
build_indexlist();
|
|
|
|
int jdimpq = twojmax + 2;
|
|
rootpqarray = t_sna_2d("SNAKokkos::rootpqarray",jdimpq,jdimpq);
|
|
|
|
cglist = t_sna_1d("SNAKokkos::cglist",idxcg_max);
|
|
|
|
if (bzero_flag) {
|
|
bzero = Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>("sna:bzero",twojmax+1);
|
|
auto h_bzero = Kokkos::create_mirror_view(bzero);
|
|
|
|
double www = wself*wself*wself;
|
|
for(int j = 0; j <= twojmax; j++)
|
|
h_bzero[j] = www*(j+1);
|
|
Kokkos::deep_copy(bzero,h_bzero);
|
|
}
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
SNAKokkos<DeviceType>::~SNAKokkos()
|
|
{
|
|
}
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
void SNAKokkos<DeviceType>::build_indexlist()
|
|
{
|
|
// index list for cglist
|
|
|
|
int jdim = twojmax + 1;
|
|
idxcg_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxcg_block",jdim,jdim,jdim);
|
|
auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block);
|
|
|
|
int idxcg_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
|
|
h_idxcg_block(j1,j2,j) = idxcg_count;
|
|
for (int m1 = 0; m1 <= j1; m1++)
|
|
for (int m2 = 0; m2 <= j2; m2++)
|
|
idxcg_count++;
|
|
}
|
|
idxcg_max = idxcg_count;
|
|
Kokkos::deep_copy(idxcg_block,h_idxcg_block);
|
|
|
|
// index list for uarray
|
|
// need to include both halves
|
|
|
|
idxu_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_block",jdim);
|
|
auto h_idxu_block = Kokkos::create_mirror_view(idxu_block);
|
|
|
|
int idxu_count = 0;
|
|
|
|
for(int j = 0; j <= twojmax; j++) {
|
|
h_idxu_block[j] = idxu_count;
|
|
for(int mb = 0; mb <= j; mb++)
|
|
for(int ma = 0; ma <= j; ma++)
|
|
idxu_count++;
|
|
}
|
|
idxu_max = idxu_count;
|
|
Kokkos::deep_copy(idxu_block,h_idxu_block);
|
|
|
|
// index list for beta and B
|
|
|
|
int idxb_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
|
|
if (j >= j1) idxb_count++;
|
|
|
|
idxb_max = idxb_count;
|
|
idxb = Kokkos::View<int*[3], DeviceType>("SNAKokkos::idxb",idxb_max);
|
|
auto h_idxb = Kokkos::create_mirror_view(idxb);
|
|
|
|
idxb_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
|
|
if (j >= j1) {
|
|
h_idxb(idxb_count,0) = j1;
|
|
h_idxb(idxb_count,1) = j2;
|
|
h_idxb(idxb_count,2) = j;
|
|
idxb_count++;
|
|
}
|
|
Kokkos::deep_copy(idxb,h_idxb);
|
|
|
|
// reverse index list for beta and b
|
|
|
|
idxb_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxb_block",jdim,jdim,jdim);
|
|
auto h_idxb_block = Kokkos::create_mirror_view(idxb_block);
|
|
|
|
idxb_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
|
|
if (j >= j1) {
|
|
h_idxb_block(j1,j2,j) = idxb_count;
|
|
idxb_count++;
|
|
}
|
|
}
|
|
Kokkos::deep_copy(idxb_block,h_idxb_block);
|
|
|
|
// index list for zlist
|
|
|
|
int idxz_count = 0;
|
|
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
|
|
for (int mb = 0; 2*mb <= j; mb++)
|
|
for (int ma = 0; ma <= j; ma++)
|
|
idxz_count++;
|
|
|
|
idxz_max = idxz_count;
|
|
idxz = Kokkos::View<int*[10], DeviceType>("SNAKokkos::idxz",idxz_max);
|
|
auto h_idxz = Kokkos::create_mirror_view(idxz);
|
|
|
|
idxz_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxz_block", jdim,jdim,jdim);
|
|
auto h_idxz_block = Kokkos::create_mirror_view(idxz_block);
|
|
|
|
idxz_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
|
|
h_idxz_block(j1,j2,j) = idxz_count;
|
|
|
|
// find right beta(ii,jjb) entry
|
|
// multiply and divide by j+1 factors
|
|
// account for multiplicity of 1, 2, or 3
|
|
|
|
for (int mb = 0; 2*mb <= j; mb++)
|
|
for (int ma = 0; ma <= j; ma++) {
|
|
h_idxz(idxz_count,0) = j1;
|
|
h_idxz(idxz_count,1) = j2;
|
|
h_idxz(idxz_count,2) = j;
|
|
h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2);
|
|
h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2;
|
|
h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2);
|
|
h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2;
|
|
h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1;
|
|
h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;
|
|
|
|
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
|
|
|
const int jju = h_idxu_block[j] + (j+1)*mb + ma;
|
|
h_idxz(idxz_count,9) = jju;
|
|
|
|
idxz_count++;
|
|
}
|
|
}
|
|
Kokkos::deep_copy(idxz,h_idxz);
|
|
Kokkos::deep_copy(idxz_block,h_idxz_block);
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
void SNAKokkos<DeviceType>::init()
|
|
{
|
|
init_clebsch_gordan();
|
|
init_rootpqarray();
|
|
}
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
|
|
{
|
|
if(newnatom <= natom && newnmax <= nmax) return;
|
|
natom = newnatom;
|
|
nmax = newnmax;
|
|
|
|
rij = t_sna_3d("sna:rij",natom,nmax,3);
|
|
inside = t_sna_2i("sna:inside",natom,nmax);
|
|
wj = t_sna_2d("sna:wj",natom,nmax);
|
|
rcutij = t_sna_2d("sna:rcutij",natom,nmax);
|
|
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
|
|
|
|
blist = t_sna_2d_ll("sna:blist",idxb_max,natom);
|
|
//ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
|
|
ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max,natom);
|
|
|
|
zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom);
|
|
|
|
//ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
|
|
// dummy allocation
|
|
ulist = t_sna_3c_ll("sna:ulist",1,1,1);
|
|
dulist = t_sna_4c_ll("sna:dulist",1,1,1);
|
|
} else {
|
|
#endif
|
|
ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
|
|
dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
}
|
|
#endif
|
|
|
|
//ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
|
|
ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom);
|
|
|
|
dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
* compute Ui by summing over neighbors j
|
|
* ------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom)
|
|
{
|
|
for (int j = 0; j <= twojmax; j++) {
|
|
const int jju = idxu_block(j);
|
|
|
|
// Only diagonal elements get initialized
|
|
// for (int m = 0; m < (j+1)*(j+1); m++)
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
|
|
[&] (const int m) {
|
|
|
|
const int jjup = jju + m;
|
|
|
|
// if m is on the "diagonal", initialize it with the self energy.
|
|
// Otherwise zero it out
|
|
SNAcomplex init = {0., 0.};
|
|
if (m % (j+2) == 0) { init = {wself, 0.0}; }
|
|
|
|
ulisttot(jjup, iatom) = init;
|
|
});
|
|
}
|
|
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Ui by computing Wigner U-functions for one neighbor and
|
|
accumulating to the total. GPU only.
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
|
|
{
|
|
|
|
// utot(j,ma,mb) = 0 for all j,ma,ma
|
|
// utot(j,ma,ma) = 1 for all j,ma
|
|
// for j in neighbors of i:
|
|
// compute r0 = (x,y,z,z0)
|
|
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
|
|
|
|
// get shared memory offset
|
|
const int max_m_tile = (twojmax+1)*(twojmax+1);
|
|
const int team_rank = team.team_rank();
|
|
const int scratch_shift = team_rank * max_m_tile;
|
|
|
|
// double buffer
|
|
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
|
|
const double x = rij(iatom,jnbor,0);
|
|
const double y = rij(iatom,jnbor,1);
|
|
const double z = rij(iatom,jnbor,2);
|
|
|
|
const double wj_local = wj(iatom, jnbor);
|
|
const double rcut = rcutij(iatom, jnbor);
|
|
|
|
const double rsq = x * x + y * y + z * z;
|
|
const double r = sqrt(rsq);
|
|
|
|
const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
|
// theta0 = (r - rmin0) * rscale0;
|
|
const double cs = cos(theta0);
|
|
const double sn = sin(theta0);
|
|
const double z0 = r * cs / sn; // r / tan(theta0)
|
|
|
|
// Compute cutoff function
|
|
const double sfac = compute_sfac(r, rcut) * wj_local;
|
|
|
|
// compute Cayley-Klein parameters for unit quaternion,
|
|
// pack into complex number
|
|
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
|
const SNAcomplex a = { r0inv * z0, -r0inv * z };
|
|
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
|
|
|
// VMK Section 4.8.2
|
|
|
|
// All writes go to global memory and shared memory
|
|
// so we can avoid all global memory reads
|
|
Kokkos::single(Kokkos::PerThread(team), [=]() {
|
|
//ulist(0,iatom,jnbor) = { 1.0, 0.0 };
|
|
buf1[0] = {1.,0.};
|
|
Kokkos::atomic_add(&(ulisttot(0,iatom).re), sfac);
|
|
});
|
|
|
|
for (int j = 1; j <= twojmax; j++) {
|
|
const int jju = idxu_block[j];
|
|
const int jjup = idxu_block[j-1];
|
|
|
|
// fill in left side of matrix layer from previous layer
|
|
|
|
// Flatten loop over ma, mb, need to figure out total
|
|
// number of iterations
|
|
|
|
// for (int ma = 0; ma <= j; ma++)
|
|
const int n_ma = j+1;
|
|
// for (int mb = 0; 2*mb <= j; mb++)
|
|
const int n_mb = j/2+1;
|
|
|
|
// the last (j / 2) can be avoided due to symmetry
|
|
const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
|
|
|
|
//for (int m = 0; m < total_iters; m++) {
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
|
|
[&] (const int m) {
|
|
|
|
// ma fast, mb slow
|
|
int ma = m % n_ma;
|
|
int mb = m / n_ma;
|
|
|
|
// index into global memory array
|
|
const int jju_index = jju+m;
|
|
//const int jjup_index = jjup+mb*j+ma;
|
|
|
|
// index into shared memory buffer for this level
|
|
const int jju_shared_idx = m;
|
|
|
|
// index into shared memory buffer for next level
|
|
const int jjup_shared_idx = jju_shared_idx - mb;
|
|
|
|
SNAcomplex u_accum = {0., 0.};
|
|
|
|
// VMK recursion relation: grab contribution which is multiplied by b*
|
|
const double rootpq2 = -rootpqarray(ma, j - mb);
|
|
const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
|
|
//const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
|
|
caconjxpy(b, u_up2, u_accum);
|
|
|
|
// VMK recursion relation: grab contribution which is multiplied by a*
|
|
const double rootpq1 = rootpqarray(j - ma, j - mb);
|
|
const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
|
|
//const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.);
|
|
caconjxpy(a, u_up1, u_accum);
|
|
|
|
//ulist(jju_index,iatom,jnbor) = u_accum;
|
|
// back up into shared memory for next iter
|
|
buf2[jju_shared_idx] = u_accum;
|
|
|
|
Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re);
|
|
Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im);
|
|
|
|
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
|
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
|
// if j is even (-> physical j integer), last element maps to self, skip
|
|
//if (!(m == total_iters - 1 && j % 2 == 0)) {
|
|
if (m < total_iters - 1 || j % 2 == 1) {
|
|
const int sign_factor = (((ma+mb)%2==0)?1:-1);
|
|
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
|
|
const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
|
|
|
|
|
|
if (sign_factor == 1) {
|
|
u_accum.im = -u_accum.im;
|
|
} else {
|
|
u_accum.re = -u_accum.re;
|
|
}
|
|
//ulist(jjup_flip,iatom,jnbor) = u_accum;
|
|
buf2[jju_shared_flip] = u_accum;
|
|
|
|
Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re);
|
|
Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im);
|
|
}
|
|
});
|
|
// In CUDA backend,
|
|
// ThreadVectorRange has a __syncwarp (appropriately masked for
|
|
// vector lengths < 32) implict at the end
|
|
|
|
// swap double buffers
|
|
auto tmp = buf1; buf1 = buf2; buf2 = tmp;
|
|
|
|
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Ui by summing over bispectrum components. CPU only.
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
|
{
|
|
double rsq, r, x, y, z, z0, theta0;
|
|
|
|
// utot(j,ma,mb) = 0 for all j,ma,ma
|
|
// utot(j,ma,ma) = 1 for all j,ma
|
|
// for j in neighbors of i:
|
|
// compute r0 = (x,y,z,z0)
|
|
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
|
|
|
|
x = rij(iatom,jnbor,0);
|
|
y = rij(iatom,jnbor,1);
|
|
z = rij(iatom,jnbor,2);
|
|
rsq = x * x + y * y + z * z;
|
|
r = sqrt(rsq);
|
|
|
|
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
|
// theta0 = (r - rmin0) * rscale0;
|
|
z0 = r / tan(theta0);
|
|
|
|
compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
|
|
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor));
|
|
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Zi by summing over products of Ui
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_zi(const int& iter)
|
|
{
|
|
const int iatom = iter / idxz_max;
|
|
const int jjz = iter % idxz_max;
|
|
|
|
const int j1 = idxz(jjz,0);
|
|
const int j2 = idxz(jjz,1);
|
|
const int j = idxz(jjz,2);
|
|
const int ma1min = idxz(jjz,3);
|
|
const int ma2max = idxz(jjz,4);
|
|
const int mb1min = idxz(jjz,5);
|
|
const int mb2max = idxz(jjz,6);
|
|
const int na = idxz(jjz,7);
|
|
const int nb = idxz(jjz,8);
|
|
|
|
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
|
|
|
|
zlist(jjz,iatom).re = 0.0;
|
|
zlist(jjz,iatom).im = 0.0;
|
|
|
|
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
|
|
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
|
|
int icgb = mb1min*(j2+1) + mb2max;
|
|
for(int ib = 0; ib < nb; ib++) {
|
|
|
|
double suma1_r = 0.0;
|
|
double suma1_i = 0.0;
|
|
|
|
int ma1 = ma1min;
|
|
int ma2 = ma2max;
|
|
int icga = ma1min*(j2+1) + ma2max;
|
|
for(int ia = 0; ia < na; ia++) {
|
|
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im);
|
|
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re);
|
|
ma1++;
|
|
ma2--;
|
|
icga += j2;
|
|
} // end loop over ia
|
|
|
|
zlist(jjz,iatom).re += cgblock[icgb] * suma1_r;
|
|
zlist(jjz,iatom).im += cgblock[icgb] * suma1_i;
|
|
|
|
jju1 += j1+1;
|
|
jju2 -= j2+1;
|
|
icgb += j2;
|
|
} // end loop over ib
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Yi from Ui without storing Zi, looping over zlist indices
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::zero_yi(const int& idx, const int& iatom)
|
|
{
|
|
ylist(idx,iatom) = {0.0, 0.0};
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Yi from Ui without storing Zi, looping over zlist indices
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_yi(int iter,
|
|
const Kokkos::View<F_FLOAT**, DeviceType> &beta)
|
|
{
|
|
double betaj;
|
|
const int iatom = iter / idxz_max;
|
|
const int jjz = iter % idxz_max;
|
|
|
|
const int j1 = idxz(jjz,0);
|
|
const int j2 = idxz(jjz,1);
|
|
const int j = idxz(jjz,2);
|
|
const int ma1min = idxz(jjz,3);
|
|
const int ma2max = idxz(jjz,4);
|
|
const int mb1min = idxz(jjz,5);
|
|
const int mb2max = idxz(jjz,6);
|
|
const int na = idxz(jjz,7);
|
|
const int nb = idxz(jjz,8);
|
|
const int jju = idxz(jjz,9);
|
|
|
|
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
|
|
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
|
|
//int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
|
|
|
|
double ztmp_r = 0.0;
|
|
double ztmp_i = 0.0;
|
|
|
|
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
|
|
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
|
|
int icgb = mb1min*(j2+1) + mb2max;
|
|
for(int ib = 0; ib < nb; ib++) {
|
|
|
|
double suma1_r = 0.0;
|
|
double suma1_i = 0.0;
|
|
|
|
int ma1 = ma1min;
|
|
int ma2 = ma2max;
|
|
int icga = ma1min*(j2+1) + ma2max;
|
|
|
|
for(int ia = 0; ia < na; ia++) {
|
|
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im);
|
|
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re);
|
|
ma1++;
|
|
ma2--;
|
|
icga += j2;
|
|
} // end loop over ia
|
|
|
|
ztmp_r += cgblock[icgb] * suma1_r;
|
|
ztmp_i += cgblock[icgb] * suma1_i;
|
|
jju1 += j1+1;
|
|
jju2 -= j2+1;
|
|
icgb += j2;
|
|
} // end loop over ib
|
|
|
|
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
|
|
// find right y_list[jju] and beta(iatom,jjb) entries
|
|
// multiply and divide by j+1 factors
|
|
// account for multiplicity of 1, 2, or 3
|
|
|
|
// pick out right beta value
|
|
|
|
if (j >= j1) {
|
|
const int jjb = idxb_block(j1,j2,j);
|
|
if (j1 == j) {
|
|
if (j2 == j) betaj = 3*beta(jjb,iatom);
|
|
else betaj = 2*beta(jjb,iatom);
|
|
} else betaj = beta(jjb,iatom);
|
|
} else if (j >= j2) {
|
|
const int jjb = idxb_block(j,j2,j1);
|
|
if (j2 == j) betaj = 2*beta(jjb,iatom)*(j1+1)/(j+1.0);
|
|
else betaj = beta(jjb,iatom)*(j1+1)/(j+1.0);
|
|
} else {
|
|
const int jjb = idxb_block(j2,j,j1);
|
|
betaj = beta(jjb,iatom)*(j1+1)/(j+1.0);
|
|
}
|
|
|
|
Kokkos::atomic_add(&(ylist(jju,iatom).re), betaj*ztmp_r);
|
|
Kokkos::atomic_add(&(ylist(jju,iatom).im), betaj*ztmp_i);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
Fused calculation of the derivative of Ui w.r.t. atom j
|
|
and of dEidRj. GPU only.
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
|
|
{
|
|
// get shared memory offset
|
|
const int max_m_tile = (twojmax+1)*(twojmax/2+1);
|
|
const int team_rank = team.team_rank();
|
|
const int scratch_shift = team_rank * max_m_tile;
|
|
|
|
// double buffer for ulist
|
|
SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
|
|
// double buffer for dulist
|
|
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
|
|
|
|
const double x = rij(iatom,jnbor,0);
|
|
const double y = rij(iatom,jnbor,1);
|
|
const double z = rij(iatom,jnbor,2);
|
|
const double rsq = x * x + y * y + z * z;
|
|
const double r = sqrt(rsq);
|
|
const double rcut = rcutij(iatom, jnbor);
|
|
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
|
|
const double theta0 = (r - rmin0) * rscale0;
|
|
const double cs = cos(theta0);
|
|
const double sn = sin(theta0);
|
|
const double z0 = r * cs / sn;
|
|
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
|
|
|
const double wj_local = wj(iatom, jnbor);
|
|
const double sfac = wj_local * compute_sfac(r, rcut);
|
|
const double dsfac = wj_local * compute_dsfac(r, rcut);
|
|
|
|
const double rinv = 1.0 / r;
|
|
|
|
// extract a single unit vector
|
|
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
|
|
|
|
// Compute Cayley-Klein parameters for unit quaternion
|
|
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
|
|
|
const SNAcomplex a = { r0inv * z0, -r0inv * z };
|
|
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
|
|
|
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
|
const double dr0inv = dr0invdr * u;
|
|
const double dz0 = dz0dr * u;
|
|
|
|
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
|
|
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
|
|
|
|
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
|
|
-x * dr0inv + (dir==0?-r0inv:0.) };
|
|
|
|
// Accumulate the full contribution to dedr on the fly
|
|
const double du_prod = dsfac * u; // chain rule
|
|
const SNAcomplex y_local = ylist(0, iatom);
|
|
|
|
// Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
|
|
double dedr_full_sum = 0.5 * du_prod * y_local.re;
|
|
|
|
// single has a warp barrier at the end
|
|
Kokkos::single(Kokkos::PerThread(team), [=]() {
|
|
//dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
|
|
ulist_buf1[0] = {1., 0.};
|
|
dulist_buf1[0] = {0., 0.};
|
|
});
|
|
|
|
for (int j = 1; j <= twojmax; j++) {
|
|
int jju = idxu_block[j];
|
|
int jjup = idxu_block[j-1];
|
|
|
|
// flatten the loop over ma,mb
|
|
|
|
// for (int ma = 0; ma <= j; ma++)
|
|
const int n_ma = j+1;
|
|
// for (int mb = 0; 2*mb <= j; mb++)
|
|
const int n_mb = j/2+1;
|
|
|
|
const int total_iters = n_ma * n_mb;
|
|
|
|
double dedr_sum = 0.; // j-local sum
|
|
|
|
//for (int m = 0; m < total_iters; m++) {
|
|
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
|
|
[&] (const int m, double& sum_tmp) {
|
|
|
|
// ma fast, mb slow
|
|
int ma = m % n_ma;
|
|
int mb = m / n_ma;
|
|
|
|
const int jju_index = jju+m;
|
|
|
|
// Load y_local, apply the symmetry scaling factor
|
|
// The "secret" of the shared memory optimization is it eliminates
|
|
// all global memory reads to duidrj in lieu of caching values in
|
|
// shared memory and otherwise always writing, making the kernel
|
|
// ultimately compute bound. We take advantage of that by adding
|
|
// some reads back in.
|
|
auto y_local = ylist(jju_index,iatom);
|
|
if (j % 2 == 0 && 2*mb == j) {
|
|
if (ma == mb) { y_local = 0.5*y_local; }
|
|
else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
|
|
// else the ma < mb gets "double counted", cancelling the 0.5.
|
|
}
|
|
|
|
// index into shared memory
|
|
const int jju_shared_idx = m;
|
|
const int jjup_shared_idx = jju_shared_idx - mb;
|
|
|
|
// Need to compute and accumulate both u and du (mayhaps, we could probably
|
|
// balance some read and compute by reading u each time).
|
|
SNAcomplex u_accum = { 0., 0. };
|
|
SNAcomplex du_accum = { 0., 0. };
|
|
|
|
const double rootpq2 = -rootpqarray(ma, j - mb);
|
|
const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
|
|
caconjxpy(b, u_up2, u_accum);
|
|
|
|
const double rootpq1 = rootpqarray(j - ma, j - mb);
|
|
const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
|
|
caconjxpy(a, u_up1, u_accum);
|
|
|
|
// Next, spin up du_accum
|
|
const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
|
|
caconjxpy(da, u_up1, du_accum);
|
|
caconjxpy(a, du_up1, du_accum);
|
|
|
|
const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
|
|
caconjxpy(db, u_up2, du_accum);
|
|
caconjxpy(b, du_up2, du_accum);
|
|
|
|
// No need to save u_accum to global memory
|
|
// Cache u_accum, du_accum to scratch memory.
|
|
ulist_buf2[jju_shared_idx] = u_accum;
|
|
dulist_buf2[jju_shared_idx] = du_accum;
|
|
|
|
// Directly accumulate deidrj into sum_tmp
|
|
//dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
|
const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
|
sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
|
|
|
|
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
|
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
|
if (j%2==1 && mb+1==n_mb) {
|
|
int sign_factor = (((ma+mb)%2==0)?1:-1);
|
|
//const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
|
|
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
|
|
|
|
if (sign_factor == 1) {
|
|
u_accum.im = -u_accum.im;
|
|
du_accum.im = -du_accum.im;
|
|
} else {
|
|
u_accum.re = -u_accum.re;
|
|
du_accum.re = -du_accum.re;
|
|
}
|
|
|
|
// We don't need the second half of the tile for the deidrj accumulation.
|
|
// That's taken care of by the symmetry factor above.
|
|
//dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
|
|
|
// We do need it for ortho polynomial generation, though
|
|
ulist_buf2[jju_shared_flip] = u_accum;
|
|
dulist_buf2[jju_shared_flip] = du_accum;
|
|
}
|
|
|
|
}, dedr_sum);
|
|
|
|
// swap buffers
|
|
auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
|
|
tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
|
|
|
|
// Accumulate dedr. This "should" be in a single, but
|
|
// a Kokkos::single call implies a warp sync, and we may
|
|
// as well avoid that. This does no harm as long as the
|
|
// final assignment is in a single block.
|
|
//Kokkos::single(Kokkos::PerThread(team), [=]() {
|
|
dedr_full_sum += dedr_sum;
|
|
//});
|
|
}
|
|
|
|
// Store the accumulated dedr.
|
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
|
dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
|
|
});
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute dEidRj, CPU path only.
|
|
------------------------------------------------------------------------- */
|
|
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
|
{
|
|
t_scalar3<double> final_sum;
|
|
|
|
//for(int j = 0; j <= twojmax; j++) {
|
|
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
|
|
[&] (const int& j, t_scalar3<double>& sum_tmp) {
|
|
int jju = idxu_block[j];
|
|
|
|
for(int mb = 0; 2*mb < j; mb++)
|
|
for(int ma = 0; ma <= j; ma++) {
|
|
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im;
|
|
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im;
|
|
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im;
|
|
jju++;
|
|
} //end loop over ma mb
|
|
|
|
// For j even, handle middle column
|
|
|
|
if (j%2 == 0) {
|
|
|
|
int mb = j/2;
|
|
for(int ma = 0; ma < mb; ma++) {
|
|
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im;
|
|
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im;
|
|
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im;
|
|
jju++;
|
|
}
|
|
|
|
//int ma = mb;
|
|
sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im)*0.5;
|
|
sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im)*0.5;
|
|
sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im)*0.5;
|
|
} // end if jeven
|
|
|
|
},final_sum); // end loop over j
|
|
|
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
|
dedr(iatom,jnbor,0) = final_sum.x*2.0;
|
|
dedr(iatom,jnbor,1) = final_sum.y*2.0;
|
|
dedr(iatom,jnbor,2) = final_sum.z*2.0;
|
|
});
|
|
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Bi by summing conj(Ui)*Zi
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
|
|
{
|
|
// for j1 = 0,...,twojmax
|
|
// for j2 = 0,twojmax
|
|
// for j = |j1-j2|,Min(twojmax,j1+j2),2
|
|
// b(j1,j2,j) = 0
|
|
// for mb = 0,...,jmid
|
|
// for ma = 0,...,j
|
|
// b(j1,j2,j) +=
|
|
// 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
|
|
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
|
|
[&] (const int& jjb) {
|
|
//for(int jjb = 0; jjb < idxb_max; jjb++) {
|
|
const int j1 = idxb(jjb,0);
|
|
const int j2 = idxb(jjb,1);
|
|
const int j = idxb(jjb,2);
|
|
|
|
int jjz = idxz_block(j1,j2,j);
|
|
int jju = idxu_block[j];
|
|
double sumzu = 0.0;
|
|
double sumzu_temp = 0.0;
|
|
const int bound = (j+2)/2;
|
|
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
|
|
[&] (const int mbma, double& sum) {
|
|
//for(int mb = 0; 2*mb < j; mb++)
|
|
//for(int ma = 0; ma <= j; ma++) {
|
|
const int ma = mbma%(j+1);
|
|
const int mb = mbma/(j+1);
|
|
const int jju_index = jju+mb*(j+1)+ma;
|
|
const int jjz_index = jjz+mb*(j+1)+ma;
|
|
if (2*mb == j) return;
|
|
sum +=
|
|
ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
|
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im;
|
|
},sumzu_temp); // end loop over ma, mb
|
|
sumzu += sumzu_temp;
|
|
|
|
// For j even, special treatment for middle column
|
|
|
|
if (j%2 == 0) {
|
|
const int mb = j/2;
|
|
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
|
|
[&] (const int ma, double& sum) {
|
|
//for(int ma = 0; ma < mb; ma++) {
|
|
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
|
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
|
sum +=
|
|
ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
|
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im;
|
|
},sumzu_temp); // end loop over ma
|
|
sumzu += sumzu_temp;
|
|
|
|
const int ma = mb;
|
|
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
|
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
|
sumzu += 0.5*
|
|
(ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
|
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im);
|
|
} // end if jeven
|
|
|
|
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
|
sumzu *= 2.0;
|
|
|
|
// apply bzero shift
|
|
|
|
if (bzero_flag)
|
|
sumzu -= bzero[j];
|
|
|
|
blist(jjb,iatom) = sumzu;
|
|
});
|
|
});
|
|
//} // end loop over j
|
|
//} // end loop over j1, j2
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
calculate derivative of Ui w.r.t. atom j
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
|
{
|
|
double rsq, r, x, y, z, z0, theta0, cs, sn;
|
|
double dz0dr;
|
|
|
|
x = rij(iatom,jnbor,0);
|
|
y = rij(iatom,jnbor,1);
|
|
z = rij(iatom,jnbor,2);
|
|
rsq = x * x + y * y + z * z;
|
|
r = sqrt(rsq);
|
|
double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
|
theta0 = (r - rmin0) * rscale0;
|
|
cs = cos(theta0);
|
|
sn = sin(theta0);
|
|
z0 = r * cs / sn;
|
|
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
|
|
|
compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
add Wigner U-functions for one neighbor to the total
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
|
double r, double wj, double rcut)
|
|
{
|
|
const double sfac = compute_sfac(r, rcut) * wj;
|
|
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
|
|
[&] (const int& i) {
|
|
Kokkos::atomic_add(&(ulisttot(i,iatom).re), sfac * ulist(i,iatom,jnbor).re);
|
|
Kokkos::atomic_add(&(ulisttot(i,iatom).im), sfac * ulist(i,iatom,jnbor).im);
|
|
});
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute Wigner U-functions for one neighbor
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
|
double x, double y, double z,
|
|
double z0, double r)
|
|
{
|
|
double r0inv;
|
|
double a_r, b_r, a_i, b_i;
|
|
double rootpq;
|
|
|
|
// compute Cayley-Klein parameters for unit quaternion
|
|
|
|
r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
|
a_r = r0inv * z0;
|
|
a_i = -r0inv * z;
|
|
b_r = r0inv * y;
|
|
b_i = -r0inv * x;
|
|
|
|
// VMK Section 4.8.2
|
|
|
|
ulist(0,iatom,jnbor).re = 1.0;
|
|
ulist(0,iatom,jnbor).im = 0.0;
|
|
|
|
for (int j = 1; j <= twojmax; j++) {
|
|
int jju = idxu_block[j];
|
|
int jjup = idxu_block[j-1];
|
|
|
|
// fill in left side of matrix layer from previous layer
|
|
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
|
[&] (const int& mb) {
|
|
//for (int mb = 0; 2*mb <= j; mb++) {
|
|
const int jju_index = jju+mb+mb*j;
|
|
ulist(jju_index,iatom,jnbor).re = 0.0;
|
|
ulist(jju_index,iatom,jnbor).im = 0.0;
|
|
|
|
for (int ma = 0; ma < j; ma++) {
|
|
const int jju_index = jju+mb+mb*j+ma;
|
|
const int jjup_index = jjup+mb*j+ma;
|
|
rootpq = rootpqarray(j - ma,j - mb);
|
|
ulist(jju_index,iatom,jnbor,jju).re +=
|
|
rootpq *
|
|
(a_r * ulist(jjup_index,iatom,jnbor).re +
|
|
a_i * ulist(jjup_index,iatom,jnbor).im);
|
|
ulist(jju_index,iatom,jnbor).im +=
|
|
rootpq *
|
|
(a_r * ulist(jjup_index,iatom,jnbor).im -
|
|
a_i * ulist(jjup_index,iatom,jnbor).re);
|
|
|
|
rootpq = rootpqarray(ma + 1,j - mb);
|
|
ulist(jju_index+1,iatom,jnbor).re =
|
|
-rootpq *
|
|
(b_r * ulist(jjup_index,iatom,jnbor).re +
|
|
b_i * ulist(jjup_index,iatom,jnbor).im);
|
|
ulist(jju_index+1,iatom,jnbor).im =
|
|
-rootpq *
|
|
(b_r * ulist(jjup_index,iatom,jnbor).im -
|
|
b_i * ulist(jjup_index,iatom,jnbor).re);
|
|
}
|
|
});
|
|
|
|
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
|
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
|
|
|
jju = idxu_block[j];
|
|
jjup = jju+(j+1)*(j+1)-1;
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
|
[&] (const int& mb) {
|
|
// for (int mb = 0; 2*mb <= j; mb++) {
|
|
int mbpar = (mb)%2==0?1:-1;
|
|
int mapar = mbpar;
|
|
for (int ma = 0; ma <= j; ma++) {
|
|
const int jju_index = jju+mb*(j+1)+ma;
|
|
const int jjup_index = jjup-mb*(j+1)-ma;
|
|
if (mapar == 1) {
|
|
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
|
|
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
|
|
} else {
|
|
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
|
|
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
|
|
}
|
|
mapar = -mapar;
|
|
}
|
|
});
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
compute derivatives of Wigner U-functions for one neighbor
|
|
see comments in compute_uarray_cpu()
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
|
double x, double y, double z,
|
|
double z0, double r, double dz0dr,
|
|
double wj, double rcut)
|
|
{
|
|
double r0inv;
|
|
double a_r, a_i, b_r, b_i;
|
|
double da_r[3], da_i[3], db_r[3], db_i[3];
|
|
double dz0[3], dr0inv[3], dr0invdr;
|
|
double rootpq;
|
|
|
|
double rinv = 1.0 / r;
|
|
double ux = x * rinv;
|
|
double uy = y * rinv;
|
|
double uz = z * rinv;
|
|
|
|
r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
|
a_r = z0 * r0inv;
|
|
a_i = -z * r0inv;
|
|
b_r = y * r0inv;
|
|
b_i = -x * r0inv;
|
|
|
|
dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
|
|
|
dr0inv[0] = dr0invdr * ux;
|
|
dr0inv[1] = dr0invdr * uy;
|
|
dr0inv[2] = dr0invdr * uz;
|
|
|
|
dz0[0] = dz0dr * ux;
|
|
dz0[1] = dz0dr * uy;
|
|
dz0[2] = dz0dr * uz;
|
|
|
|
for (int k = 0; k < 3; k++) {
|
|
da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
|
|
da_i[k] = -z * dr0inv[k];
|
|
}
|
|
|
|
da_i[2] += -r0inv;
|
|
|
|
for (int k = 0; k < 3; k++) {
|
|
db_r[k] = y * dr0inv[k];
|
|
db_i[k] = -x * dr0inv[k];
|
|
}
|
|
|
|
db_i[0] += -r0inv;
|
|
db_r[1] += r0inv;
|
|
|
|
dulist(0,iatom,jnbor,0).re = 0.0;
|
|
dulist(0,iatom,jnbor,1).re = 0.0;
|
|
dulist(0,iatom,jnbor,2).re = 0.0;
|
|
dulist(0,iatom,jnbor,0).im = 0.0;
|
|
dulist(0,iatom,jnbor,1).im = 0.0;
|
|
dulist(0,iatom,jnbor,2).im = 0.0;
|
|
|
|
for (int j = 1; j <= twojmax; j++) {
|
|
int jju = idxu_block[j];
|
|
int jjup = idxu_block[j-1];
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
|
[&] (const int& mb) {
|
|
//for (int mb = 0; 2*mb <= j; mb++) {
|
|
const int jju_index = jju+mb+mb*j;
|
|
dulist(jju_index,iatom,jnbor,0).re = 0.0;
|
|
dulist(jju_index,iatom,jnbor,1).re = 0.0;
|
|
dulist(jju_index,iatom,jnbor,2).re = 0.0;
|
|
dulist(jju_index,iatom,jnbor,0).im = 0.0;
|
|
dulist(jju_index,iatom,jnbor,1).im = 0.0;
|
|
dulist(jju_index,iatom,jnbor,2).im = 0.0;
|
|
|
|
for (int ma = 0; ma < j; ma++) {
|
|
const int jju_index = jju+mb+mb*j+ma;
|
|
const int jjup_index = jjup+mb*j+ma;
|
|
rootpq = rootpqarray(j - ma,j - mb);
|
|
for (int k = 0; k < 3; k++) {
|
|
dulist(jju_index,iatom,jnbor,k).re +=
|
|
rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re +
|
|
da_i[k] * ulist(jjup_index,iatom,jnbor).im +
|
|
a_r * dulist(jjup_index,iatom,jnbor,k).re +
|
|
a_i * dulist(jjup_index,iatom,jnbor,k).im);
|
|
dulist(jju_index,iatom,jnbor,k).im +=
|
|
rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im -
|
|
da_i[k] * ulist(jjup_index,iatom,jnbor).re +
|
|
a_r * dulist(jjup_index,iatom,jnbor,k).im -
|
|
a_i * dulist(jjup_index,iatom,jnbor,k).re);
|
|
}
|
|
|
|
rootpq = rootpqarray(ma + 1,j - mb);
|
|
for (int k = 0; k < 3; k++) {
|
|
dulist(jju_index+1,iatom,jnbor,k).re =
|
|
-rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re +
|
|
db_i[k] * ulist(jjup_index,iatom,jnbor).im +
|
|
b_r * dulist(jjup_index,iatom,jnbor,k).re +
|
|
b_i * dulist(jjup_index,iatom,jnbor,k).im);
|
|
dulist(jju_index+1,iatom,jnbor,k).im =
|
|
-rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im -
|
|
db_i[k] * ulist(jjup_index,iatom,jnbor).re +
|
|
b_r * dulist(jjup_index,iatom,jnbor,k).im -
|
|
b_i * dulist(jjup_index,iatom,jnbor,k).re);
|
|
}
|
|
}
|
|
});
|
|
|
|
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
|
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
|
|
|
jju = idxu_block[j];
|
|
jjup = jju+(j+1)*(j+1)-1;
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2),
|
|
[&] (const int& mb) {
|
|
// for (int mb = 0; 2*mb <= j; mb++) {
|
|
int mbpar = (mb)%2==0?1:-1;
|
|
int mapar = mbpar;
|
|
for (int ma = 0; ma <= j; ma++) {
|
|
const int jju_index = jju+mb*(j+1)+ma;
|
|
const int jjup_index = jjup-mb*(j+1)-ma;
|
|
if (mapar == 1) {
|
|
for (int k = 0; k < 3; k++) {
|
|
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
|
|
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
|
|
}
|
|
} else {
|
|
for (int k = 0; k < 3; k++) {
|
|
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
|
|
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
|
|
}
|
|
}
|
|
mapar = -mapar;
|
|
}
|
|
});
|
|
}
|
|
|
|
double sfac = compute_sfac(r, rcut);
|
|
double dsfac = compute_dsfac(r, rcut);
|
|
|
|
sfac *= wj;
|
|
dsfac *= wj;
|
|
|
|
for (int j = 0; j <= twojmax; j++) {
|
|
int jju = idxu_block[j];
|
|
for (int mb = 0; 2*mb <= j; mb++)
|
|
for (int ma = 0; ma <= j; ma++) {
|
|
dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
|
|
sfac * dulist(jju,iatom,jnbor,0).re;
|
|
dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux +
|
|
sfac * dulist(jju,iatom,jnbor,0).im;
|
|
dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy +
|
|
sfac * dulist(jju,iatom,jnbor,1).re;
|
|
dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy +
|
|
sfac * dulist(jju,iatom,jnbor,1).im;
|
|
dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz +
|
|
sfac * dulist(jju,iatom,jnbor,2).re;
|
|
dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz +
|
|
sfac * dulist(jju,iatom,jnbor,2).im;
|
|
|
|
jju++;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
factorial n, wrapper for precomputed table
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
double SNAKokkos<DeviceType>::factorial(int n)
|
|
{
|
|
//if (n < 0 || n > nmaxfactorial) {
|
|
// char str[128];
|
|
// sprintf(str, "Invalid argument to factorial %d", n);
|
|
// error->all(FLERR, str);
|
|
//}
|
|
|
|
return nfac_table[n];
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
factorial n table, size SNA::nmaxfactorial+1
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
const double SNAKokkos<DeviceType>::nfac_table[] = {
|
|
1,
|
|
1,
|
|
2,
|
|
6,
|
|
24,
|
|
120,
|
|
720,
|
|
5040,
|
|
40320,
|
|
362880,
|
|
3628800,
|
|
39916800,
|
|
479001600,
|
|
6227020800,
|
|
87178291200,
|
|
1307674368000,
|
|
20922789888000,
|
|
355687428096000,
|
|
6.402373705728e+15,
|
|
1.21645100408832e+17,
|
|
2.43290200817664e+18,
|
|
5.10909421717094e+19,
|
|
1.12400072777761e+21,
|
|
2.5852016738885e+22,
|
|
6.20448401733239e+23,
|
|
1.5511210043331e+25,
|
|
4.03291461126606e+26,
|
|
1.08888694504184e+28,
|
|
3.04888344611714e+29,
|
|
8.8417619937397e+30,
|
|
2.65252859812191e+32,
|
|
8.22283865417792e+33,
|
|
2.63130836933694e+35,
|
|
8.68331761881189e+36,
|
|
2.95232799039604e+38,
|
|
1.03331479663861e+40,
|
|
3.71993326789901e+41,
|
|
1.37637530912263e+43,
|
|
5.23022617466601e+44,
|
|
2.03978820811974e+46,
|
|
8.15915283247898e+47,
|
|
3.34525266131638e+49,
|
|
1.40500611775288e+51,
|
|
6.04152630633738e+52,
|
|
2.65827157478845e+54,
|
|
1.1962222086548e+56,
|
|
5.50262215981209e+57,
|
|
2.58623241511168e+59,
|
|
1.24139155925361e+61,
|
|
6.08281864034268e+62,
|
|
3.04140932017134e+64,
|
|
1.55111875328738e+66,
|
|
8.06581751709439e+67,
|
|
4.27488328406003e+69,
|
|
2.30843697339241e+71,
|
|
1.26964033536583e+73,
|
|
7.10998587804863e+74,
|
|
4.05269195048772e+76,
|
|
2.35056133128288e+78,
|
|
1.3868311854569e+80,
|
|
8.32098711274139e+81,
|
|
5.07580213877225e+83,
|
|
3.14699732603879e+85,
|
|
1.98260831540444e+87,
|
|
1.26886932185884e+89,
|
|
8.24765059208247e+90,
|
|
5.44344939077443e+92,
|
|
3.64711109181887e+94,
|
|
2.48003554243683e+96,
|
|
1.71122452428141e+98,
|
|
1.19785716699699e+100,
|
|
8.50478588567862e+101,
|
|
6.12344583768861e+103,
|
|
4.47011546151268e+105,
|
|
3.30788544151939e+107,
|
|
2.48091408113954e+109,
|
|
1.88549470166605e+111,
|
|
1.45183092028286e+113,
|
|
1.13242811782063e+115,
|
|
8.94618213078297e+116,
|
|
7.15694570462638e+118,
|
|
5.79712602074737e+120,
|
|
4.75364333701284e+122,
|
|
3.94552396972066e+124,
|
|
3.31424013456535e+126,
|
|
2.81710411438055e+128,
|
|
2.42270953836727e+130,
|
|
2.10775729837953e+132,
|
|
1.85482642257398e+134,
|
|
1.65079551609085e+136,
|
|
1.48571596448176e+138,
|
|
1.3520015276784e+140,
|
|
1.24384140546413e+142,
|
|
1.15677250708164e+144,
|
|
1.08736615665674e+146,
|
|
1.03299784882391e+148,
|
|
9.91677934870949e+149,
|
|
9.61927596824821e+151,
|
|
9.42689044888324e+153,
|
|
9.33262154439441e+155,
|
|
9.33262154439441e+157,
|
|
9.42594775983835e+159,
|
|
9.61446671503512e+161,
|
|
9.90290071648618e+163,
|
|
1.02990167451456e+166,
|
|
1.08139675824029e+168,
|
|
1.14628056373471e+170,
|
|
1.22652020319614e+172,
|
|
1.32464181945183e+174,
|
|
1.44385958320249e+176,
|
|
1.58824554152274e+178,
|
|
1.76295255109024e+180,
|
|
1.97450685722107e+182,
|
|
2.23119274865981e+184,
|
|
2.54355973347219e+186,
|
|
2.92509369349301e+188,
|
|
3.3931086844519e+190,
|
|
3.96993716080872e+192,
|
|
4.68452584975429e+194,
|
|
5.5745857612076e+196,
|
|
6.68950291344912e+198,
|
|
8.09429852527344e+200,
|
|
9.8750442008336e+202,
|
|
1.21463043670253e+205,
|
|
1.50614174151114e+207,
|
|
1.88267717688893e+209,
|
|
2.37217324288005e+211,
|
|
3.01266001845766e+213,
|
|
3.8562048236258e+215,
|
|
4.97450422247729e+217,
|
|
6.46685548922047e+219,
|
|
8.47158069087882e+221,
|
|
1.118248651196e+224,
|
|
1.48727070609069e+226,
|
|
1.99294274616152e+228,
|
|
2.69047270731805e+230,
|
|
3.65904288195255e+232,
|
|
5.01288874827499e+234,
|
|
6.91778647261949e+236,
|
|
9.61572319694109e+238,
|
|
1.34620124757175e+241,
|
|
1.89814375907617e+243,
|
|
2.69536413788816e+245,
|
|
3.85437071718007e+247,
|
|
5.5502938327393e+249,
|
|
8.04792605747199e+251,
|
|
1.17499720439091e+254,
|
|
1.72724589045464e+256,
|
|
2.55632391787286e+258,
|
|
3.80892263763057e+260,
|
|
5.71338395644585e+262,
|
|
8.62720977423323e+264,
|
|
1.31133588568345e+267,
|
|
2.00634390509568e+269,
|
|
3.08976961384735e+271,
|
|
4.78914290146339e+273,
|
|
7.47106292628289e+275,
|
|
1.17295687942641e+278,
|
|
1.85327186949373e+280,
|
|
2.94670227249504e+282,
|
|
4.71472363599206e+284,
|
|
7.59070505394721e+286,
|
|
1.22969421873945e+289,
|
|
2.0044015765453e+291,
|
|
3.28721858553429e+293,
|
|
5.42391066613159e+295,
|
|
9.00369170577843e+297,
|
|
1.503616514865e+300, // nmaxfactorial = 167
|
|
};
|
|
|
|
/* ----------------------------------------------------------------------
|
|
the function delta given by VMK Eq. 8.2(1)
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
double SNAKokkos<DeviceType>::deltacg(int j1, int j2, int j)
|
|
{
|
|
double sfaccg = factorial((j1 + j2 + j) / 2 + 1);
|
|
return sqrt(factorial((j1 + j2 - j) / 2) *
|
|
factorial((j1 - j2 + j) / 2) *
|
|
factorial((-j1 + j2 + j) / 2) / sfaccg);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
assign Clebsch-Gordan coefficients using
|
|
the quasi-binomial formula VMK 8.2.1(3)
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
void SNAKokkos<DeviceType>::init_clebsch_gordan()
|
|
{
|
|
auto h_cglist = Kokkos::create_mirror_view(cglist);
|
|
|
|
double sum,dcg,sfaccg;
|
|
int m, aa2, bb2, cc2;
|
|
int ifac;
|
|
|
|
int idxcg_count = 0;
|
|
for(int j1 = 0; j1 <= twojmax; j1++)
|
|
for(int j2 = 0; j2 <= j1; j2++)
|
|
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
|
|
for (int m1 = 0; m1 <= j1; m1++) {
|
|
aa2 = 2 * m1 - j1;
|
|
|
|
for (int m2 = 0; m2 <= j2; m2++) {
|
|
|
|
// -c <= cc <= c
|
|
|
|
bb2 = 2 * m2 - j2;
|
|
m = (aa2 + bb2 + j) / 2;
|
|
|
|
if(m < 0 || m > j) {
|
|
h_cglist[idxcg_count] = 0.0;
|
|
idxcg_count++;
|
|
continue;
|
|
}
|
|
|
|
sum = 0.0;
|
|
|
|
for (int z = MAX(0, MAX(-(j - j2 + aa2)
|
|
/ 2, -(j - j1 - bb2) / 2));
|
|
z <= MIN((j1 + j2 - j) / 2,
|
|
MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
|
|
z++) {
|
|
ifac = z % 2 ? -1 : 1;
|
|
sum += ifac /
|
|
(factorial(z) *
|
|
factorial((j1 + j2 - j) / 2 - z) *
|
|
factorial((j1 - aa2) / 2 - z) *
|
|
factorial((j2 + bb2) / 2 - z) *
|
|
factorial((j - j2 + aa2) / 2 + z) *
|
|
factorial((j - j1 - bb2) / 2 + z));
|
|
}
|
|
|
|
cc2 = 2 * m - j;
|
|
dcg = deltacg(j1, j2, j);
|
|
sfaccg = sqrt(factorial((j1 + aa2) / 2) *
|
|
factorial((j1 - aa2) / 2) *
|
|
factorial((j2 + bb2) / 2) *
|
|
factorial((j2 - bb2) / 2) *
|
|
factorial((j + cc2) / 2) *
|
|
factorial((j - cc2) / 2) *
|
|
(j + 1));
|
|
|
|
h_cglist[idxcg_count] = sum * dcg * sfaccg;
|
|
idxcg_count++;
|
|
}
|
|
}
|
|
}
|
|
Kokkos::deep_copy(cglist,h_cglist);
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
pre-compute table of sqrt[p/m2], p, q = 1,twojmax
|
|
the p = 0, q = 0 entries are allocated and skipped for convenience.
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
void SNAKokkos<DeviceType>::init_rootpqarray()
|
|
{
|
|
auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray);
|
|
for (int p = 1; p <= twojmax; p++)
|
|
for (int q = 1; q <= twojmax; q++)
|
|
h_rootpqarray(p,q) = sqrt(static_cast<double>(p)/q);
|
|
Kokkos::deep_copy(rootpqarray,h_rootpqarray);
|
|
}
|
|
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
inline
|
|
int SNAKokkos<DeviceType>::compute_ncoeff()
|
|
{
|
|
int ncount;
|
|
|
|
ncount = 0;
|
|
|
|
for (int j1 = 0; j1 <= twojmax; j1++)
|
|
for (int j2 = 0; j2 <= j1; j2++)
|
|
for (int j = j1 - j2;
|
|
j <= MIN(twojmax, j1 + j2); j += 2)
|
|
if (j >= j1) ncount++;
|
|
|
|
return ncount;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
double SNAKokkos<DeviceType>::compute_sfac(double r, double rcut)
|
|
{
|
|
if (switch_flag == 0) return 1.0;
|
|
if (switch_flag == 1) {
|
|
if(r <= rmin0) return 1.0;
|
|
else if(r > rcut) return 0.0;
|
|
else {
|
|
double rcutfac = MY_PI / (rcut - rmin0);
|
|
return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
|
|
}
|
|
}
|
|
return 0.0;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
KOKKOS_INLINE_FUNCTION
|
|
double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
|
|
{
|
|
if (switch_flag == 0) return 0.0;
|
|
if (switch_flag == 1) {
|
|
if(r <= rmin0) return 0.0;
|
|
else if(r > rcut) return 0.0;
|
|
else {
|
|
double rcutfac = MY_PI / (rcut - rmin0);
|
|
return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac;
|
|
}
|
|
}
|
|
return 0.0;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
// efficient complex FMA (i.e., y += a x)
|
|
template<class DeviceType>
|
|
KOKKOS_FORCEINLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
|
|
y.re += a.re * x.re;
|
|
y.re -= a.im * x.im;
|
|
y.im += a.im * x.re;
|
|
y.im += a.re * x.im;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
// efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x)
|
|
template<class DeviceType>
|
|
KOKKOS_FORCEINLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
|
|
y.re += a.re * x.re;
|
|
y.re += a.im * x.im;
|
|
y.im -= a.im * x.re;
|
|
y.im += a.re * x.im;
|
|
}
|
|
|
|
/* ---------------------------------------------------------------------- */
|
|
|
|
// set direction of batched Duidrj
|
|
template<class DeviceType>
|
|
KOKKOS_FORCEINLINE_FUNCTION
|
|
void SNAKokkos<DeviceType>::set_dir(int dir_) {
|
|
dir = dir_;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------------
|
|
memory usage of arrays
|
|
------------------------------------------------------------------------- */
|
|
|
|
template<class DeviceType>
|
|
double SNAKokkos<DeviceType>::memory_usage()
|
|
{
|
|
int jdimpq = twojmax + 2;
|
|
int jdim = twojmax + 1;
|
|
double bytes;
|
|
|
|
bytes = 0;
|
|
|
|
bytes += jdimpq*jdimpq * sizeof(double); // pqarray
|
|
bytes += idxcg_max * sizeof(double); // cglist
|
|
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
if (!std::is_same<DeviceType,Kokkos::Cuda>::value) {
|
|
#endif
|
|
bytes += natom * idxu_max * sizeof(double) * 2; // ulist
|
|
bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
}
|
|
#endif
|
|
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot
|
|
if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
|
|
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr
|
|
|
|
bytes += natom * idxz_max * sizeof(double) * 2; // zlist
|
|
bytes += natom * idxb_max * sizeof(double); // blist
|
|
bytes += natom * idxu_max * sizeof(double) * 2; // ylist
|
|
|
|
bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block
|
|
bytes += jdim * sizeof(int); // idxu_block
|
|
bytes += jdim * jdim * jdim * sizeof(int); // idxz_block
|
|
bytes += jdim * jdim * jdim * sizeof(int); // idxb_block
|
|
|
|
bytes += idxz_max * 10 * sizeof(int); // idxz
|
|
bytes += idxb_max * 3 * sizeof(int); // idxb
|
|
|
|
bytes += jdim * sizeof(double); // bzero
|
|
|
|
bytes += natom * nmax * 3 * sizeof(double); // rij
|
|
bytes += natom * nmax * sizeof(int); // inside
|
|
bytes += natom * nmax * sizeof(double); // wj
|
|
bytes += natom * nmax * sizeof(double); // rcutij
|
|
bytes += natom * nmax * idxu_max * sizeof(double) * 2; // ulist_ij
|
|
|
|
return bytes;
|
|
}
|
|
|
|
} // namespace LAMMPS_NS
|