Merge branch 'master' into lammps_gjf

This commit is contained in:
charlie sievers
2019-10-08 12:21:29 -07:00
4 changed files with 248 additions and 242 deletions

View File

@ -39,6 +39,7 @@ struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{};
struct TagPairSNAPComputeDeidrj{};
@ -73,19 +74,22 @@ public:
void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
void operator() (TagPairSNAPPreUi,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const;
void operator() (TagPairSNAPComputeZi,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const;
void operator() (TagPairSNAPZeroYi,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeYi,const int& ii) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;

View File

@ -184,22 +184,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Experimental::Max<int>(max_neighs));
int vector_length = 1;
int ui_vector_length = 1;
int team_size = 1;
int yi_team_size = 1;
int team_size_max = Kokkos::TeamPolicy<DeviceType>::team_size_max(*this);
#ifdef KOKKOS_ENABLE_CUDA
team_size = 32;//max_neighs;
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
yi_team_size = 256;
if (yi_team_size*vector_length > team_size_max)
yi_team_size = team_size_max/vector_length;
ui_vector_length = 8;
if (team_size*ui_vector_length > team_size_max)
team_size = team_size_max/ui_vector_length;
#endif
if (beta_max < inum) {
@ -227,17 +217,21 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
//PreUi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi> policy_preui(chunk_size,team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPPreUi> policy_preui(0,chunk_size);
Kokkos::parallel_for("PreUi",policy_preui,*this);
//ComputeUi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,ui_vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((inum+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
//Ulisttot transpose
snaKK.transpose_ulisttot();
//Compute bispectrum
if (quadraticflag || eflag) {
//ComputeZi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi> policy_zi(chunk_size,team_size,vector_length);
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeZi> policy_zi(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeZi",policy_zi,*this);
//ComputeBi
@ -250,7 +244,12 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
//ComputeYi
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(chunk_size,yi_team_size,vector_length);
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(0,chunk_size);
Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
//ComputeYi
int idxz_max = snaKK.idxz_max;
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeYi",policy_yi,*this);
//ComputeDuidrj
@ -504,10 +503,9 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const int& ii) const {
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.pre_ui(team,ii);
my_sna.pre_ui(ii);
}
template<class DeviceType>
@ -529,18 +527,23 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeYi>::member_type& team) const {
int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const int& ii) const {
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_yi(team,ii,d_beta);
my_sna.zero_yi(ii);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeZi>::member_type& team) const {
int ii = team.league_rank();
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int& ii) const {
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_zi(team,ii);
my_sna.compute_yi(ii,d_beta);
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int& ii) const {
SNAKokkos<DeviceType> my_sna = snaKK;
my_sna.compute_zi(ii);
}
template<class DeviceType>

View File

@ -26,15 +26,28 @@
namespace LAMMPS_NS {
typedef double SNAreal;
typedef struct { SNAreal re, im; } SNAcomplex;
struct SNAKK_ZINDICES {
int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
//typedef struct { SNAreal re, im; } SNAcomplex;
struct alignas(2*sizeof(SNAreal)) SNAcomplex{
SNAreal re, im;
KOKKOS_INLINE_FUNCTION
SNAcomplex() : re(0),im(0)
{}
KOKKOS_INLINE_FUNCTION
SNAcomplex(SNAreal real_in, SNAreal imag_in)
:re(real_in),im(imag_in)
{}
};
struct SNAKK_BINDICES {
int j1, j2, j;
};
//struct SNAKK_ZINDICES {
// int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
//};
//
//struct SNAKK_BINDICES {
// int j1, j2, j;
//};
template<class DeviceType>
class SNAKokkos {
@ -53,12 +66,32 @@ public:
typedef Kokkos::View<SNAcomplex*, DeviceType> t_sna_1c;
typedef Kokkos::View<SNAcomplex*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
typedef Kokkos::View<SNAcomplex**, DeviceType> t_sna_2c;
typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_cpu;
typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
// Helper class to get ulisttot_r
template<typename DeviceLayout, typename T1, typename T2>
class UlisttotHelper {
public:
inline
static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) {
Kokkos::deep_copy(ulisttot_lr,ulisttot);
}
};
template<typename T1, typename T2>
class UlisttotHelper<Kokkos::LayoutRight,T1,T2> {
public:
inline
static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) {
ulisttot_lr = ulisttot;
}
};
inline
SNAKokkos() {};
KOKKOS_INLINE_FUNCTION
@ -80,17 +113,22 @@ inline
int ncoeff;
inline
void transpose_ulisttot();
// functions for bispectrum coefficients
KOKKOS_INLINE_FUNCTION
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
void pre_ui(const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
void compute_zi(const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int,
void zero_yi(const int&);
KOKKOS_INLINE_FUNCTION
void compute_yi(int,
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_bi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // ForceSNAP
@ -129,23 +167,25 @@ inline
int twojmax, diagonalstyle;
t_sna_2d blist;
t_sna_2c_cpu ulisttot;
t_sna_2c ulisttot;
t_sna_2c_lr ulisttot_lr;
t_sna_2c zlist;
t_sna_3c ulist;
t_sna_2c ylist;
t_sna_2c_lr ylist;
// derivatives of data
t_sna_4c dulist;
int idxcg_max, idxu_max, idxz_max, idxb_max;
private:
double rmin0, rfac0;
//use indexlist instead of loops, constructor generates these
// Same across all SNAKokkos
Kokkos::View<SNAKK_ZINDICES*, DeviceType> idxz;
Kokkos::View<SNAKK_BINDICES*, DeviceType> idxb;
int idxcg_max, idxu_max, idxz_max, idxb_max;
Kokkos::View<int*[10], DeviceType> idxz;
Kokkos::View<int*[3], DeviceType> idxb;
Kokkos::View<int***, DeviceType> idxcg_block;
Kokkos::View<int*, DeviceType> idxu_block;
Kokkos::View<int***, DeviceType> idxz_block;
@ -173,9 +213,9 @@ inline
inline
void init_rootpqarray(); // init()
KOKKOS_INLINE_FUNCTION
void zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int); // compute_ui
void zero_uarraytot(const int&); // compute_ui
KOKKOS_INLINE_FUNCTION
void addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, double); // compute_ui
void addself_uarraytot(const int&, const double&); // compute_ui
KOKKOS_INLINE_FUNCTION
void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui

View File

@ -117,7 +117,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
if (j >= j1) idxb_count++;
idxb_max = idxb_count;
idxb = Kokkos::View<SNAKK_BINDICES*, DeviceType>("SNAKokkos::idxb",idxb_max);
idxb = Kokkos::View<int*[3], DeviceType>("SNAKokkos::idxb",idxb_max);
auto h_idxb = Kokkos::create_mirror_view(idxb);
idxb_count = 0;
@ -125,9 +125,9 @@ void SNAKokkos<DeviceType>::build_indexlist()
for(int j2 = 0; j2 <= j1; j2++)
for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
if (j >= j1) {
h_idxb[idxb_count].j1 = j1;
h_idxb[idxb_count].j2 = j2;
h_idxb[idxb_count].j = j;
h_idxb(idxb_count,0) = j1;
h_idxb(idxb_count,1) = j2;
h_idxb(idxb_count,2) = j;
idxb_count++;
}
Kokkos::deep_copy(idxb,h_idxb);
@ -160,7 +160,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
idxz_count++;
idxz_max = idxz_count;
idxz = Kokkos::View<SNAKK_ZINDICES*, DeviceType>("SNAKokkos::idxz",idxz_max);
idxz = Kokkos::View<int*[10], DeviceType>("SNAKokkos::idxz",idxz_max);
auto h_idxz = Kokkos::create_mirror_view(idxz);
idxz_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxz_block", jdim,jdim,jdim);
@ -178,20 +178,20 @@ void SNAKokkos<DeviceType>::build_indexlist()
for (int mb = 0; 2*mb <= j; mb++)
for (int ma = 0; ma <= j; ma++) {
h_idxz[idxz_count].j1 = j1;
h_idxz[idxz_count].j2 = j2;
h_idxz[idxz_count].j = j;
h_idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
h_idxz[idxz_count].ma2max = (2 * ma - j - (2 * h_idxz[idxz_count].ma1min - j1) + j2) / 2;
h_idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz[idxz_count].ma1min + 1;
h_idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
h_idxz[idxz_count].mb2max = (2 * mb - j - (2 * h_idxz[idxz_count].mb1min - j1) + j2) / 2;
h_idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz[idxz_count].mb1min + 1;
h_idxz(idxz_count,0) = j1;
h_idxz(idxz_count,1) = j2;
h_idxz(idxz_count,2) = j;
h_idxz(idxz_count,3) = MAX(0, (2 * ma - j - j2 + j1) / 2);
h_idxz(idxz_count,4) = (2 * ma - j - (2 * h_idxz(idxz_count,3) - j1) + j2) / 2;
h_idxz(idxz_count,5) = MAX(0, (2 * mb - j - j2 + j1) / 2);
h_idxz(idxz_count,6) = (2 * mb - j - (2 * h_idxz(idxz_count,5) - j1) + j2) / 2;
h_idxz(idxz_count,7) = MIN(j1, (2 * ma - j + j2 + j1) / 2) - h_idxz(idxz_count,3) + 1;
h_idxz(idxz_count,8) = MIN(j1, (2 * mb - j + j2 + j1) / 2) - h_idxz(idxz_count,5) + 1;
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
const int jju = h_idxu_block[j] + (j+1)*mb + ma;
h_idxz[idxz_count].jju = jju;
h_idxz(idxz_count,9) = jju;
idxz_count++;
}
@ -225,11 +225,13 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
blist = t_sna_2d("sna:blist",natom,idxb_max);
ulisttot = t_sna_2c_cpu("sna:ulisttot",natom,idxu_max);
ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max);
zlist = t_sna_2c("sna:zlist",natom,idxz_max);
ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
ylist = t_sna_2c("sna:ylist",natom,idxu_max);
ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
}
@ -240,15 +242,15 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::pre_ui(const int& iatom)
{
if(team.team_rank() == 0) {
zero_uarraytot(team,iatom);
//if(team.team_rank() == 0) {
zero_uarraytot(iatom);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
addself_uarraytot(team,iatom,wself);
addself_uarraytot(iatom,wself);
//});
}
team.team_barrier();
//}
//team.team_barrier();
}
/* ----------------------------------------------------------------------
@ -278,50 +280,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
z0 = r / tan(theta0);
compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor));
//});
}
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnum)
{
double rsq, r, x, y, z, z0, theta0;
// utot(j,ma,mb) = 0 for all j,ma,ma
// utot(j,ma,ma) = 1 for all j,ma
// for j in neighbors of i:
// compute r0 = (x,y,z,z0)
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
if(team.team_rank() == 0) {
zero_uarraytot(team,iatom);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
addself_uarraytot(team,iatom,wself);
//});
}
team.team_barrier();
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,jnum),
[&] (const int& j) {
//for(int j = 0; j < jnum; j++) {
x = rij(iatom,j,0);
y = rij(iatom,j,1);
z = rij(iatom,j,2);
rsq = x * x + y * y + z * z;
r = sqrt(rsq);
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,j) - rmin0);
// theta0 = (r - rmin0) * rscale0;
z0 = r / tan(theta0);
compute_uarray(team, iatom, j, x, y, z, z0, r);
//Kokkos::single(Kokkos::PerThread(team), [&] (){
add_uarraytot(team, iatom, j, r, wj(iatom,j), rcutij(iatom,j));
//});
});
}
/* ----------------------------------------------------------------------
@ -330,54 +289,52 @@ void SNAKokkos<DeviceType>::compute_ui_orig(const typename Kokkos::TeamPolicy<De
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::compute_zi(const int& iter)
{
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
[&] (const int& jjz) {
//for(int jjz = 0; jjz < idxz_max; jjz++) {
const int j1 = idxz[jjz].j1;
const int j2 = idxz[jjz].j2;
const int j = idxz[jjz].j;
const int ma1min = idxz[jjz].ma1min;
const int ma2max = idxz[jjz].ma2max;
const int na = idxz[jjz].na;
const int mb1min = idxz[jjz].mb1min;
const int mb2max = idxz[jjz].mb2max;
const int nb = idxz[jjz].nb;
const int iatom = iter / idxz_max;
const int jjz = iter % idxz_max;
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
const int j1 = idxz(jjz,0);
const int j2 = idxz(jjz,1);
const int j = idxz(jjz,2);
const int ma1min = idxz(jjz,3);
const int ma2max = idxz(jjz,4);
const int mb1min = idxz(jjz,5);
const int mb2max = idxz(jjz,6);
const int na = idxz(jjz,7);
const int nb = idxz(jjz,8);
zlist(iatom,jjz).re = 0.0;
zlist(iatom,jjz).im = 0.0;
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
int icgb = mb1min*(j2+1) + mb2max;
for(int ib = 0; ib < nb; ib++) {
zlist(iatom,jjz).re = 0.0;
zlist(iatom,jjz).im = 0.0;
double suma1_r = 0.0;
double suma1_i = 0.0;
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
int icgb = mb1min*(j2+1) + mb2max;
for(int ib = 0; ib < nb; ib++) {
int ma1 = ma1min;
int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max;
for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
ma1++;
ma2--;
icga += j2;
} // end loop over ia
double suma1_r = 0.0;
double suma1_i = 0.0;
zlist(iatom,jjz).re += cgblock[icgb] * suma1_r;
zlist(iatom,jjz).im += cgblock[icgb] * suma1_i;
int ma1 = ma1min;
int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max;
for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
ma1++;
ma2--;
icga += j2;
} // end loop over ia
jju1 += j1+1;
jju2 -= j2+1;
icgb += j2;
} // end loop over ib
zlist(iatom,jjz).re += cgblock[icgb] * suma1_r;
zlist(iatom,jjz).im += cgblock[icgb] * suma1_i;
}); // end loop over jjz
jju1 += j1+1;
jju2 -= j2+1;
icgb += j2;
} // end loop over ib
}
/* ----------------------------------------------------------------------
@ -386,102 +343,94 @@ void SNAKokkos<DeviceType>::compute_zi(const typename Kokkos::TeamPolicy<DeviceT
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom,
void SNAKokkos<DeviceType>::zero_yi(const int& iatom)
{
for (int j = 0; j < idxu_max; j++)
ylist(iatom,j) = {0.0,0.0};
}
/* ----------------------------------------------------------------------
compute Yi from Ui without storing Zi, looping over zlist indices
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_yi(int iter,
const Kokkos::View<F_FLOAT**, DeviceType> &beta)
{
double betaj;
const int ii = iatom;
const int iatom = iter / idxz_max;
const int jjz = iter % idxz_max;
{
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,ylist.extent(1)),
[&] (const int& i) {
ylist(iatom,i).re = 0.0;
ylist(iatom,i).im = 0.0;
});
}
const int j1 = idxz(jjz,0);
const int j2 = idxz(jjz,1);
const int j = idxz(jjz,2);
const int ma1min = idxz(jjz,3);
const int ma2max = idxz(jjz,4);
const int mb1min = idxz(jjz,5);
const int mb2max = idxz(jjz,6);
const int na = idxz(jjz,7);
const int nb = idxz(jjz,8);
const int jju = idxz(jjz,9);
//int flopsum = 0;
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
//int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxz_max),
[&] (const int& jjz) {
//for(int jjz = 0; jjz < idxz_max; jjz++) {
const int j1 = idxz[jjz].j1;
const int j2 = idxz[jjz].j2;
const int j = idxz[jjz].j;
const int ma1min = idxz[jjz].ma1min;
const int ma2max = idxz[jjz].ma2max;
const int na = idxz[jjz].na;
const int mb1min = idxz[jjz].mb1min;
const int mb2max = idxz[jjz].mb2max;
const int nb = idxz[jjz].nb;
double ztmp_r = 0.0;
double ztmp_i = 0.0;
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
//int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
//int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
int icgb = mb1min*(j2+1) + mb2max;
for(int ib = 0; ib < nb; ib++) {
double ztmp_r = 0.0;
double ztmp_i = 0.0;
double suma1_r = 0.0;
double suma1_i = 0.0;
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
int icgb = mb1min*(j2+1) + mb2max;
for(int ib = 0; ib < nb; ib++) {
int ma1 = ma1min;
int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max;
double suma1_r = 0.0;
double suma1_i = 0.0;
for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).re - ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).im + ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).re);
ma1++;
ma2--;
icga += j2;
} // end loop over ia
int ma1 = ma1min;
int ma2 = ma2max;
int icga = ma1min*(j2+1) + ma2max;
ztmp_r += cgblock[icgb] * suma1_r;
ztmp_i += cgblock[icgb] * suma1_i;
jju1 += j1+1;
jju2 -= j2+1;
icgb += j2;
} // end loop over ib
for(int ia = 0; ia < na; ia++) {
suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
//flopsum += 10;
ma1++;
ma2--;
icga += j2;
} // end loop over ia
ztmp_r += cgblock[icgb] * suma1_r;
ztmp_i += cgblock[icgb] * suma1_i;
jju1 += j1+1;
jju2 -= j2+1;
icgb += j2;
} // end loop over ib
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
// find right y_list[jju] and beta(ii,jjb) entries
// multiply and divide by j+1 factors
// account for multiplicity of 1, 2, or 3
const int jju = idxz[jjz].jju;
// apply to z(j1,j2,j,ma,mb) to unique element of y(j)
// find right y_list[jju] and beta(iatom,jjb) entries
// multiply and divide by j+1 factors
// account for multiplicity of 1, 2, or 3
// pick out right beta value
if (j >= j1) {
const int jjb = idxb_block(j1,j2,j);
if (j1 == j) {
if (j2 == j) betaj = 3*beta(ii,jjb);
else betaj = 2*beta(ii,jjb);
} else betaj = beta(ii,jjb);
} else if (j >= j2) {
const int jjb = idxb_block(j,j2,j1);
if (j2 == j) betaj = 2*beta(ii,jjb)*(j1+1)/(j+1.0);
else betaj = beta(ii,jjb)*(j1+1)/(j+1.0);
} else {
const int jjb = idxb_block(j2,j,j1);
betaj = beta(ii,jjb)*(j1+1)/(j+1.0);
}
if (j >= j1) {
const int jjb = idxb_block(j1,j2,j);
if (j1 == j) {
if (j2 == j) betaj = 3*beta(iatom,jjb);
else betaj = 2*beta(iatom,jjb);
} else betaj = beta(iatom,jjb);
} else if (j >= j2) {
const int jjb = idxb_block(j,j2,j1);
if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0);
else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
} else {
const int jjb = idxb_block(j2,j,j1);
betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
}
Kokkos::single(Kokkos::PerThread(team), [&] () {
Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i);
});
}); // end loop over jjz
//printf("sum %i\n",flopsum);
Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r);
Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i);
}
/* ----------------------------------------------------------------------
@ -556,9 +505,9 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max),
[&] (const int& jjb) {
//for(int jjb = 0; jjb < idxb_max; jjb++) {
const int j1 = idxb[jjb].j1;
const int j2 = idxb[jjb].j2;
const int j = idxb[jjb].j;
const int j1 = idxb(jjb,0);
const int j2 = idxb(jjb,1);
const int j = idxb(jjb,2);
int jjz = idxz_block(j1,j2,j);
int jju = idxu_block[j];
@ -648,14 +597,16 @@ void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<Dev
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
void SNAKokkos<DeviceType>::zero_uarraytot(const int& iatom)
{
{
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
[&] (const int& i) {
//Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
// [&] (const int& i) {
for (int i = 0; i < ulisttot.extent(1); i++) {
ulisttot(iatom,i).re = 0.0;
ulisttot(iatom,i).im = 0.0;
});
}
//});
}
}
@ -663,18 +614,18 @@ void SNAKokkos<DeviceType>::zero_uarraytot(const typename Kokkos::TeamPolicy<Dev
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::addself_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, double wself_in)
void SNAKokkos<DeviceType>::addself_uarraytot(const int& iatom, const double& wself_in)
{
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
[&] (const int& j) {
//for (int j = 0; j <= twojmax; j++)
//Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
// [&] (const int& j) {
for (int j = 0; j <= twojmax; j++) {
int jju = idxu_block[j];
for (int ma = 0; ma <= j; ma++) {
ulisttot(iatom,jju).re = wself_in;
ulisttot(iatom,jju).im = 0.0;
jju += j+2;
}
});
}//});
}
/* ----------------------------------------------------------------------
@ -786,6 +737,12 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
}
}
template<class DeviceType>
void SNAKokkos<DeviceType>::transpose_ulisttot()
{
UlisttotHelper<typename DeviceType::array_layout,decltype(ulisttot_lr),decltype(ulisttot)>::transpose(ulisttot_lr,ulisttot);
}
/* ----------------------------------------------------------------------
compute derivatives of Wigner U-functions for one neighbor
see comments in compute_uarray()
@ -1318,6 +1275,8 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += natom * idxu_max * sizeof(double) * 2; // ulist
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot
if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr
bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist
bytes += natom * idxz_max * sizeof(double) * 2; // zlist
@ -1329,8 +1288,8 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += jdim * jdim * jdim * sizeof(int); // idxz_block
bytes += jdim * jdim * jdim * sizeof(int); // idxb_block
bytes += idxz_max * sizeof(SNAKK_ZINDICES); // idxz
bytes += idxb_max * sizeof(SNAKK_BINDICES); // idxb
bytes += idxz_max * 10 * sizeof(int); // idxz
bytes += idxb_max * 3 * sizeof(int); // idxb
bytes += jdim * sizeof(double); // bzero