Merge remote-tracking branch 'github/master' into collected_small_fixes
This commit is contained in:
@ -182,4 +182,4 @@ Jonsson, Mills, Jacobsen.
|
||||
.. _Guenole:
|
||||
|
||||
**(Guenole)** Guenole, Noehring, Vaid, Houlle, Xie, Prakash, Bitzek,
|
||||
Comput Mater Sci, (2020), in press (arXiv:190802038).
|
||||
Comput Mater Sci, 175, 109584 (2020).
|
||||
|
||||
@ -37,12 +37,16 @@ struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeNeigh{};
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUi{};
|
||||
struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPZeroYi{};
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeDuidrj{};
|
||||
struct TagPairSNAPComputeDuidrjCPU{};
|
||||
struct TagPairSNAPComputeDeidrj{};
|
||||
struct TagPairSNAPComputeDeidrjCPU{};
|
||||
|
||||
template<class DeviceType>
|
||||
class PairSNAPKokkos : public PairSNAP {
|
||||
@ -74,11 +78,17 @@ public:
|
||||
void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPPreUi,const int& ii) const;
|
||||
void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeZi,const int& ii) const;
|
||||
|
||||
@ -86,7 +96,7 @@ public:
|
||||
void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPZeroYi,const int& ii) const;
|
||||
void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeYi,const int& ii) const;
|
||||
@ -94,9 +104,15 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta>::member_type& team) const;
|
||||
|
||||
|
||||
@ -12,7 +12,8 @@
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Christian Trott (SNL), Stan Moore (SNL)
|
||||
Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
|
||||
Evan Weinberg (NVIDIA)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <cmath>
|
||||
@ -29,6 +30,7 @@
|
||||
#include "kokkos.h"
|
||||
#include "sna.h"
|
||||
|
||||
|
||||
#define MAXLINE 1024
|
||||
#define MAXWORD 3
|
||||
|
||||
@ -194,7 +196,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
|
||||
if (beta_max < inum) {
|
||||
beta_max = inum;
|
||||
d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",inum,ncoeff);
|
||||
d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
|
||||
d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
|
||||
}
|
||||
|
||||
@ -205,6 +207,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
|
||||
EV_FLOAT ev;
|
||||
|
||||
int idxu_max = snaKK.idxu_max;
|
||||
|
||||
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
|
||||
|
||||
EV_FLOAT ev_tmp;
|
||||
@ -217,15 +221,62 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
|
||||
|
||||
//PreUi
|
||||
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPPreUi> policy_preui(0,chunk_size);
|
||||
Kokkos::parallel_for("PreUi",policy_preui,*this);
|
||||
{
|
||||
int vector_length = 1;
|
||||
int team_size = 1;
|
||||
if (lmp->kokkos->ngpus != 0) {
|
||||
vector_length = 32;
|
||||
team_size = 32;//max_neighs;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::team_size_max(*this);
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
}
|
||||
typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("PreUi",policy_preui,*this);
|
||||
}
|
||||
|
||||
//ComputeUi
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
|
||||
// ComputeUI
|
||||
if (lmp->kokkos->ngpus == 0) { // CPU
|
||||
// Run a fused calculation of ulist and accumulation into ulisttot using atomics
|
||||
int vector_length = 1;
|
||||
int team_size = 1;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this);
|
||||
} else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics
|
||||
|
||||
// ComputeUi
|
||||
int vector_length = 32;
|
||||
int team_size = 4; // need to cap b/c of shared memory reqs
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::team_size_max(*this);
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
|
||||
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
|
||||
// 2 is for double buffer
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
typedef Kokkos::View< SNAcomplex*,
|
||||
Kokkos::DefaultExecutionSpace::scratch_memory_space,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
|
||||
ScratchViewType;
|
||||
int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
|
||||
|
||||
Kokkos::parallel_for("ComputeUi",policy_ui,*this);
|
||||
|
||||
// ComputeUitot
|
||||
vector_length = 1;
|
||||
team_size = 128;
|
||||
team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
|
||||
}
|
||||
|
||||
//Ulisttot transpose
|
||||
snaKK.transpose_ulisttot();
|
||||
|
||||
//Compute bispectrum
|
||||
if (quadraticflag || eflag) {
|
||||
@ -237,15 +288,28 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
//ComputeBi
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeBi> policy_bi(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeBi",policy_bi,*this);
|
||||
|
||||
}
|
||||
|
||||
//Compute beta = dE_i/dB_i for all i in list
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPBeta> policy_beta(chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeBeta",policy_beta,*this);
|
||||
|
||||
//ComputeYi
|
||||
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(0,chunk_size);
|
||||
Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
|
||||
//ZeroYi
|
||||
{
|
||||
int vector_length = 1;
|
||||
int team_size = 1;
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::team_size_max(*this);
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
team_size = 128;
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
#endif
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi> policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
|
||||
Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this);
|
||||
}
|
||||
|
||||
//ComputeYi
|
||||
int idxz_max = snaKK.idxz_max;
|
||||
@ -253,12 +317,59 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
Kokkos::parallel_for("ComputeYi",policy_yi,*this);
|
||||
|
||||
//ComputeDuidrj
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
|
||||
if (lmp->kokkos->ngpus == 0) { // CPU
|
||||
int vector_length = 1;
|
||||
int team_size = 1;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
snaKK.set_dir(-1); // technically doesn't do anything
|
||||
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
|
||||
} else { // GPU, utilize scratch memory and splitting over dimensions
|
||||
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
|
||||
int vector_length = 32;
|
||||
int team_size = 2; // need to cap b/c of shared memory reqs
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
|
||||
// scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
|
||||
// 2 is for double buffer
|
||||
typedef Kokkos::View< SNAcomplex*,
|
||||
Kokkos::DefaultExecutionSpace::scratch_memory_space,
|
||||
Kokkos::MemoryTraits<Kokkos::Unmanaged> >
|
||||
ScratchViewType;
|
||||
|
||||
int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
|
||||
// Need to call three times, once for each direction
|
||||
for (int k = 0; k < 3; k++) {
|
||||
snaKK.set_dir(k);
|
||||
Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
|
||||
}
|
||||
}
|
||||
|
||||
//ComputeDeidrj
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
|
||||
if (lmp->kokkos->ngpus == 0) { // CPU
|
||||
int vector_length = 1;
|
||||
int team_size = 1;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
|
||||
|
||||
} else { // GPU, different loop strategy internally
|
||||
|
||||
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
|
||||
int vector_length = 32; // coalescing disaster right now, will fix later
|
||||
int team_size = 8;
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
|
||||
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
|
||||
|
||||
Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
|
||||
}
|
||||
|
||||
//ComputeForce
|
||||
if (eflag) {
|
||||
@ -284,6 +395,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
|
||||
}
|
||||
ev += ev_tmp;
|
||||
chunk_offset += chunk_size;
|
||||
|
||||
} // end while
|
||||
|
||||
if (need_dup)
|
||||
@ -341,18 +453,18 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const typename Kokk
|
||||
d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
|
||||
d_beta(ii,icoeff) = d_coeffi[icoeff+1];
|
||||
d_beta(icoeff,ii) = d_coeffi[icoeff+1];
|
||||
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
double bveci = my_sna.blist(ii,icoeff);
|
||||
d_beta(ii,icoeff) += d_coeffi[k]*bveci;
|
||||
double bveci = my_sna.blist(icoeff,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
double bvecj = my_sna.blist(ii,jcoeff);
|
||||
d_beta(ii,icoeff) += d_coeffi[k]*bvecj;
|
||||
d_beta(ii,jcoeff) += d_coeffi[k]*bveci;
|
||||
double bvecj = my_sna.blist(jcoeff,ii);
|
||||
d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
|
||||
d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
@ -503,9 +615,14 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const int& ii) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.pre_ui(ii);
|
||||
|
||||
// Extract the atom number
|
||||
const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
my_sna.pre_ui(team,ii);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
@ -514,8 +631,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
|
||||
((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
@ -528,9 +644,54 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const int& ii) const {
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
my_sna.zero_yi(ii);
|
||||
|
||||
// Extract the quantum number
|
||||
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
|
||||
if (idx >= my_sna.idxu_max) return;
|
||||
|
||||
// Extract the atomic index
|
||||
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the number of neighbors neighbor number
|
||||
const int ninside = d_ninside(ii);
|
||||
|
||||
my_sna.compute_uitot(team,idx,ii,ninside);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPZeroYi>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the quantum number
|
||||
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
|
||||
if (idx >= my_sna.idxu_max) return;
|
||||
|
||||
// Extract the atomic index
|
||||
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
my_sna.zero_yi(idx,ii);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
@ -561,8 +722,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
|
||||
((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
@ -573,14 +733,31 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
|
||||
my_sna.compute_duidrj(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_duidrj_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
|
||||
((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
@ -591,6 +768,23 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const type
|
||||
my_sna.compute_deidrj(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
|
||||
SNAKokkos<DeviceType> my_sna = snaKK;
|
||||
|
||||
// Extract the atom number
|
||||
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// Extract the neighbor number
|
||||
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_deidrj_cpu(team,ii,jj);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
template<int NEIGHFLAG, int EVFLAG>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -658,17 +852,17 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
|
||||
// linear contributions
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++)
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,icoeff);
|
||||
evdwl += d_coeffi[icoeff+1]*my_sna.blist(icoeff,ii);
|
||||
|
||||
// quadratic contributions
|
||||
|
||||
if (quadraticflag) {
|
||||
int k = ncoeff+1;
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
double bveci = my_sna.blist(ii,icoeff);
|
||||
double bveci = my_sna.blist(icoeff,ii);
|
||||
evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
|
||||
for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
|
||||
double bvecj = my_sna.blist(ii,jcoeff);
|
||||
double bvecj = my_sna.blist(jcoeff,ii);
|
||||
evdwl += d_coeffi[k++]*bveci*bvecj;
|
||||
}
|
||||
}
|
||||
|
||||
@ -28,19 +28,53 @@ namespace LAMMPS_NS {
|
||||
typedef double SNAreal;
|
||||
|
||||
//typedef struct { SNAreal re, im; } SNAcomplex;
|
||||
struct alignas(2*sizeof(SNAreal)) SNAcomplex{
|
||||
SNAreal re, im;
|
||||
template <typename real>
|
||||
struct alignas(2*sizeof(real)) SNAComplex
|
||||
{
|
||||
real re,im;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SNAcomplex() : re(0),im(0)
|
||||
{}
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
|
||||
: re(re), im(static_cast<real>(0.)) { ; }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re, real im)
|
||||
: re(re), im(im) { ; }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other)
|
||||
: re(other.re), im(other.im) { ; }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(const SNAComplex& other) {
|
||||
re = other.re; im = other.im;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex(SNAComplex&& other)
|
||||
: re(other.re), im(other.im) { ; }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(SNAComplex&& other) {
|
||||
re = other.re; im = other.im;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator+(SNAComplex const& other) {
|
||||
return SNAComplex(re + other.re, im + other.im);
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator+=(SNAComplex const& other) {
|
||||
re += other.re; im += other.im;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SNAcomplex(SNAreal real_in, SNAreal imag_in)
|
||||
:re(real_in),im(imag_in)
|
||||
{}
|
||||
};
|
||||
|
||||
template <typename real>
|
||||
KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAComplex<real>& self) {
|
||||
return SNAComplex<real>(r*self.re, r*self.im);
|
||||
}
|
||||
|
||||
typedef SNAComplex<SNAreal> SNAcomplex;
|
||||
|
||||
//struct SNAKK_ZINDICES {
|
||||
// int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
|
||||
//};
|
||||
@ -58,6 +92,7 @@ public:
|
||||
typedef Kokkos::View<double*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic;
|
||||
typedef Kokkos::View<int**, DeviceType> t_sna_2i;
|
||||
typedef Kokkos::View<double**, DeviceType> t_sna_2d;
|
||||
typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
|
||||
typedef Kokkos::View<double***, DeviceType> t_sna_3d;
|
||||
typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
|
||||
typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
|
||||
@ -66,32 +101,15 @@ public:
|
||||
typedef Kokkos::View<SNAcomplex*, DeviceType> t_sna_1c;
|
||||
typedef Kokkos::View<SNAcomplex*, DeviceType, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
|
||||
typedef Kokkos::View<SNAcomplex**, DeviceType> t_sna_2c;
|
||||
typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
|
||||
typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
|
||||
typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
|
||||
typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
|
||||
typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
|
||||
typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
|
||||
typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
|
||||
typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
|
||||
|
||||
// Helper class to get ulisttot_r
|
||||
|
||||
template<typename DeviceLayout, typename T1, typename T2>
|
||||
class UlisttotHelper {
|
||||
public:
|
||||
inline
|
||||
static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) {
|
||||
Kokkos::deep_copy(ulisttot_lr,ulisttot);
|
||||
}
|
||||
};
|
||||
|
||||
template<typename T1, typename T2>
|
||||
class UlisttotHelper<Kokkos::LayoutRight,T1,T2> {
|
||||
public:
|
||||
inline
|
||||
static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) {
|
||||
ulisttot_lr = ulisttot;
|
||||
}
|
||||
};
|
||||
|
||||
inline
|
||||
SNAKokkos() {};
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -113,20 +131,21 @@ inline
|
||||
|
||||
int ncoeff;
|
||||
|
||||
inline
|
||||
void transpose_ulisttot();
|
||||
|
||||
// functions for bispectrum coefficients
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void pre_ui(const int&); // ForceSNAP
|
||||
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_zi(const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void zero_yi(const int&);
|
||||
void zero_yi(const int&,const int&); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_yi(int,
|
||||
const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
|
||||
@ -138,12 +157,29 @@ inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double compute_sfac(double, double); // add_uarraytot, compute_duarray
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double compute_dsfac(double, double); // compute_duarray
|
||||
|
||||
// efficient complex FMA
|
||||
// efficient caxpy (i.e., y += a x)
|
||||
static KOKKOS_FORCEINLINE_FUNCTION
|
||||
void caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y);
|
||||
|
||||
// efficient complex FMA, conjugate of scalar
|
||||
static KOKKOS_FORCEINLINE_FUNCTION
|
||||
void caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y);
|
||||
|
||||
// Set the direction for split ComputeDuidrj
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void set_dir(int);
|
||||
|
||||
#ifdef TIMING_INFO
|
||||
double* timers;
|
||||
timespec starttime, endtime;
|
||||
@ -165,17 +201,16 @@ inline
|
||||
void grow_rij(int, int);
|
||||
|
||||
int twojmax, diagonalstyle;
|
||||
|
||||
t_sna_2d_ll blist;
|
||||
t_sna_2c_ll ulisttot;
|
||||
t_sna_2c_ll zlist;
|
||||
|
||||
t_sna_2d blist;
|
||||
t_sna_2c ulisttot;
|
||||
t_sna_2c_lr ulisttot_lr;
|
||||
t_sna_2c zlist;
|
||||
|
||||
t_sna_3c ulist;
|
||||
t_sna_2c_lr ylist;
|
||||
t_sna_3c_ll ulist;
|
||||
t_sna_2c_ll ylist;
|
||||
|
||||
// derivatives of data
|
||||
t_sna_4c dulist;
|
||||
t_sna_4c_ll dulist;
|
||||
|
||||
int idxcg_max, idxu_max, idxz_max, idxb_max;
|
||||
|
||||
@ -212,10 +247,7 @@ inline
|
||||
|
||||
inline
|
||||
void init_rootpqarray(); // init()
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void zero_uarraytot(const int&); // compute_ui
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void addself_uarraytot(const int&, const double&); // compute_ui
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui
|
||||
|
||||
@ -223,6 +255,12 @@ inline
|
||||
void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
|
||||
double, double, double,
|
||||
double, double); // compute_ui
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
|
||||
double, double, double,
|
||||
double, double); // compute_ui_cpu
|
||||
|
||||
|
||||
inline
|
||||
double deltacg(int, int, int); // init_clebsch_gordan
|
||||
|
||||
@ -232,6 +270,10 @@ inline
|
||||
void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
|
||||
double, double, double, // compute_duidrj
|
||||
double, double, double, double, double);
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
|
||||
double, double, double, // compute_duidrj
|
||||
double, double, double, double, double);
|
||||
|
||||
// Sets the style for the switching function
|
||||
// 0 = none
|
||||
@ -243,6 +285,9 @@ inline
|
||||
|
||||
int bzero_flag; // 1 if bzero subtracted from barray
|
||||
Kokkos::View<double*, DeviceType> bzero; // array of B values for isolated atoms
|
||||
|
||||
// for per-direction dulist calculation, specify the direction.
|
||||
int dir;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -224,16 +224,19 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
|
||||
rcutij = t_sna_2d("sna:rcutij",natom,nmax);
|
||||
dedr = t_sna_3d("sna:dedr",natom,nmax,3);
|
||||
|
||||
blist = t_sna_2d("sna:blist",natom,idxb_max);
|
||||
ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
|
||||
if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
|
||||
ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max);
|
||||
zlist = t_sna_2c("sna:zlist",natom,idxz_max);
|
||||
blist = t_sna_2d_ll("sna:blist",idxb_max,natom);
|
||||
//ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max);
|
||||
ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max,natom);
|
||||
|
||||
ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
|
||||
ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
|
||||
zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom);
|
||||
|
||||
dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
|
||||
//ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
|
||||
ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
|
||||
//ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
|
||||
ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom);
|
||||
|
||||
//dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
|
||||
dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -242,19 +245,31 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::pre_ui(const int& iatom)
|
||||
void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom)
|
||||
{
|
||||
//if(team.team_rank() == 0) {
|
||||
zero_uarraytot(iatom);
|
||||
//Kokkos::single(Kokkos::PerThread(team), [&] (){
|
||||
addself_uarraytot(iatom,wself);
|
||||
//});
|
||||
//}
|
||||
//team.team_barrier();
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
const int jju = idxu_block(j);
|
||||
|
||||
// Only diagonal elements get initialized
|
||||
// for (int m = 0; m < (j+1)*(j+1); m++)
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)),
|
||||
[&] (const int m) {
|
||||
|
||||
const int jjup = jju + m;
|
||||
|
||||
// if m is on the "diagonal", initialize it with the self energy.
|
||||
// Otherwise zero it out
|
||||
SNAcomplex init = {0., 0.};
|
||||
if (m % (j+2) == 0) { init = {wself, 0.0}; }
|
||||
|
||||
ulisttot(jjup, iatom) = init;
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Ui by summing over neighbors j
|
||||
compute Ui by summing over bispectrum components
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
@ -280,11 +295,72 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
|
||||
z0 = r / tan(theta0);
|
||||
|
||||
compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
|
||||
|
||||
// if we're on the GPU, accumulating into uarraytot is done in a separate kernel.
|
||||
// if we're not, it's more efficient to include it in compute_uarray.
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
||||
{
|
||||
double rsq, r, x, y, z, z0, theta0;
|
||||
|
||||
// utot(j,ma,mb) = 0 for all j,ma,ma
|
||||
// utot(j,ma,ma) = 1 for all j,ma
|
||||
// for j in neighbors of i:
|
||||
// compute r0 = (x,y,z,z0)
|
||||
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
|
||||
|
||||
x = rij(iatom,jnbor,0);
|
||||
y = rij(iatom,jnbor,1);
|
||||
z = rij(iatom,jnbor,2);
|
||||
rsq = x * x + y * y + z * z;
|
||||
r = sqrt(rsq);
|
||||
|
||||
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
||||
// theta0 = (r - rmin0) * rscale0;
|
||||
z0 = r / tan(theta0);
|
||||
|
||||
compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r);
|
||||
add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor));
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute UiTot by summing over neighbors
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int idx, int iatom, int ninside)
|
||||
{
|
||||
// fuse initialize in, avoid this load?
|
||||
SNAcomplex utot = ulisttot(idx, iatom);
|
||||
for (int jnbor = 0; jnbor < ninside; jnbor++) {
|
||||
|
||||
const auto x = rij(iatom,jnbor,0);
|
||||
const auto y = rij(iatom,jnbor,1);
|
||||
const auto z = rij(iatom,jnbor,2);
|
||||
const auto rsq = x * x + y * y + z * z;
|
||||
const auto r = sqrt(rsq);
|
||||
|
||||
const double wj_local = wj(iatom, jnbor);
|
||||
const double rcut = rcutij(iatom, jnbor);
|
||||
const double sfac = compute_sfac(r, rcut) * wj_local;
|
||||
|
||||
auto ulist_local = ulist(idx, iatom, jnbor);
|
||||
utot.re += sfac * ulist_local.re;
|
||||
utot.im += sfac * ulist_local.im;
|
||||
}
|
||||
|
||||
ulisttot(idx, iatom) = utot;
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute Zi by summing over products of Ui
|
||||
not updated yet
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
@ -306,8 +382,8 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
|
||||
|
||||
const double* cgblock = cglist.data() + idxcg_block(j1,j2,j);
|
||||
|
||||
zlist(iatom,jjz).re = 0.0;
|
||||
zlist(iatom,jjz).im = 0.0;
|
||||
zlist(jjz,iatom).re = 0.0;
|
||||
zlist(jjz,iatom).im = 0.0;
|
||||
|
||||
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
|
||||
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
|
||||
@ -321,15 +397,15 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
|
||||
int ma2 = ma2max;
|
||||
int icga = ma1min*(j2+1) + ma2max;
|
||||
for(int ia = 0; ia < na; ia++) {
|
||||
suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re);
|
||||
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re);
|
||||
ma1++;
|
||||
ma2--;
|
||||
icga += j2;
|
||||
} // end loop over ia
|
||||
|
||||
zlist(iatom,jjz).re += cgblock[icgb] * suma1_r;
|
||||
zlist(iatom,jjz).im += cgblock[icgb] * suma1_i;
|
||||
zlist(jjz,iatom).re += cgblock[icgb] * suma1_r;
|
||||
zlist(jjz,iatom).im += cgblock[icgb] * suma1_i;
|
||||
|
||||
jju1 += j1+1;
|
||||
jju2 -= j2+1;
|
||||
@ -343,10 +419,9 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iter)
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::zero_yi(const int& iatom)
|
||||
void SNAKokkos<DeviceType>::zero_yi(const int& idx, const int& iatom)
|
||||
{
|
||||
for (int j = 0; j < idxu_max; j++)
|
||||
ylist(iatom,j) = {0.0,0.0};
|
||||
ylist(idx,iatom) = {0.0, 0.0};
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -393,8 +468,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
|
||||
int icga = ma1min*(j2+1) + ma2max;
|
||||
|
||||
for(int ia = 0; ia < na; ia++) {
|
||||
suma1_r += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).re - ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).im + ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).re);
|
||||
suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im);
|
||||
suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re);
|
||||
ma1++;
|
||||
ma2--;
|
||||
icga += j2;
|
||||
@ -417,20 +492,20 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
|
||||
if (j >= j1) {
|
||||
const int jjb = idxb_block(j1,j2,j);
|
||||
if (j1 == j) {
|
||||
if (j2 == j) betaj = 3*beta(iatom,jjb);
|
||||
else betaj = 2*beta(iatom,jjb);
|
||||
} else betaj = beta(iatom,jjb);
|
||||
if (j2 == j) betaj = 3*beta(jjb,iatom);
|
||||
else betaj = 2*beta(jjb,iatom);
|
||||
} else betaj = beta(jjb,iatom);
|
||||
} else if (j >= j2) {
|
||||
const int jjb = idxb_block(j,j2,j1);
|
||||
if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0);
|
||||
else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
|
||||
if (j2 == j) betaj = 2*beta(jjb,iatom)*(j1+1)/(j+1.0);
|
||||
else betaj = beta(jjb,iatom)*(j1+1)/(j+1.0);
|
||||
} else {
|
||||
const int jjb = idxb_block(j2,j,j1);
|
||||
betaj = beta(iatom,jjb)*(j1+1)/(j+1.0);
|
||||
betaj = beta(jjb,iatom)*(j1+1)/(j+1.0);
|
||||
}
|
||||
|
||||
Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i);
|
||||
Kokkos::atomic_add(&(ylist(jju,iatom).re), betaj*ztmp_r);
|
||||
Kokkos::atomic_add(&(ylist(jju,iatom).im), betaj*ztmp_i);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -441,18 +516,81 @@ template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
||||
{
|
||||
t_scalar3<double> sum;
|
||||
t_scalar3<double> final_sum;
|
||||
|
||||
// Like in ComputeUi/ComputeDuidrj, regular loop over j.
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_block(j);
|
||||
|
||||
// Flatten loop over ma, mb, reduce w/in
|
||||
|
||||
const int n_ma = j+1;
|
||||
// for (int mb = 0; 2*mb <= j; mb++)
|
||||
const int n_mb = j/2+1;
|
||||
|
||||
const int total_iters = n_ma * n_mb;
|
||||
|
||||
t_scalar3<double> sum;
|
||||
|
||||
//for (int m = 0; m < total_iters; m++) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
|
||||
[&] (const int m, t_scalar3<double>& sum_tmp) {
|
||||
|
||||
// ma fast, mb slow
|
||||
int ma = m % n_ma;
|
||||
int mb = m / n_ma;
|
||||
|
||||
// get index
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
|
||||
// get ylist, rescale last element by 0.5
|
||||
SNAcomplex y_local = ylist(jju_index,iatom);
|
||||
|
||||
const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0);
|
||||
const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1);
|
||||
const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2);
|
||||
|
||||
if (j % 2 == 0 && 2*mb == j) {
|
||||
if (ma == mb) { y_local = 0.5*y_local; }
|
||||
else if (ma > mb) { y_local = { 0., 0. }; }
|
||||
// else the ma < mb gets "double counted", cancelling the 0.5.
|
||||
}
|
||||
|
||||
sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im;
|
||||
sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im;
|
||||
sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im;
|
||||
|
||||
}, sum); // end loop over flattened ma,mb
|
||||
|
||||
final_sum.x += sum.x;
|
||||
final_sum.y += sum.y;
|
||||
final_sum.z += sum.z;
|
||||
}
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
dedr(iatom,jnbor,0) = final_sum.x*2.0;
|
||||
dedr(iatom,jnbor,1) = final_sum.y*2.0;
|
||||
dedr(iatom,jnbor,2) = final_sum.z*2.0;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
||||
{
|
||||
t_scalar3<double> final_sum;
|
||||
|
||||
//for(int j = 0; j <= twojmax; j++) {
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
[&] (const int& j, t_scalar3<double>& sum_tmp) {
|
||||
//for(int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
|
||||
for(int mb = 0; 2*mb < j; mb++)
|
||||
for(int ma = 0; ma <= j; ma++) {
|
||||
sum_tmp.x += dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im;
|
||||
sum_tmp.y += dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im;
|
||||
sum_tmp.z += dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im;
|
||||
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im;
|
||||
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im;
|
||||
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im;
|
||||
jju++;
|
||||
} //end loop over ma mb
|
||||
|
||||
@ -462,25 +600,24 @@ void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<Dev
|
||||
|
||||
int mb = j/2;
|
||||
for(int ma = 0; ma < mb; ma++) {
|
||||
sum_tmp.x += dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im;
|
||||
sum_tmp.y += dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im;
|
||||
sum_tmp.z += dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im;
|
||||
sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im;
|
||||
sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im;
|
||||
sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im;
|
||||
jju++;
|
||||
}
|
||||
|
||||
//int ma = mb;
|
||||
sum_tmp.x += (dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im)*0.5;
|
||||
sum_tmp.y += (dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im)*0.5;
|
||||
sum_tmp.z += (dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im)*0.5;
|
||||
sum_tmp.x += (dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im)*0.5;
|
||||
sum_tmp.y += (dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im)*0.5;
|
||||
sum_tmp.z += (dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im)*0.5;
|
||||
} // end if jeven
|
||||
|
||||
},sum); // end loop over j
|
||||
//}
|
||||
},final_sum); // end loop over j
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
dedr(iatom,jnbor,0) = sum.x*2.0;
|
||||
dedr(iatom,jnbor,1) = sum.y*2.0;
|
||||
dedr(iatom,jnbor,2) = sum.z*2.0;
|
||||
dedr(iatom,jnbor,0) = final_sum.x*2.0;
|
||||
dedr(iatom,jnbor,1) = final_sum.y*2.0;
|
||||
dedr(iatom,jnbor,2) = final_sum.z*2.0;
|
||||
});
|
||||
|
||||
}
|
||||
@ -524,8 +661,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
|
||||
const int jjz_index = jjz+mb*(j+1)+ma;
|
||||
if (2*mb == j) return;
|
||||
sum +=
|
||||
ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
|
||||
ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im;
|
||||
ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
||||
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im;
|
||||
},sumzu_temp); // end loop over ma, mb
|
||||
sumzu += sumzu_temp;
|
||||
|
||||
@ -539,8 +676,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
|
||||
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
||||
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
||||
sum +=
|
||||
ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
|
||||
ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im;
|
||||
ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
||||
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im;
|
||||
},sumzu_temp); // end loop over ma
|
||||
sumzu += sumzu_temp;
|
||||
|
||||
@ -548,8 +685,8 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
|
||||
const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
|
||||
const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
|
||||
sumzu += 0.5*
|
||||
(ulisttot(iatom,jju_index).re * zlist(iatom,jjz_index).re +
|
||||
ulisttot(iatom,jju_index).im * zlist(iatom,jjz_index).im);
|
||||
(ulisttot(jju_index,iatom).re * zlist(jjz_index,iatom).re +
|
||||
ulisttot(jju_index,iatom).im * zlist(jjz_index,iatom).im);
|
||||
} // end if jeven
|
||||
|
||||
Kokkos::single(Kokkos::PerThread(team), [&] () {
|
||||
@ -560,7 +697,7 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
|
||||
if (bzero_flag)
|
||||
sumzu -= bzero[j];
|
||||
|
||||
blist(iatom,jjb) = sumzu;
|
||||
blist(jjb,iatom) = sumzu;
|
||||
});
|
||||
});
|
||||
//} // end loop over j
|
||||
@ -593,39 +730,26 @@ void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<Dev
|
||||
compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::zero_uarraytot(const int& iatom)
|
||||
void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
|
||||
{
|
||||
{
|
||||
//Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
|
||||
// [&] (const int& i) {
|
||||
for (int i = 0; i < ulisttot.extent(1); i++) {
|
||||
ulisttot(iatom,i).re = 0.0;
|
||||
ulisttot(iatom,i).im = 0.0;
|
||||
}
|
||||
//});
|
||||
}
|
||||
}
|
||||
double rsq, r, x, y, z, z0, theta0, cs, sn;
|
||||
double dz0dr;
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
x = rij(iatom,jnbor,0);
|
||||
y = rij(iatom,jnbor,1);
|
||||
z = rij(iatom,jnbor,2);
|
||||
rsq = x * x + y * y + z * z;
|
||||
r = sqrt(rsq);
|
||||
double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
|
||||
theta0 = (r - rmin0) * rscale0;
|
||||
cs = cos(theta0);
|
||||
sn = sin(theta0);
|
||||
z0 = r * cs / sn;
|
||||
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::addself_uarraytot(const int& iatom, const double& wself_in)
|
||||
{
|
||||
//Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
|
||||
// [&] (const int& j) {
|
||||
for (int j = 0; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
ulisttot(iatom,jju).re = wself_in;
|
||||
ulisttot(iatom,jju).im = 0.0;
|
||||
jju += j+2;
|
||||
}
|
||||
}//});
|
||||
compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -639,10 +763,10 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
|
||||
{
|
||||
const double sfac = compute_sfac(r, rcut) * wj;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)),
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(0)),
|
||||
[&] (const int& i) {
|
||||
Kokkos::atomic_add(&(ulisttot(iatom,i).re), sfac * ulist(iatom,jnbor,i).re);
|
||||
Kokkos::atomic_add(&(ulisttot(iatom,i).im), sfac * ulist(iatom,jnbor,i).im);
|
||||
Kokkos::atomic_add(&(ulisttot(i,iatom).re), sfac * ulist(i,iatom,jnbor).re);
|
||||
Kokkos::atomic_add(&(ulisttot(i,iatom).im), sfac * ulist(i,iatom,jnbor).im);
|
||||
});
|
||||
}
|
||||
|
||||
@ -655,6 +779,119 @@ KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
double x, double y, double z,
|
||||
double z0, double r)
|
||||
{
|
||||
// define size of scratch memory buffer
|
||||
const int max_m_tile = (twojmax+1)*(twojmax+1);
|
||||
const int team_rank = team.team_rank();
|
||||
|
||||
// get scratch memory double buffer
|
||||
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
|
||||
// compute Cayley-Klein parameters for unit quaternion,
|
||||
// pack into complex number
|
||||
double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
SNAcomplex a = { r0inv * z0, -r0inv * z };
|
||||
SNAcomplex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
// VMK Section 4.8.2
|
||||
|
||||
// All writes go to global memory and shared memory
|
||||
// so we can avoid all global memory reads
|
||||
Kokkos::single(Kokkos::PerThread(team), [=]() {
|
||||
ulist(0,iatom,jnbor) = { 1.0, 0.0 };
|
||||
buf1[max_m_tile*team_rank] = {1.,0.};
|
||||
});
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
const int jju = idxu_block[j];
|
||||
int jjup = idxu_block[j-1];
|
||||
|
||||
// fill in left side of matrix layer from previous layer
|
||||
|
||||
// Flatten loop over ma, mb, need to figure out total
|
||||
// number of iterations
|
||||
|
||||
// for (int ma = 0; ma <= j; ma++)
|
||||
const int n_ma = j+1;
|
||||
// for (int mb = 0; 2*mb <= j; mb++)
|
||||
const int n_mb = j/2+1;
|
||||
|
||||
const int total_iters = n_ma * n_mb;
|
||||
|
||||
//for (int m = 0; m < total_iters; m++) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
|
||||
[&] (const int m) {
|
||||
|
||||
// ma fast, mb slow
|
||||
int ma = m % n_ma;
|
||||
int mb = m / n_ma;
|
||||
|
||||
// index into global memory array
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
|
||||
// index into shared memory buffer for previous level
|
||||
const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
|
||||
|
||||
// index into shared memory buffer for next level
|
||||
const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
|
||||
|
||||
SNAcomplex u_accum = {0., 0.};
|
||||
|
||||
// VMK recursion relation: grab contribution which is multiplied by a*
|
||||
const double rootpq1 = rootpqarray(j - ma, j - mb);
|
||||
const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
|
||||
caconjxpy(a, u_up1, u_accum);
|
||||
|
||||
// VMK recursion relation: grab contribution which is multiplied by b*
|
||||
const double rootpq2 = -rootpqarray(ma, j - mb);
|
||||
const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
|
||||
caconjxpy(b, u_up2, u_accum);
|
||||
|
||||
ulist(jju_index,iatom,jnbor) = u_accum;
|
||||
|
||||
// We no longer accumulate into ulisttot in this kernel.
|
||||
// Instead, we have a separate kernel which avoids atomics.
|
||||
// Running two separate kernels is net faster.
|
||||
|
||||
// back up into shared memory for next iter
|
||||
if (j != twojmax) buf2[jju_shared_idx] = u_accum;
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
|
||||
// We can avoid this if we're on the last row for an integer j
|
||||
if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) {
|
||||
|
||||
int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
|
||||
const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
|
||||
const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
|
||||
|
||||
if (sign_factor == 1) {
|
||||
u_accum.im = -u_accum.im;
|
||||
} else {
|
||||
u_accum.re = -u_accum.re;
|
||||
}
|
||||
ulist(jjup_flip,iatom,jnbor) = u_accum;
|
||||
if (j != twojmax) buf2[jju_shared_flip] = u_accum;
|
||||
}
|
||||
});
|
||||
// In CUDA backend,
|
||||
// ThreadVectorRange has a __syncwarp (appropriately masked for
|
||||
// vector lengths < 32) implict at the end
|
||||
|
||||
// swap double buffers
|
||||
auto tmp = buf1; buf1 = buf2; buf2 = tmp;
|
||||
//std::swap(buf1, buf2); // throws warnings
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// CPU version
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
double x, double y, double z,
|
||||
double z0, double r)
|
||||
{
|
||||
double r0inv;
|
||||
double a_r, b_r, a_i, b_i;
|
||||
@ -670,8 +907,8 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
|
||||
|
||||
// VMK Section 4.8.2
|
||||
|
||||
ulist(iatom,jnbor,0).re = 1.0;
|
||||
ulist(iatom,jnbor,0).im = 0.0;
|
||||
ulist(0,iatom,jnbor).re = 1.0;
|
||||
ulist(0,iatom,jnbor).im = 0.0;
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
@ -683,31 +920,31 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
|
||||
[&] (const int& mb) {
|
||||
//for (int mb = 0; 2*mb <= j; mb++) {
|
||||
const int jju_index = jju+mb+mb*j;
|
||||
ulist(iatom,jnbor,jju_index).re = 0.0;
|
||||
ulist(iatom,jnbor,jju_index).im = 0.0;
|
||||
ulist(jju_index,iatom,jnbor).re = 0.0;
|
||||
ulist(jju_index,iatom,jnbor).im = 0.0;
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
const int jjup_index = jjup+mb*j+ma;
|
||||
rootpq = rootpqarray(j - ma,j - mb);
|
||||
ulist(iatom,jnbor,jju_index).re +=
|
||||
ulist(jju_index,iatom,jnbor,jju).re +=
|
||||
rootpq *
|
||||
(a_r * ulist(iatom,jnbor,jjup_index).re +
|
||||
a_i * ulist(iatom,jnbor,jjup_index).im);
|
||||
ulist(iatom,jnbor,jju_index).im +=
|
||||
(a_r * ulist(jjup_index,iatom,jnbor).re +
|
||||
a_i * ulist(jjup_index,iatom,jnbor).im);
|
||||
ulist(jju_index,iatom,jnbor).im +=
|
||||
rootpq *
|
||||
(a_r * ulist(iatom,jnbor,jjup_index).im -
|
||||
a_i * ulist(iatom,jnbor,jjup_index).re);
|
||||
(a_r * ulist(jjup_index,iatom,jnbor).im -
|
||||
a_i * ulist(jjup_index,iatom,jnbor).re);
|
||||
|
||||
rootpq = rootpqarray(ma + 1,j - mb);
|
||||
ulist(iatom,jnbor,jju_index+1).re =
|
||||
ulist(jju_index+1,iatom,jnbor).re =
|
||||
-rootpq *
|
||||
(b_r * ulist(iatom,jnbor,jjup_index).re +
|
||||
b_i * ulist(iatom,jnbor,jjup_index).im);
|
||||
ulist(iatom,jnbor,jju_index+1).im =
|
||||
(b_r * ulist(jjup_index,iatom,jnbor).re +
|
||||
b_i * ulist(jjup_index,iatom,jnbor).im);
|
||||
ulist(jju_index+1,iatom,jnbor).im =
|
||||
-rootpq *
|
||||
(b_r * ulist(iatom,jnbor,jjup_index).im -
|
||||
b_i * ulist(iatom,jnbor,jjup_index).re);
|
||||
(b_r * ulist(jjup_index,iatom,jnbor).im -
|
||||
b_i * ulist(jjup_index,iatom,jnbor).re);
|
||||
}
|
||||
});
|
||||
|
||||
@ -725,11 +962,11 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
|
||||
const int jju_index = jju+mb*(j+1)+ma;
|
||||
const int jjup_index = jjup-mb*(j+1)-ma;
|
||||
if (mapar == 1) {
|
||||
ulist(iatom,jnbor,jjup_index).re = ulist(iatom,jnbor,jju_index).re;
|
||||
ulist(iatom,jnbor,jjup_index).im = -ulist(iatom,jnbor,jju_index).im;
|
||||
ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im;
|
||||
} else {
|
||||
ulist(iatom,jnbor,jjup_index).re = -ulist(iatom,jnbor,jju_index).re;
|
||||
ulist(iatom,jnbor,jjup_index).im = ulist(iatom,jnbor,jju_index).im;
|
||||
ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re;
|
||||
ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im;
|
||||
}
|
||||
mapar = -mapar;
|
||||
}
|
||||
@ -737,12 +974,6 @@ void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<Dev
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
void SNAKokkos<DeviceType>::transpose_ulisttot()
|
||||
{
|
||||
UlisttotHelper<typename DeviceType::array_layout,decltype(ulisttot_lr),decltype(ulisttot)>::transpose(ulisttot_lr,ulisttot);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
compute derivatives of Wigner U-functions for one neighbor
|
||||
see comments in compute_uarray()
|
||||
@ -755,7 +986,150 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
double z0, double r, double dz0dr,
|
||||
double wj, double rcut)
|
||||
{
|
||||
double r0inv;
|
||||
|
||||
// get shared memory offset
|
||||
const int max_m_tile = (twojmax+1)*(twojmax+1);
|
||||
const int team_rank = team.team_rank();
|
||||
|
||||
// double buffer for ulist
|
||||
SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
|
||||
// double buffer for dulist
|
||||
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
|
||||
|
||||
const double sfac = wj * compute_sfac(r, rcut);
|
||||
const double dsfac = wj * compute_dsfac(r, rcut);
|
||||
|
||||
const double rinv = 1.0 / r;
|
||||
|
||||
// extract a single unit vector
|
||||
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
|
||||
|
||||
// Compute Cayley-Klein parameters for unit quaternion
|
||||
|
||||
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
|
||||
|
||||
const SNAcomplex a = { r0inv * z0, -r0inv * z };
|
||||
const SNAcomplex b = { r0inv * y, -r0inv * x };
|
||||
|
||||
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
const double dr0inv = dr0invdr * u;
|
||||
const double dz0 = dz0dr * u;
|
||||
|
||||
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
|
||||
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
|
||||
|
||||
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
|
||||
-x * dr0inv + (dir==0?-r0inv:0.) };
|
||||
|
||||
// single has a warp barrier at the end
|
||||
Kokkos::single(Kokkos::PerThread(team), [=]() {
|
||||
dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
|
||||
ulist_buf1[max_m_tile*team_rank] = {1., 0.};
|
||||
dulist_buf1[max_m_tile*team_rank] = {0., 0.};
|
||||
});
|
||||
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
int jjup = idxu_block[j-1];
|
||||
|
||||
// flatten the loop over ma,mb
|
||||
|
||||
// for (int ma = 0; ma <= j; ma++)
|
||||
const int n_ma = j+1;
|
||||
// for (int mb = 0; 2*mb <= j; mb++)
|
||||
const int n_mb = j/2+1;
|
||||
|
||||
const int total_iters = n_ma * n_mb;
|
||||
|
||||
//for (int m = 0; m < total_iters; m++) {
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
|
||||
[&] (const int m) {
|
||||
|
||||
// ma fast, mb slow
|
||||
int ma = m % n_ma;
|
||||
int mb = m / n_ma;
|
||||
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
|
||||
// index into shared memory
|
||||
const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
|
||||
const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
|
||||
|
||||
// Need to compute and accumulate both u and du (mayhaps, we could probably
|
||||
// balance some read and compute by reading u each time).
|
||||
SNAcomplex u_accum = { 0., 0. };
|
||||
SNAcomplex du_accum = { 0., 0. };
|
||||
|
||||
const double rootpq1 = rootpqarray(j - ma, j - mb);
|
||||
const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
|
||||
caconjxpy(a, u_up1, u_accum);
|
||||
|
||||
const double rootpq2 = -rootpqarray(ma, j - mb);
|
||||
const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
|
||||
caconjxpy(b, u_up2, u_accum);
|
||||
|
||||
// No need to save u_accum to global memory
|
||||
if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum;
|
||||
|
||||
// Next, spin up du_accum
|
||||
const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
|
||||
caconjxpy(da, u_up1, du_accum);
|
||||
caconjxpy(a, du_up1, du_accum);
|
||||
|
||||
const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
|
||||
caconjxpy(db, u_up2, du_accum);
|
||||
caconjxpy(b, du_up2, du_accum);
|
||||
|
||||
dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
||||
|
||||
if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum;
|
||||
|
||||
// copy left side to right side with inversion symmetry VMK 4.4(2)
|
||||
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
|
||||
|
||||
int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
|
||||
const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
|
||||
const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
|
||||
|
||||
if (sign_factor == 1) {
|
||||
//ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re;
|
||||
//ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im;
|
||||
u_accum.im = -u_accum.im;
|
||||
du_accum.im = -du_accum.im;
|
||||
} else {
|
||||
//ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re;
|
||||
//ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im;
|
||||
u_accum.re = -u_accum.re;
|
||||
du_accum.re = -du_accum.re;
|
||||
}
|
||||
|
||||
dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
|
||||
|
||||
if (j != twojmax) {
|
||||
ulist_buf2[jju_shared_flip] = u_accum;
|
||||
dulist_buf2[jju_shared_flip] = du_accum;
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
// swap buffers
|
||||
auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
|
||||
tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
|
||||
double x, double y, double z,
|
||||
double z0, double r, double dz0dr,
|
||||
double wj, double rcut)
|
||||
{
|
||||
double r0inv;
|
||||
double a_r, a_i, b_r, b_i;
|
||||
double da_r[3], da_i[3], db_r[3], db_i[3];
|
||||
double dz0[3], dr0inv[3], dr0invdr;
|
||||
@ -772,7 +1146,7 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
b_r = y * r0inv;
|
||||
b_i = -x * r0inv;
|
||||
|
||||
dr0invdr = -pow(r0inv, 3.0) * (r + z0 * dz0dr);
|
||||
dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
|
||||
|
||||
dr0inv[0] = dr0invdr * ux;
|
||||
dr0inv[1] = dr0invdr * uy;
|
||||
@ -797,12 +1171,12 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
db_i[0] += -r0inv;
|
||||
db_r[1] += r0inv;
|
||||
|
||||
dulist(iatom,jnbor,0,0).re = 0.0;
|
||||
dulist(iatom,jnbor,0,1).re = 0.0;
|
||||
dulist(iatom,jnbor,0,2).re = 0.0;
|
||||
dulist(iatom,jnbor,0,0).im = 0.0;
|
||||
dulist(iatom,jnbor,0,1).im = 0.0;
|
||||
dulist(iatom,jnbor,0,2).im = 0.0;
|
||||
dulist(0,iatom,jnbor,0).re = 0.0;
|
||||
dulist(0,iatom,jnbor,1).re = 0.0;
|
||||
dulist(0,iatom,jnbor,2).re = 0.0;
|
||||
dulist(0,iatom,jnbor,0).im = 0.0;
|
||||
dulist(0,iatom,jnbor,1).im = 0.0;
|
||||
dulist(0,iatom,jnbor,2).im = 0.0;
|
||||
|
||||
for (int j = 1; j <= twojmax; j++) {
|
||||
int jju = idxu_block[j];
|
||||
@ -811,42 +1185,42 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
[&] (const int& mb) {
|
||||
//for (int mb = 0; 2*mb <= j; mb++) {
|
||||
const int jju_index = jju+mb+mb*j;
|
||||
dulist(iatom,jnbor,jju_index,0).re = 0.0;
|
||||
dulist(iatom,jnbor,jju_index,1).re = 0.0;
|
||||
dulist(iatom,jnbor,jju_index,2).re = 0.0;
|
||||
dulist(iatom,jnbor,jju_index,0).im = 0.0;
|
||||
dulist(iatom,jnbor,jju_index,1).im = 0.0;
|
||||
dulist(iatom,jnbor,jju_index,2).im = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,0).re = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,1).re = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,2).re = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,0).im = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,1).im = 0.0;
|
||||
dulist(jju_index,iatom,jnbor,2).im = 0.0;
|
||||
|
||||
for (int ma = 0; ma < j; ma++) {
|
||||
const int jju_index = jju+mb+mb*j+ma;
|
||||
const int jjup_index = jjup+mb*j+ma;
|
||||
rootpq = rootpqarray(j - ma,j - mb);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(iatom,jnbor,jju_index,k).re +=
|
||||
rootpq * (da_r[k] * ulist(iatom,jnbor,jjup_index).re +
|
||||
da_i[k] * ulist(iatom,jnbor,jjup_index).im +
|
||||
a_r * dulist(iatom,jnbor,jjup_index,k).re +
|
||||
a_i * dulist(iatom,jnbor,jjup_index,k).im);
|
||||
dulist(iatom,jnbor,jju_index,k).im +=
|
||||
rootpq * (da_r[k] * ulist(iatom,jnbor,jjup_index).im -
|
||||
da_i[k] * ulist(iatom,jnbor,jjup_index).re +
|
||||
a_r * dulist(iatom,jnbor,jjup_index,k).im -
|
||||
a_i * dulist(iatom,jnbor,jjup_index,k).re);
|
||||
dulist(jju_index,iatom,jnbor,k).re +=
|
||||
rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re +
|
||||
da_i[k] * ulist(jjup_index,iatom,jnbor).im +
|
||||
a_r * dulist(jjup_index,iatom,jnbor,k).re +
|
||||
a_i * dulist(jjup_index,iatom,jnbor,k).im);
|
||||
dulist(jju_index,iatom,jnbor,k).im +=
|
||||
rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im -
|
||||
da_i[k] * ulist(jjup_index,iatom,jnbor).re +
|
||||
a_r * dulist(jjup_index,iatom,jnbor,k).im -
|
||||
a_i * dulist(jjup_index,iatom,jnbor,k).re);
|
||||
}
|
||||
|
||||
rootpq = rootpqarray(ma + 1,j - mb);
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(iatom,jnbor,jju_index+1,k).re =
|
||||
-rootpq * (db_r[k] * ulist(iatom,jnbor,jjup_index).re +
|
||||
db_i[k] * ulist(iatom,jnbor,jjup_index).im +
|
||||
b_r * dulist(iatom,jnbor,jjup_index,k).re +
|
||||
b_i * dulist(iatom,jnbor,jjup_index,k).im);
|
||||
dulist(iatom,jnbor,jju_index+1,k).im =
|
||||
-rootpq * (db_r[k] * ulist(iatom,jnbor,jjup_index).im -
|
||||
db_i[k] * ulist(iatom,jnbor,jjup_index).re +
|
||||
b_r * dulist(iatom,jnbor,jjup_index,k).im -
|
||||
b_i * dulist(iatom,jnbor,jjup_index,k).re);
|
||||
dulist(jju_index+1,iatom,jnbor,k).re =
|
||||
-rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re +
|
||||
db_i[k] * ulist(jjup_index,iatom,jnbor).im +
|
||||
b_r * dulist(jjup_index,iatom,jnbor,k).re +
|
||||
b_i * dulist(jjup_index,iatom,jnbor,k).im);
|
||||
dulist(jju_index+1,iatom,jnbor,k).im =
|
||||
-rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im -
|
||||
db_i[k] * ulist(jjup_index,iatom,jnbor).re +
|
||||
b_r * dulist(jjup_index,iatom,jnbor,k).im -
|
||||
b_i * dulist(jjup_index,iatom,jnbor,k).re);
|
||||
}
|
||||
}
|
||||
});
|
||||
@ -866,13 +1240,13 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
const int jjup_index = jjup-mb*(j+1)-ma;
|
||||
if (mapar == 1) {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(iatom,jnbor,jjup_index,k).re = dulist(iatom,jnbor,jju_index,k).re;
|
||||
dulist(iatom,jnbor,jjup_index,k).im = -dulist(iatom,jnbor,jju_index,k).im;
|
||||
dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im;
|
||||
}
|
||||
} else {
|
||||
for (int k = 0; k < 3; k++) {
|
||||
dulist(iatom,jnbor,jjup_index,k).re = -dulist(iatom,jnbor,jju_index,k).re;
|
||||
dulist(iatom,jnbor,jjup_index,k).im = dulist(iatom,jnbor,jju_index,k).im;
|
||||
dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re;
|
||||
dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im;
|
||||
}
|
||||
}
|
||||
mapar = -mapar;
|
||||
@ -890,18 +1264,18 @@ void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<De
|
||||
int jju = idxu_block[j];
|
||||
for (int mb = 0; 2*mb <= j; mb++)
|
||||
for (int ma = 0; ma <= j; ma++) {
|
||||
dulist(iatom,jnbor,jju,0).re = dsfac * ulist(iatom,jnbor,jju).re * ux +
|
||||
sfac * dulist(iatom,jnbor,jju,0).re;
|
||||
dulist(iatom,jnbor,jju,0).im = dsfac * ulist(iatom,jnbor,jju).im * ux +
|
||||
sfac * dulist(iatom,jnbor,jju,0).im;
|
||||
dulist(iatom,jnbor,jju,1).re = dsfac * ulist(iatom,jnbor,jju).re * uy +
|
||||
sfac * dulist(iatom,jnbor,jju,1).re;
|
||||
dulist(iatom,jnbor,jju,1).im = dsfac * ulist(iatom,jnbor,jju).im * uy +
|
||||
sfac * dulist(iatom,jnbor,jju,1).im;
|
||||
dulist(iatom,jnbor,jju,2).re = dsfac * ulist(iatom,jnbor,jju).re * uz +
|
||||
sfac * dulist(iatom,jnbor,jju,2).re;
|
||||
dulist(iatom,jnbor,jju,2).im = dsfac * ulist(iatom,jnbor,jju).im * uz +
|
||||
sfac * dulist(iatom,jnbor,jju,2).im;
|
||||
dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux +
|
||||
sfac * dulist(jju,iatom,jnbor,0).re;
|
||||
dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux +
|
||||
sfac * dulist(jju,iatom,jnbor,0).im;
|
||||
dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy +
|
||||
sfac * dulist(jju,iatom,jnbor,1).re;
|
||||
dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy +
|
||||
sfac * dulist(jju,iatom,jnbor,1).im;
|
||||
dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz +
|
||||
sfac * dulist(jju,iatom,jnbor,2).re;
|
||||
dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz +
|
||||
sfac * dulist(jju,iatom,jnbor,2).im;
|
||||
|
||||
jju++;
|
||||
}
|
||||
@ -1257,6 +1631,39 @@ double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// efficient complex FMA (i.e., y += a x)
|
||||
template<class DeviceType>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
|
||||
y.re += a.re * x.re;
|
||||
y.re -= a.im * x.im;
|
||||
y.im += a.im * x.re;
|
||||
y.im += a.re * x.im;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x)
|
||||
template<class DeviceType>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
|
||||
y.re += a.re * x.re;
|
||||
y.re += a.im * x.im;
|
||||
y.im -= a.im * x.re;
|
||||
y.im += a.re * x.im;
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
// set direction of batched Duidrj
|
||||
template<class DeviceType>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void SNAKokkos<DeviceType>::set_dir(int dir_) {
|
||||
dir = dir_;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
memory usage of arrays
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
@ -87,6 +87,7 @@ void MinFire::setup_style()
|
||||
|
||||
for (int i = 0; i < nlocal; i++)
|
||||
v[i][0] = v[i][1] = v[i][2] = 0.0;
|
||||
flagv0 = 1;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
@ -259,14 +260,40 @@ int MinFire::iterate(int maxiter)
|
||||
|
||||
if (halfstepback_flag) {
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
x[i][0] -= 0.5 * dtv * v[i][0];
|
||||
x[i][1] -= 0.5 * dtv * v[i][1];
|
||||
x[i][2] -= 0.5 * dtv * v[i][2];
|
||||
x[i][0] -= 0.5 * dt * v[i][0];
|
||||
x[i][1] -= 0.5 * dt * v[i][1];
|
||||
x[i][2] -= 0.5 * dt * v[i][2];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < nlocal; i++)
|
||||
v[i][0] = v[i][1] = v[i][2] = 0.0;
|
||||
flagv0 = 1;
|
||||
}
|
||||
|
||||
// evaluates velocties to estimate wether dtv has to be limited
|
||||
// required when v have been reset
|
||||
|
||||
if (flagv0) {
|
||||
dtf = dt * force->ftm2v;
|
||||
energy_force(0);
|
||||
neval++;
|
||||
|
||||
if (rmass) {
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
dtfm = dtf / rmass[i];
|
||||
v[i][0] = dtfm * f[i][0];
|
||||
v[i][1] = dtfm * f[i][1];
|
||||
v[i][2] = dtfm * f[i][2];
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < nlocal; i++) {
|
||||
dtfm = dtf / mass[type[i]];
|
||||
v[i][0] = dtfm * f[i][0];
|
||||
v[i][1] = dtfm * f[i][1];
|
||||
v[i][2] = dtfm * f[i][2];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// limit timestep so no particle moves further than dmax
|
||||
@ -281,6 +308,13 @@ int MinFire::iterate(int maxiter)
|
||||
|
||||
MPI_Allreduce(&dtvone,&dtv,1,MPI_DOUBLE,MPI_MIN,world);
|
||||
|
||||
// reset velocities when necessary
|
||||
|
||||
if (flagv0) {
|
||||
for (int i = 0; i < nlocal; i++)
|
||||
v[i][0] = v[i][1] = v[i][2] = 0.0;
|
||||
}
|
||||
|
||||
// min dtv over replicas, if necessary
|
||||
// this communicator would be invalid for multiprocess replicas
|
||||
|
||||
@ -438,6 +472,10 @@ int MinFire::iterate(int maxiter)
|
||||
neval++;
|
||||
}
|
||||
|
||||
// velocities have been evaluated
|
||||
|
||||
flagv0 = 0;
|
||||
|
||||
// energy tolerance criterion
|
||||
// only check after delaystep elapsed since velocties reset to 0
|
||||
// sync across replicas if running multi-replica minimization
|
||||
|
||||
@ -37,7 +37,7 @@ class MinFire : public Min {
|
||||
double dt,dtmax,dtmin;
|
||||
double alpha;
|
||||
bigint last_negative,ntimestep_start;
|
||||
int vdotf_negatif;
|
||||
int vdotf_negatif,flagv0;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user