Files
lammps/src/KOKKOS/pair_pace_kokkos.cpp
2022-05-12 15:25:07 -06:00

1722 lines
58 KiB
C++

// clang-format off
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
aE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: Stan Moore (SNL)
------------------------------------------------------------------------- */
#include "pair_pace_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "math_const.h"
#include "memory_kokkos.h"
#include "neighbor_kokkos.h"
#include "neigh_request.h"
#include "ace_c_basis.h"
#include "ace_evaluator.h"
#include "ace_recursive.h"
#include "ace_version.h"
#include "ace_radial.h"
#include <cstring>
namespace LAMMPS_NS {
struct ACEImpl {
ACEImpl() : basis_set(nullptr), ace(nullptr) {}
~ACEImpl()
{
delete basis_set;
delete ace;
}
ACECTildeBasisSet *basis_set;
ACERecursiveEvaluator *ace;
};
} // namespace LAMMPS_NS
using namespace LAMMPS_NS;
using namespace MathConst;
enum{FS,FS_SHIFTEDSCALED};
/* ---------------------------------------------------------------------- */
template<class DeviceType>
PairPACEKokkos<DeviceType>::PairPACEKokkos(LAMMPS *lmp) : PairPACE(lmp)
{
respa_enable = 0;
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
host_flag = (execution_space == Host);
}
/* ----------------------------------------------------------------------
check if allocated, since class can be destructed when incomplete
------------------------------------------------------------------------- */
template<class DeviceType>
PairPACEKokkos<DeviceType>::~PairPACEKokkos()
{
if (copymode) return;
memoryKK->destroy_kokkos(k_eatom,eatom);
memoryKK->destroy_kokkos(k_vatom,vatom);
// deallocate views of views in serial to prevent issues in Kokkos tools
for (int i = 0; i < nelements; i++) {
for (int j = 0; j < nelements; j++) {
k_splines_gk.h_view(i, j).deallocate();
k_splines_rnl.h_view(i, j).deallocate();
k_splines_hc.h_view(i, j).deallocate();
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::grow(int natom, int maxneigh)
{
auto basis_set = aceimpl->basis_set;
if (A.extent(0) < natom) {
memoryKK->realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
memoryKK->realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax);
//size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
memoryKK->realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1);
memoryKK->realloc_kokkos(e_atom, "pace:e_atom", natom);
memoryKK->realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
memoryKK->realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
memoryKK->realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase);
// hard-core repulsion
memoryKK->realloc_kokkos(rho_core, "pace:rho_core", natom);
memoryKK->realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom);
memoryKK->realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax);
}
if (ylm.extent(0) < natom || ylm.extent(1) < maxneigh) {
// radial functions
memoryKK->realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1);
memoryKK->realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1);
memoryKK->realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase);
memoryKK->realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase);
const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1));
memoryKK->realloc_kokkos(d_values, "pace:d_values", natom, maxneigh, max_num_functions);
memoryKK->realloc_kokkos(d_derivatives, "pace:d_derivatives", natom, maxneigh, max_num_functions);
// hard-core repulsion
memoryKK->realloc_kokkos(cr, "pace:cr", natom, maxneigh);
memoryKK->realloc_kokkos(dcr, "pace:dcr", natom, maxneigh);
// spherical harmonics
memoryKK->realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
// short neigh list
memoryKK->realloc_kokkos(d_ncount, "pace:ncount", natom);
memoryKK->realloc_kokkos(d_mu, "pace:mu", natom, maxneigh);
memoryKK->realloc_kokkos(d_rhats, "pace:rhats", natom, maxneigh);
memoryKK->realloc_kokkos(d_rnorms, "pace:rnorms", natom, maxneigh);
memoryKK->realloc_kokkos(d_nearest, "pace:nearest", natom, maxneigh);
memoryKK->realloc_kokkos(f_ij, "pace:f_ij", natom, maxneigh);
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::copy_pertype()
{
auto basis_set = aceimpl->basis_set;
memoryKK->realloc_kokkos(d_rho_core_cutoff, "pace:rho_core_cutoff", nelements);
memoryKK->realloc_kokkos(d_drho_core_cutoff, "pace:drho_core_cutoff", nelements);
memoryKK->realloc_kokkos(d_E0vals, "pace:E0vals", nelements);
memoryKK->realloc_kokkos(d_ndensity, "pace:ndensity", nelements);
memoryKK->realloc_kokkos(d_npoti, "pace:npoti", nelements);
auto h_rho_core_cutoff = Kokkos::create_mirror_view(d_rho_core_cutoff);
auto h_drho_core_cutoff = Kokkos::create_mirror_view(d_drho_core_cutoff);
auto h_E0vals = Kokkos::create_mirror_view(d_E0vals);
auto h_ndensity = Kokkos::create_mirror_view(d_ndensity);
auto h_npoti = Kokkos::create_mirror_view(d_npoti);
for (int n = 0; n < nelements; n++) {
h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff;
h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff;
h_E0vals(n)= basis_set->E0vals(n);
const int ndensity = h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity;
string npoti = basis_set->map_embedding_specifications.at(n).npoti;
if (npoti == "FinnisSinclair")
h_npoti(n) = FS;
else if (npoti == "FinnisSinclairShiftedScaled")
h_npoti(n) = FS_SHIFTEDSCALED;
}
Kokkos::deep_copy(d_rho_core_cutoff, h_rho_core_cutoff);
Kokkos::deep_copy(d_drho_core_cutoff, h_drho_core_cutoff);
Kokkos::deep_copy(d_E0vals, h_E0vals);
Kokkos::deep_copy(d_ndensity, h_ndensity);
Kokkos::deep_copy(d_npoti, h_npoti);
memoryKK->realloc_kokkos(d_wpre, "pace:wpre", nelements, basis_set->ndensitymax);
memoryKK->realloc_kokkos(d_mexp, "pace:mexp", nelements, basis_set->ndensitymax);
auto h_wpre = Kokkos::create_mirror_view(d_wpre);
auto h_mexp = Kokkos::create_mirror_view(d_mexp);
for (int n = 0; n < nelements; n++) {
const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
for (int p = 0; p < ndensity; p++) {
h_wpre(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 0);
h_mexp(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 1);
}
}
Kokkos::deep_copy(d_wpre, h_wpre);
Kokkos::deep_copy(d_mexp, h_mexp);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::copy_splines()
{
auto basis_set = aceimpl->basis_set;
if (k_splines_gk.d_view.data()) {
for (int i = 0; i < nelements; i++) {
for (int j = 0; j < nelements; j++) {
k_splines_gk.h_view(i, j).deallocate();
k_splines_rnl.h_view(i, j).deallocate();
k_splines_hc.h_view(i, j).deallocate();
}
}
}
k_splines_gk = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_gk", nelements, nelements);
k_splines_rnl = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_rnl", nelements, nelements);
k_splines_hc = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_hc", nelements, nelements);
ACERadialFunctions* radial_functions = dynamic_cast<ACERadialFunctions*>(basis_set->radial_functions);
for (int i = 0; i < nelements; i++) {
for (int j = 0; j < nelements; j++) {
k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j);
k_splines_rnl.h_view(i, j) = radial_functions->splines_rnl(i, j);
k_splines_hc.h_view(i, j) = radial_functions->splines_hc(i, j);
}
}
k_splines_gk.modify_host();
k_splines_rnl.modify_host();
k_splines_hc.modify_host();
k_splines_gk.sync_device();
k_splines_rnl.sync_device();
k_splines_hc.sync_device();
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::copy_tilde()
{
auto basis_set = aceimpl->basis_set;
// flatten loops, get per-element count and max
idx_rho_max = 0;
int total_basis_size_max = 0;
memoryKK->realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements);
auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count);
for (int n = 0; n < nelements; n++) {
int idx_rho = 0;
const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
const int total_basis_size = basis_set->total_basis_size[n];
ACECTildeBasisFunction *basis = basis_set->basis[n];
// rank=1
for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind)
idx_rho++;
// rank > 1
for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
ACECTildeBasisFunction *func = &basis[func_ind];
// loop over {ms} combinations in sum
for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
idx_rho++;
}
h_idx_rho_count(n) = idx_rho;
idx_rho_max = MAX(idx_rho_max, idx_rho);
total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size);
}
Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count);
memoryKK->realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max);
memoryKK->realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max);
memoryKK->realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max);
memoryKK->realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax);
memoryKK->realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax);
memoryKK->realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax);
memoryKK->realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax);
memoryKK->realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax);
auto h_rank = Kokkos::create_mirror_view(d_rank);
auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
auto h_offsets = Kokkos::create_mirror_view(d_offsets);
auto h_mus = Kokkos::create_mirror_view(d_mus);
auto h_ns = Kokkos::create_mirror_view(d_ns);
auto h_ls = Kokkos::create_mirror_view(d_ls);
auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
// copy values on host
for (int n = 0; n < nelements; n++) {
const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
const int total_basis_size = basis_set->total_basis_size[n];
ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n];
ACECTildeBasisFunction *basis = basis_set->basis[n];
const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
int idx_rho = 0;
// rank=1
for (int offset = 0; offset < total_basis_size_rank1; ++offset) {
ACECTildeBasisFunction *func = &basis_rank1[offset];
h_rank(n, offset) = 1;
h_mus(n, offset, 0) = func->mus[0];
h_ns(n, offset, 0) = func->ns[0];
for (int p = 0; p < ndensity; p++)
h_ctildes(n, idx_rho, p) = func->ctildes[p];
h_offsets(n, idx_rho) = offset;
idx_rho++;
}
// rank > 1
for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
ACECTildeBasisFunction *func = &basis[func_ind];
// TODO: check if func->ctildes are zero, then skip
const int offset = total_basis_size_rank1 + func_ind;
const int rank = h_rank(n, offset) = func->rank;
h_num_ms_combs(n, offset) = func->num_ms_combs;
for (int t = 0; t < rank; t++) {
h_mus(n, offset, t) = func->mus[t];
h_ns(n, offset, t) = func->ns[t];
h_ls(n, offset, t) = func->ls[t];
}
// loop over {ms} combinations in sum
for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
for (int t = 0; t < rank; t++)
h_ms_combs(n, idx_rho, t) = ms[t];
for (int p = 0; p < ndensity; ++p) {
// real-part only multiplication
h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p];
}
h_offsets(n, idx_rho) = offset;
idx_rho++;
}
}
}
Kokkos::deep_copy(d_rank, h_rank);
Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
Kokkos::deep_copy(d_offsets, h_offsets);
Kokkos::deep_copy(d_mus, h_mus);
Kokkos::deep_copy(d_ns, h_ns);
Kokkos::deep_copy(d_ls, h_ls);
Kokkos::deep_copy(d_ms_combs, h_ms_combs);
Kokkos::deep_copy(d_ctildes, h_ctildes);
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::init_style()
{
if (host_flag) {
if (lmp->kokkos->nthreads > 1)
error->all(FLERR,"Pair style pace/kk can currently only run on a single "
"CPU thread");
PairPACE::init_style();
return;
}
if (atom->tag_enable == 0) error->all(FLERR, "Pair style PACE requires atom IDs");
if (force->newton_pair == 0) error->all(FLERR, "Pair style PACE requires newton pair on");
// neighbor list request for KOKKOS
neighflag = lmp->kokkos->neighflag;
auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
!std::is_same<DeviceType,LMPDeviceType>::value);
request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
if (neighflag == FULL)
error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
auto basis_set = aceimpl->basis_set;
nelements = basis_set->nelements;
lmax = basis_set->lmax;
nradmax = basis_set->nradmax;
nradbase = basis_set->nradbase;
// spherical harmonics
memoryKK->realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1));
memoryKK->realloc_kokkos(cl, "pace:cl", lmax + 1);
memoryKK->realloc_kokkos(dl, "pace:dl", lmax + 1);
pre_compute_harmonics(lmax);
copy_pertype();
copy_splines();
copy_tilde();
}
/* ----------------------------------------------------------------------
init for one type pair i,j and corresponding j,i
------------------------------------------------------------------------- */
template<class DeviceType>
double PairPACEKokkos<DeviceType>::init_one(int i, int j)
{
double cutone = PairPACE::init_one(i,j);
k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
k_scale.template modify<LMPHostType>();
k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
k_cutsq.template modify<LMPHostType>();
return cutone;
}
/* ----------------------------------------------------------------------
set coeffs for one or more type pairs
------------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::coeff(int narg, char **arg)
{
PairPACE::coeff(narg,arg);
// Set up element lists
auto h_map = Kokkos::create_mirror_view(d_map);
for (int i = 1; i <= atom->ntypes; i++)
h_map(i) = map[i];
Kokkos::deep_copy(d_map,h_map);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::allocate()
{
PairPACE::allocate();
int n = atom->ntypes + 1;
memoryKK->realloc_kokkos(d_map, "pace:map", n);
memoryKK->realloc_kokkos(k_cutsq, "pace:cutsq", n, n);
d_cutsq = k_cutsq.template view<DeviceType>();
memoryKK->realloc_kokkos(k_scale, "pace:scale", n, n);
d_scale = k_scale.template view<DeviceType>();
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
struct FindMaxNumNeighs {
typedef DeviceType device_type;
NeighListKokkos<DeviceType> k_list;
FindMaxNumNeighs(NeighListKokkos<DeviceType>* nl): k_list(*nl) {}
~FindMaxNumNeighs() {k_list.copymode = 1;}
KOKKOS_INLINE_FUNCTION
void operator() (const int& ii, int& maxneigh) const {
const int i = k_list.d_ilist[ii];
const int num_neighs = k_list.d_numneigh[i];
if (maxneigh < num_neighs) maxneigh = num_neighs;
}
};
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
{
if (host_flag) {
atomKK->sync(Host,X_MASK|TYPE_MASK);
PairPACE::compute(eflag_in,vflag_in);
atomKK->modified(Host,F_MASK);
return;
}
eflag = eflag_in;
vflag = vflag_in;
if (neighflag == FULL) no_virial_fdotr_compute = 1;
ev_init(eflag,vflag,0);
// reallocate per-atom arrays if necessary
if (eflag_atom) {
memoryKK->destroy_kokkos(k_eatom,eatom);
memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
d_eatom = k_eatom.view<DeviceType>();
}
if (vflag_atom) {
memoryKK->destroy_kokkos(k_vatom,vatom);
memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
d_vatom = k_vatom.view<DeviceType>();
}
copymode = 1;
int newton_pair = force->newton_pair;
if (newton_pair == false)
error->all(FLERR,"PairPACEKokkos requires 'newton on'");
if (recursive)
error->all(FLERR,"Must use 'product' algorithm with pair pace/kk");
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
f = atomKK->k_f.view<DeviceType>();
type = atomKK->k_type.view<DeviceType>();
k_scale.template sync<DeviceType>();
k_cutsq.template sync<DeviceType>();
NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
d_numneigh = k_list->d_numneigh;
d_neighbors = k_list->d_neighbors;
d_ilist = k_list->d_ilist;
inum = list->inum;
need_dup = lmp->kokkos->need_dup<DeviceType>();
if (need_dup) {
dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
} else {
ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
}
maxneigh = 0;
Kokkos::parallel_reduce("pace::find_maxneigh", inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(maxneigh));
int vector_length_default = 1;
int team_size_default = 1;
if (!host_flag)
team_size_default = 32;
chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
chunk_offset = 0;
grow(chunk_size, maxneigh);
EV_FLOAT ev;
while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
Kokkos::deep_copy(weights, 0.0);
Kokkos::deep_copy(weights_rank1, 0.0);
Kokkos::deep_copy(A, 0.0);
Kokkos::deep_copy(A_rank1, 0.0);
Kokkos::deep_copy(rhos, 0.0);
EV_FLOAT ev_tmp;
if (chunk_size > inum - chunk_offset)
chunk_size = inum - chunk_offset;
//Neigh
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairPACEComputeNeigh>(chunk_size,team_size,vector_length);
int scratch_size = scratch_size_helper<int>(team_size * maxneigh);
typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
//ComputeRadial
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairPACEComputeRadial>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial> policy_radial(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
Kokkos::parallel_for("ComputeRadial",policy_radial,*this);
}
//ComputeYlm
{
int vector_length = vector_length_default;
int team_size = 16;
check_team_size_for<TagPairPACEComputeYlm>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm> policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
Kokkos::parallel_for("ComputeYlm",policy_ylm,*this);
}
//ComputeAi
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairPACEComputeAi>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi> policy_ai(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
Kokkos::parallel_for("ComputeAi",policy_ai,*this);
}
//ConjugateAi
{
typename Kokkos::RangePolicy<DeviceType,TagPairPACEConjugateAi> policy_conj_ai(0,chunk_size);
Kokkos::parallel_for("ConjugateAi",policy_conj_ai,*this);
}
//ComputeRho
{
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0,chunk_size*idx_rho_max);
Kokkos::parallel_for("ComputeRho",policy_rho,*this);
}
//ComputeFS
{
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeFS> policy_fs(0,chunk_size);
Kokkos::parallel_for("ComputeFS",policy_fs,*this);
}
//ComputeWeights
{
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size*idx_rho_max);
Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
}
//ComputeDerivative
{
int vector_length = vector_length_default;
int team_size = team_size_default;
check_team_size_for<TagPairPACEComputeDerivative>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this);
}
//ComputeForce
{
if (evflag) {
if (neighflag == HALF) {
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,1> > policy_force(0,chunk_size);
Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
} else if (neighflag == HALFTHREAD) {
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
Kokkos::parallel_reduce("ComputeForce",policy_force, *this, ev_tmp);
}
} else {
if (neighflag == HALF) {
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,0> > policy_force(0,chunk_size);
Kokkos::parallel_for(policy_force, *this);
} else if (neighflag == HALFTHREAD) {
typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
Kokkos::parallel_for("ComputeForce",policy_force, *this);
}
}
}
ev += ev_tmp;
chunk_offset += chunk_size;
} // end while
if (need_dup)
Kokkos::Experimental::contribute(f, dup_f);
if (eflag_global) eng_vdwl += ev.evdwl;
if (vflag_global) {
virial[0] += ev.v[0];
virial[1] += ev.v[1];
virial[2] += ev.v[2];
virial[3] += ev.v[3];
virial[4] += ev.v[4];
virial[5] += ev.v[5];
}
if (vflag_fdotr) pair_virial_fdotr_compute(this);
if (eflag_atom) {
k_eatom.template modify<DeviceType>();
k_eatom.template sync<LMPHostType>();
}
if (vflag_atom) {
if (need_dup)
Kokkos::Experimental::contribute(d_vatom, dup_vatom);
k_vatom.template modify<DeviceType>();
k_vatom.template sync<LMPHostType>();
}
atomKK->modified(execution_space,F_MASK);
copymode = 0;
// free duplicated memory
if (need_dup) {
dup_f = decltype(dup_f)();
dup_vatom = decltype(dup_vatom)();
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh>::member_type& team) const
{
const int ii = team.league_rank();
const int i = d_ilist[ii + chunk_offset];
const int itype = type[i];
const X_FLOAT xtmp = x(i,0);
const X_FLOAT ytmp = x(i,1);
const X_FLOAT ztmp = x(i,2);
const int jnum = d_numneigh[i];
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff
// If it is, inside is assigned to 1, otherwise -1
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * maxneigh; // offset into pointer for entire team
int* inside = (int*)team.team_shmem().get_shmem(team.team_size() * maxneigh * sizeof(int), 0) + scratch_shift;
// loop over list of all neighbors within force cutoff
// distsq[] = distance sq to each
// rlist[] = distance vector to each
// nearest[] = atom indices of neighbors
int ncount = 0;
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,jnum),
[&] (const int jj, int& count) {
int j = d_neighbors(i,jj);
j &= NEIGHMASK;
const int jtype = type(j);
const F_FLOAT delx = xtmp - x(j,0);
const F_FLOAT dely = ytmp - x(j,1);
const F_FLOAT delz = ztmp - x(j,2);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
inside[jj] = -1;
if (rsq < d_cutsq(itype,jtype)) {
inside[jj] = 1;
count++;
}
},ncount);
d_ncount(ii) = ncount;
Kokkos::parallel_scan(Kokkos::TeamThreadRange(team,jnum),
[&] (const int jj, int& offset, bool final) {
if (inside[jj] < 0) return;
if (final) {
int j = d_neighbors(i,jj);
j &= NEIGHMASK;
const F_FLOAT delx = xtmp - x(j,0);
const F_FLOAT dely = ytmp - x(j,1);
const F_FLOAT delz = ztmp - x(j,2);
const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
const F_FLOAT r = sqrt(rsq);
const F_FLOAT rinv = 1.0/r;
const int mu_j = d_map(type(j));
d_mu(ii,offset) = mu_j;
d_rnorms(ii,offset) = r;
d_rhats(ii,offset,0) = -delx*rinv;
d_rhats(ii,offset,1) = -dely*rinv;
d_rhats(ii,offset,2) = -delz*rinv;
d_nearest(ii,offset) = j;
}
offset++;
});
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeRadial, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const
{
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
const int i = d_ilist[ii + chunk_offset];
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ncount = d_ncount(ii);
if (jj >= ncount) return;
const double r_norm = d_rnorms(ii, jj);
const int mu_i = d_map(type(i));
const int mu_j = d_mu(ii, jj);
evaluate_splines(ii, jj, r_norm, nradbase, nradmax, mu_i, mu_j);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const
{
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ncount = d_ncount(ii);
if (jj >= ncount) return;
const double xn = d_rhats(ii, jj, 0);
const double yn = d_rhats(ii, jj, 1);
const double zn = d_rhats(ii, jj, 2);
compute_ylm(ii,jj,xn,yn,zn,lmax);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const
{
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
const int i = d_ilist[ii + chunk_offset];
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ncount = d_ncount(ii);
if (jj >= ncount) return;
const int mu_j = d_mu(ii, jj);
// rank = 1
for (int n = 0; n < nradbase; n++)
Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00);
// rank > 1
for (int n = 0; n < nradmax; n++) {
for (int l = 0; l <= lmax; l++) {
for (int m = 0; m <= l; m++) {
const int idx = l * (l + 1) + m; // (l, m)
Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re);
Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im);
}
}
}
// hard-core repulsion
Kokkos::atomic_add(&rho_core(ii), cr(ii, jj));
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEConjugateAi, const int& ii) const
{
const int i = d_ilist[ii + chunk_offset];
//complex conjugate A's (for NEGATIVE (-m) terms)
// for rank > 1
for (int mu_j = 0; mu_j < nelements; mu_j++) {
for (int n = 0; n < nradmax; n++) {
for (int l = 0; l <= lmax; l++) {
//fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
for (int m = 1; m <= l; m++) {
const int idx = l * (l + 1) + m; // (l, m)
const int idxm = l * (l + 1) - m; // (l, -m)
const int factor = m % 2 == 0 ? 1 : -1;
A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor;
}
}
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
{
const int idx_rho = iter / chunk_size;
const int ii = iter % chunk_size;
const int i = d_ilist[ii + chunk_offset];
const int mu_i = d_map(type(i));
if (idx_rho >= d_idx_rho_count(mu_i)) return;
const int ndensity = d_ndensity(mu_i);
const int offset = d_offsets(mu_i, idx_rho);
const int rank = d_rank(mu_i, offset);
const int r = rank - 1;
// Basis functions B with iterative product and density rho(p) calculation
if (rank == 1) {
const int mu = d_mus(mu_i, offset, 0);
const int n = d_ns(mu_i, offset, 0);
double A_cur = A_rank1(ii, mu, n - 1);
for (int p = 0; p < ndensity; ++p) {
//for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur);
}
} else { // rank > 1
// loop over {ms} combinations in sum
// loop over m, collect B = product of A with given ms
A_forward_prod(ii, idx_rho, 0) = complex::one();
// fill forward A-product triangle
for (int t = 0; t < rank; t++) {
//TODO: optimize ns[t]-1 -> ns[t] during functions construction
const int mu = d_mus(mu_i, offset, t);
const int n = d_ns(mu_i, offset, t);
const int l = d_ls(mu_i, offset, t);
const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank)
const int idx = l * (l + 1) + m; // (l, m)
A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx);
A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t);
}
complex A_backward_prod = complex::one();
// fill backward A-product triangle
for (int t = r; t >= 1; t--) {
const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th
dB_flatten(ii, idx_rho, t) = dB;
A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t);
}
dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod;
const complex B = A_forward_prod(ii, idx_rho, rank);
for (int p = 0; p < ndensity; ++p) {
// real-part only multiplication
Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p)));
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii) const
{
const int i = d_ilist[ii + chunk_offset];
const int mu_i = d_map(type(i));
const double rho_cut = d_rho_core_cutoff(mu_i);
const double drho_cut = d_drho_core_cutoff(mu_i);
const int ndensity = d_ndensity(mu_i);
double evdwl, fcut, dfcut;
evdwl = fcut = dfcut = 0.0;
inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
FS_values_and_derivatives(ii, evdwl, mu_i);
dF_drho_core(ii) = evdwl * dfcut + 1;
for (int p = 0; p < ndensity; ++p)
dF_drho(ii, p) *= fcut;
// tally energy contribution
if (eflag) {
double evdwl_cut = evdwl * fcut + rho_core(ii);
// E0 shift
evdwl_cut += d_E0vals(mu_i);
e_atom(ii) = evdwl_cut;
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
{
const int idx_rho = iter / chunk_size;
const int ii = iter % chunk_size;
const int i = d_ilist[ii + chunk_offset];
const int mu_i = d_map(type(i));
if (idx_rho >= d_idx_rho_count(mu_i)) return;
const int ndensity = d_ndensity(mu_i);
const int offset = d_offsets(mu_i, idx_rho);
const int rank = d_rank(mu_i, offset);
// Weights and theta calculation
if (rank == 1) {
const int mu = d_mus(mu_i, offset, 0);
const int n = d_ns(mu_i, offset, 0);
double theta = 0.0;
for (int p = 0; p < ndensity; ++p) {
// for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
}
Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
} else { // rank > 1
double theta = 0.0;
for (int p = 0; p < ndensity; ++p)
theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
theta *= 0.5; // 0.5 factor due to possible double counting ???
for (int t = 0; t < rank; ++t) {
const int m_t = d_ms_combs(mu_i, idx_rho, t);
const int factor = (m_t % 2 == 0 ? 1 : -1);
const complex dB = dB_flatten(ii, idx_rho, t);
const int mu_t = d_mus(mu_i, offset, t);
const int n_t = d_ns(mu_i, offset, t);
const int l_t = d_ls(mu_i, offset, t);
const int idx = l_t * (l_t + 1) + m_t; // (l, m)
const complex value = theta * dB;
Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im);
// update -m_t (that could also be positive), because the basis is half_basis
const int idxm = l_t * (l_t + 1) - m_t; // (l, -m)
const complex valuem = theta * dB.conj() * (double)factor;
Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re);
Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im);
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeDerivative, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative>::member_type& team) const
{
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() %
((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
const int i = d_ilist[ii + chunk_offset];
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ncount = d_ncount(ii);
if (jj >= ncount) return;
const int itype = type(i);
const double scale = d_scale(itype,itype);
int j = d_nearest(ii,jj);
const int mu_j = d_mu(ii, jj);
double r_hat[3];
r_hat[0] = d_rhats(ii, jj, 0);
r_hat[1] = d_rhats(ii, jj, 1);
r_hat[2] = d_rhats(ii, jj, 2);
const double r = d_rnorms(ii, jj);
const double rinv = 1.0/r;
double f_ji[3];
f_ji[0] = f_ji[1] = f_ji[2] = 0;
// for rank = 1
for (int n = 0; n < nradbase; ++n) {
if (weights_rank1(ii, mu_j, n) == 0) continue;
double &DG = dgr(ii, jj, n);
double DGR = DG * Y00;
DGR *= weights_rank1(ii, mu_j, n);
f_ji[0] += DGR * r_hat[0];
f_ji[1] += DGR * r_hat[1];
f_ji[2] += DGR * r_hat[2];
}
// for rank > 1
for (int n = 0; n < nradmax; n++) {
for (int l = 0; l <= lmax; l++) {
const double R_over_r = fr(ii, jj, n, l) * rinv;
const double DR = dfr(ii, jj, n, l);
// for m >= 0
for (int m = 0; m <= l; m++) {
const int idx = l * (l + 1) + m; // (l, m)
complex w = weights(ii, mu_j, n, idx);
if (w.re == 0.0 && w.im == 0.0) continue;
// counting for -m cases if m > 0
if (m > 0) {
w.re *= 2.0;
w.im *= 2.0;
}
complex DY[3];
DY[0] = dylm(ii, jj, idx, 0);
DY[1] = dylm(ii, jj, idx, 1);
DY[2] = dylm(ii, jj, idx, 2);
const complex Y_DR = ylm(ii, jj, idx) * DR;
complex grad_phi_nlm[3];
grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r;
grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r;
grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r;
// real-part multiplication only
f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
}
}
}
// hard-core repulsion
const double fpair = dF_drho_core(ii) * dcr(ii,jj);
f_ij(ii, jj, 0) = scale * f_ji[0] + fpair * r_hat[0];
f_ij(ii, jj, 1) = scale * f_ji[1] + fpair * r_hat[1];
f_ij(ii, jj, 2) = scale * f_ji[2] + fpair * r_hat[2];
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const
{
// The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
const auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
const auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
const int i = d_ilist[ii + chunk_offset];
const int itype = type(i);
const double scale = d_scale(itype,itype);
const int ncount = d_ncount(ii);
F_FLOAT fitmp[3] = {0.0,0.0,0.0};
for (int jj = 0; jj < ncount; jj++) {
int j = d_nearest(ii,jj);
double r_hat[3];
r_hat[0] = d_rhats(ii, jj, 0);
r_hat[1] = d_rhats(ii, jj, 1);
r_hat[2] = d_rhats(ii, jj, 2);
const double r = d_rnorms(ii, jj);
const double delx = -r_hat[0]*r;
const double dely = -r_hat[1]*r;
const double delz = -r_hat[2]*r;
const double fpairx = f_ij(ii, jj, 0);
const double fpairy = f_ij(ii, jj, 1);
const double fpairz = f_ij(ii, jj, 2);
fitmp[0] += fpairx;
fitmp[1] += fpairy;
fitmp[2] += fpairz;
a_f(j,0) -= fpairx;
a_f(j,1) -= fpairy;
a_f(j,2) -= fpairz;
// tally per-atom virial contribution
if (EVFLAG && vflag_either)
v_tally_xyz<NEIGHFLAG>(ev, i, j, fpairx, fpairy, fpairz, delx, dely, delz);
}
a_f(i,0) += fitmp[0];
a_f(i,1) += fitmp[1];
a_f(i,2) += fitmp[2];
// tally energy contribution
if (EVFLAG && eflag_either) {
const double evdwl = scale*e_atom(ii);
//ev_tally_full(i, 2.0 * evdwl, 0.0, 0.0, 0.0, 0.0, 0.0);
if (eflag_global) ev.evdwl += evdwl;
if (eflag_atom) d_eatom[i] += evdwl;
}
}
template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
EV_FLOAT ev;
this->template operator()<NEIGHFLAG,EVFLAG>(TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
{
// The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
const E_FLOAT v0 = delx*fx;
const E_FLOAT v1 = dely*fy;
const E_FLOAT v2 = delz*fz;
const E_FLOAT v3 = delx*fy;
const E_FLOAT v4 = delx*fz;
const E_FLOAT v5 = dely*fz;
if (vflag_global) {
ev.v[0] += v0;
ev.v[1] += v1;
ev.v[2] += v2;
ev.v[3] += v3;
ev.v[4] += v4;
ev.v[5] += v5;
}
if (vflag_atom) {
a_vatom(i,0) += 0.5*v0;
a_vatom(i,1) += 0.5*v1;
a_vatom(i,2) += 0.5*v2;
a_vatom(i,3) += 0.5*v3;
a_vatom(i,4) += 0.5*v4;
a_vatom(i,5) += 0.5*v5;
a_vatom(j,0) += 0.5*v0;
a_vatom(j,1) += 0.5*v1;
a_vatom(j,2) += 0.5*v2;
a_vatom(j,3) += 0.5*v3;
a_vatom(j,4) += 0.5*v4;
a_vatom(j,5) += 0.5*v5;
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void PairPACEKokkos<DeviceType>::pre_compute_harmonics(int lmax)
{
auto h_alm = Kokkos::create_mirror_view(alm);
auto h_blm = Kokkos::create_mirror_view(blm);
auto h_cl = Kokkos::create_mirror_view(cl);
auto h_dl = Kokkos::create_mirror_view(dl);
for (int l = 1; l <= lmax; l++) {
const double lsq = l * l;
const double ld = 2 * l;
const double l1 = (4 * lsq - 1);
const double l2 = lsq - ld + 1;
for (int m = 0; m < l - 1; m++) {
const double msq = m * m;
const double a = sqrt((double(l1)) / (double(lsq - msq)));
const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
const int idx = l * (l + 1) + m; // (l, m)
h_alm(idx) = a;
h_blm(idx) = b;
}
}
for (int l = 1; l <= lmax; l++) {
h_cl(l) = -sqrt(1.0 + 0.5 / (double(l)));
h_dl(l) = sqrt(double(2 * (l - 1) + 3));
}
Kokkos::deep_copy(alm, h_alm);
Kokkos::deep_copy(blm, h_blm);
Kokkos::deep_copy(cl, h_cl);
Kokkos::deep_copy(dl, h_dl);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::compute_barplm(int ii, int jj, double rz, int lmax) const
{
// requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
// prefactors include 1/sqrt(2) factor compared to reference
// l=0, m=0
// plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi;
plm(ii, jj, 0) = Y00; //= 1;
dplm(ii, jj, 0) = 0.0;
if (lmax > 0) {
// l=1, m=0
plm(ii, jj, 2) = Y00 * sq3 * rz;
dplm(ii, jj, 2) = Y00 * sq3;
// l=1, m=1
plm(ii, jj, 3) = -sq3o2 * Y00;
dplm(ii, jj, 3) = 0.0;
// loop l = 2, lmax
for (int l = 2; l <= lmax; l++) {
for (int m = 0; m < l - 1; m++) {
const int idx = l * (l + 1) + m; // (l, m)
const int idx1 = (l - 1) * l + m; // (l - 1, m)
const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m)
plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2));
dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2));
}
const int idx = l * (l + 1) + l; // (l, l)
const int idx1 = l * (l + 1) + l - 1; // (l, l - 1)
const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1)
const double t = dl(l) * plm(ii, jj, idx2);
plm(ii, jj, idx1) = t * rz;
dplm(ii, jj, idx1) = t;
plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2);
dplm(ii, jj, idx) = 0.0;
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const
{
// requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
complex phase;
complex phasem, mphasem1;
complex dyx, dyy, dyz;
complex rdy;
phase.re = rx;
phase.im = ry;
// compute barplm
compute_barplm(ii, jj, rz, lmax);
// m = 0
for (int l = 0; l <= lmax; l++) {
const int idx = l * (l + 1);
ylm(ii, jj, idx).re = plm(ii, jj, idx);
ylm(ii, jj, idx).im = 0.0;
dyz.re = dplm(ii, jj, idx);
rdy.re = dyz.re * rz;
dylm(ii, jj, idx, 0).re = -rdy.re * rx;
dylm(ii, jj, idx, 0).im = 0.0;
dylm(ii, jj, idx, 1).re = -rdy.re * ry;
dylm(ii, jj, idx, 1).im = 0.0;
dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
dylm(ii, jj, idx, 2).im = 0;
}
// m = 1
for (int l = 1; l <= lmax; l++) {
const int idx = l * (l + 1) + 1;
ylm(ii, jj, idx) = phase * plm(ii, jj, idx);
dyx.re = plm(ii, jj, idx);
dyx.im = 0.0;
dyy.re = 0.0;
dyy.im = plm(ii, jj, idx);
dyz.re = phase.re * dplm(ii, jj, idx);
dyz.im = phase.im * dplm(ii, jj, idx);
rdy.re = rx * dyx.re + +rz * dyz.re;
rdy.im = ry * dyy.im + rz * dyz.im;
dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
dylm(ii, jj, idx, 0).im = -rdy.im * rx;
dylm(ii, jj, idx, 1).re = -rdy.re * ry;
dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
}
// m > 1
phasem = phase;
for (int m = 2; m <= lmax; m++) {
mphasem1.re = phasem.re * double(m);
mphasem1.im = phasem.im * double(m);
phasem = phasem * phase;
for (int l = m; l <= lmax; l++) {
const int idx = l * (l + 1) + m;
ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx);
ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx);
dyx = mphasem1 * plm(ii, jj, idx);
dyy.re = -dyx.im;
dyy.im = dyx.re;
dyz = phasem * dplm(ii, jj, idx);
rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx;
dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry;
dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const
{
if (r <= r_in-delta_in) {
fc = 1;
dfc = 0;
} else if (r >= r_in ) {
fc = 0;
dfc = 0;
} else {
double x = 1 - 2 * (1 + (r - r_in) / delta_in);
fc = 0.5 + 7.5 / 2. * (x / 4. - pow(x, 3) / 6. + pow(x, 5) / 20.);
dfc = -7.5 / delta_in * (0.25 - x * x / 2.0 + pow(x, 4) / 4.);
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::Fexp(const double x, const double m, double &F, double &DF) const
{
const double w = 1.e6;
const double eps = 1e-10;
const double lambda = pow(1.0 / w, m - 1.0);
if (abs(x) > eps) {
double g;
const double a = abs(x);
const double am = pow(a, m);
const double w3x3 = pow(w * a, 3); //// use cube
const double sign_factor = (signbit(x) ? -1 : 1);
if (w3x3 > 30.0)
g = 0.0;
else
g = exp(-w3x3);
const double omg = 1.0 - g;
F = sign_factor * (omg * am + lambda * g * a);
const double dg = -3.0 * w * w * w * a * a * g;
DF = m * pow(a, m - 1.0) * omg - am * dg + lambda * dg * a + lambda * g;
} else {
F = lambda * x;
DF = lambda;
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::FexpShiftedScaled(const double rho, const double mexp, double &F, double &DF) const
{
const double eps = 1e-10;
if (abs(mexp - 1.0) < eps) {
F = rho;
DF = 1;
} else {
const double a = abs(rho);
const double exprho = exp(-a);
const double nx = 1. / mexp;
const double xoff = pow(nx, (nx / (1.0 - nx))) * exprho;
const double yoff = pow(nx, (1 / (1.0 - nx))) * exprho;
const double sign_factor = (signbit(rho) ? -1 : 1);
F = sign_factor * (pow(xoff + a, mexp) - yoff);
DF = yoff + mexp * (-xoff + 1.0) * pow(xoff + a, mexp - 1.);
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::inner_cutoff(const double rho_core, const double rho_cut, const double drho_cut,
double &fcut, double &dfcut) const
{
double rho_low = rho_cut - drho_cut;
if (rho_core >= rho_cut) {
fcut = 0;
dfcut = 0;
} else if (rho_core <= rho_low) {
fcut = 1;
dfcut = 0;
} else {
cutoff_func_poly(rho_core, rho_cut, drho_cut, fcut, dfcut);
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::FS_values_and_derivatives(const int ii, double &evdwl, const int mu_i) const
{
double F, DF = 0;
int npoti = d_npoti(mu_i);
int ndensity = d_ndensity(mu_i);
for (int p = 0; p < ndensity; p++) {
const double wpre = d_wpre(mu_i, p);
const double mexp = d_mexp(mu_i, p);
if (npoti == FS)
Fexp(rhos(ii, p), mexp, F, DF);
else if (npoti == FS_SHIFTEDSCALED)
FexpShiftedScaled(rhos(ii, p), mexp, F, DF);
evdwl += F * wpre; // * weight (wpre)
dF_drho(ii, p) = DF * wpre; // * weight (wpre)
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, double r, int nradbase_c, int nradial_c, int mu_i,
int mu_j) const
{
auto &spline_gk = k_splines_gk.template view<DeviceType>()(mu_i, mu_j);
auto &spline_rnl = k_splines_rnl.template view<DeviceType>()(mu_i, mu_j);
auto &spline_hc = k_splines_hc.template view<DeviceType>()(mu_i, mu_j);
spline_gk.calcSplines(ii, jj, r, gr, dgr);
spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives);
for (int kk = 0; kk < fr.extent(2); kk++) {
for (int ll = 0; ll < fr.extent(3); ll++) {
const int flatten = kk*fr.extent(3) + ll;
fr(ii, jj, kk, ll) = d_values(ii, jj, flatten);
dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten);
}
}
spline_hc.calcSplines(ii, jj, r, d_values, d_derivatives);
cr(ii, jj) = d_values(ii, jj, 0);
dcr(ii, jj) = d_derivatives(ii, jj, 0);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
{
double wl, wl2, wl3, w2l1, w3l2;
double c[4];
double x = r * rscalelookup;
int nl = static_cast<int>(floor(x));
if (nl <= 0)
Kokkos::abort("Encountered very small distance. Stopping.");
if (nl < nlut) {
wl = x - double(nl);
wl2 = wl * wl;
wl3 = wl2 * wl;
w2l1 = 2.0 * wl;
w3l2 = 3.0 * wl2;
for (int func_id = 0; func_id < num_of_functions; func_id++) {
for (int idx = 0; idx < 4; idx++)
c[idx] = lookupTable(nl, func_id, idx);
d_values(ii, jj, func_id) = c[0] + c[1] * wl + c[2] * wl2 + c[3] * wl3;
d_derivatives(ii, jj, func_id) = (c[1] + c[2] * w2l1 + c[3] * w3l2) * rscalelookup;
}
} else { // fill with zeroes
for (int func_id = 0; func_id < num_of_functions; func_id++) {
d_values(ii, jj, func_id) = 0.0;
d_derivatives(ii, jj, func_id) = 0.0;
}
}
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template<class TagStyle>
void PairPACEKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
template<class TagStyle>
void PairPACEKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size, int vector_length) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType>
template<typename scratch_type>
int PairPACEKokkos<DeviceType>::scratch_size_helper(int values_per_team) {
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
return ScratchViewType::shmem_size(values_per_team);
}
/* ----------------------------------------------------------------------
memory usage of arrays
------------------------------------------------------------------------- */
template<class DeviceType>
double PairPACEKokkos<DeviceType>::memory_usage()
{
double bytes = 0;
bytes += memoryKK->memory_usage(A);
bytes += memoryKK->memory_usage(A_rank1);
bytes += memoryKK->memory_usage(A_list);
bytes += memoryKK->memory_usage(A_forward_prod);
bytes += memoryKK->memory_usage(e_atom);
bytes += memoryKK->memory_usage(rhos);
bytes += memoryKK->memory_usage(dF_drho);
bytes += memoryKK->memory_usage(weights);
bytes += memoryKK->memory_usage(weights_rank1);
bytes += memoryKK->memory_usage(rho_core);
bytes += memoryKK->memory_usage(dF_drho_core);
bytes += memoryKK->memory_usage(dB_flatten);
bytes += memoryKK->memory_usage(fr);
bytes += memoryKK->memory_usage(dfr);
bytes += memoryKK->memory_usage(gr);
bytes += memoryKK->memory_usage(dgr);
bytes += memoryKK->memory_usage(d_values);
bytes += memoryKK->memory_usage(d_derivatives);
bytes += memoryKK->memory_usage(cr);
bytes += memoryKK->memory_usage(dcr);
bytes += memoryKK->memory_usage(plm);
bytes += memoryKK->memory_usage(dplm);
bytes += memoryKK->memory_usage(ylm);
bytes += memoryKK->memory_usage(dylm);
bytes += memoryKK->memory_usage(d_ncount);
bytes += memoryKK->memory_usage(d_mu);
bytes += memoryKK->memory_usage(d_rhats);
bytes += memoryKK->memory_usage(d_rnorms);
bytes += memoryKK->memory_usage(d_nearest);
bytes += memoryKK->memory_usage(f_ij);
bytes += memoryKK->memory_usage(d_rho_core_cutoff);
bytes += memoryKK->memory_usage(d_drho_core_cutoff);
bytes += memoryKK->memory_usage(d_E0vals);
bytes += memoryKK->memory_usage(d_ndensity);
bytes += memoryKK->memory_usage(d_npoti);
bytes += memoryKK->memory_usage(d_wpre);
bytes += memoryKK->memory_usage(d_mexp);
bytes += memoryKK->memory_usage(d_idx_rho_count);
bytes += memoryKK->memory_usage(d_rank);
bytes += memoryKK->memory_usage(d_num_ms_combs);
bytes += memoryKK->memory_usage(d_offsets);
bytes += memoryKK->memory_usage(d_mus);
bytes += memoryKK->memory_usage(d_ns);
bytes += memoryKK->memory_usage(d_ls);
bytes += memoryKK->memory_usage(d_ms_combs);
bytes += memoryKK->memory_usage(d_ctildes);
bytes += memoryKK->memory_usage(alm);
bytes += memoryKK->memory_usage(blm);
bytes += memoryKK->memory_usage(cl);
bytes += memoryKK->memory_usage(dl);
for (int i = 0; i < nelements; i++)
for (int j = 0; j < nelements; j++)
bytes += k_splines_gk.h_view(i, j).memory_usage();
return bytes;
}
/* ---------------------------------------------------------------------- */
namespace LAMMPS_NS {
template class PairPACEKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class PairPACEKokkos<LMPHostType>;
#endif
}