Merge pull request #1 from rohskopf/sna-grid-kokkos

compute sna/grid/kokkos
This commit is contained in:
Lenz Fiedler
2023-06-07 13:57:50 +02:00
committed by GitHub
11 changed files with 1474 additions and 12 deletions

View File

@ -0,0 +1,81 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "compute_sna_grid_kokkos_impl.h"
namespace LAMMPS_NS {
template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkosHost<LMPHostType>;
#endif
}
// The following chunk will compile but we're gonna try a wrapper approach like pair snap.
/*
#include "compute_sna_grid_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "sna_kokkos.h"
#include "update.h"
using namespace LAMMPS_NS;
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid(lmp, narg, arg)
{
printf("^^^ inside ComputeSNAGridKokkos constructor\n");
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
}
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
{
if (copymode) return;
}
namespace LAMMPS_NS {
template class ComputeSNAGridKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkos<LMPHostType>;
#endif
}
*/

View File

@ -0,0 +1,366 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
#ifdef LMP_KOKKOS_GPU
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
#else
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
#endif
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
#include "compute_sna_grid.h"
#include "kokkos_type.h"
//#include "pair_snap.h"
//#include "kokkos_type.h"
//#include "neigh_list_kokkos.h"
#include "sna_kokkos.h"
//#include "pair_kokkos.h"
namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
//template<int NEIGHFLAG, int EVFLAG>
//struct TagPairSNAPComputeForce{};
// GPU backend only
/*
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPComputeCayleyKlein{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPBeta{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
template<int dir>
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
*/
//struct TagPairSNAPPreUi{};
struct TagCSNAGridComputeNeigh{};
struct TagCSNAGridComputeCayleyKlein{};
struct TagCSNAGridPreUi{};
struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagCSNAGridComputeZi{};
struct TagCSNAGridComputeBi{};
struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
struct TagCSNAGridLocalFill{}; // fill the gridlocal array
//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
struct TagComputeSNAGridLoop{};
struct TagComputeSNAGrid3D{};
//struct TagCSNAGridTeam{};
// CPU backend only
/*
struct TagPairSNAPComputeNeighCPU{};
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPTransformUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPComputeBiCPU{};
struct TagPairSNAPZeroYiCPU{};
struct TagPairSNAPComputeYiCPU{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrjCPU{};
*/
struct TagComputeSNAGridLoopCPU{};
//template<class DeviceType>
template<class DeviceType, typename real_type_, int vector_length_>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
static constexpr int vector_length = vector_length_;
using real_type = real_type_;
using complex = SNAComplex<real_type>;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
static constexpr int tile_size_compute_ck = 2;
static constexpr int tile_size_pre_ui = 2;
static constexpr int team_size_compute_ui = 2;
static constexpr int tile_size_transform_ui = 2;
static constexpr int tile_size_compute_zi = 2;
static constexpr int tile_size_compute_bi = 2;
static constexpr int tile_size_transform_bi = 2;
static constexpr int tile_size_compute_yi = 2;
static constexpr int team_size_compute_fused_deidrj = 2;
#else
static constexpr int team_size_compute_neigh = 4;
static constexpr int tile_size_compute_ck = 4;
static constexpr int tile_size_pre_ui = 4;
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
static constexpr int tile_size_transform_ui = 4;
static constexpr int tile_size_compute_zi = 8;
static constexpr int tile_size_compute_bi = 4;
static constexpr int tile_size_transform_bi = 4;
static constexpr int tile_size_compute_yi = 8;
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
#endif
// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
// and reduces the verbosity of the LaunchBound by hiding the explicit
// multiplication by vector_length
template <class Device, int num_tiles, class TagComputeSNAP>
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
// MDRangePolicy for the 3D grid loop:
template <class Device, class TagComputeSNAP>
using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
// Testing out team policies
template <class Device, int num_teams, class TagComputeSNAP>
using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
//using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
//using team_member = typename team_policy::member_type;
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
// This hides the LaunchBounds abstraction by hiding the explicit
// multiplication by vector length
template <class Device, int num_teams, class TagComputeSNAP>
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
void init() override;
void setup() override;
void compute_array() override;
// Utility functions for teams
template<class TagStyle>
void check_team_size_for(int, int&);
template<class TagStyle>
void check_team_size_reduce(int, int&);
// operator function for example team policy
//KOKKOS_INLINE_FUNCTION
//void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoop, const int& ) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoopCPU, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
// PrintNeigh
//void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridLocalFill,const int& ii) const;
protected:
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
int max_neighs, chunk_size, chunk_offset;
int host_flag;
int ntotal;
int total_range; // total number of loop iterations in grid
int zlen; //= nzhi-nzlo+1;
int ylen; //= nyhi-nylo+1;
int xlen; //= nxhi-nxlo+1;
double cutsq_tmp; // temporary cutsq until we get a view
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
Kokkos::View<real_type*, DeviceType> d_wjelem; // elements weights
//Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
Kokkos::View<real_type*, DeviceType> d_test; // test view
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_grid;
DAT::tdual_float_2d k_gridall;
typename AT::t_float_2d d_grid;
typename AT::t_float_2d d_gridall;
//DAT::tdual_float_4d k_gridlocal;
//typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
//typedef Kokkos::View<real_type****, DeviceType> t_4d;
// should we use LMPDeviceType below?
//typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
//typedef tdual_float_4d::t_dev tdev_float_4d;
//tdual_float_4d k_gridlocal;
//tdev_float_4d d_gridlocal;
DAT::tdual_float_4d k_gridlocal;
typename AT::t_float_4d d_gridlocal;
// Utility routine which wraps computing per-team scratch size requirements for
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
template <typename scratch_type>
int scratch_size_helper(int values_per_team);
class DomainKokkos *domainKK;
// triclinic vars
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
double h0, h1, h2, h3, h4, h5;
double lo0, lo1, lo2;
};
// These wrapper classes exist to make the compute style factory happy/avoid having
// to extend the compute style factory to support Compute classes w/an arbitrary number
// of extra template parameters
template <class DeviceType>
class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
private:
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
public:
ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
void init() override;
void compute_array() override;
};
#ifdef LMP_KOKKOS_GPU
template <class DeviceType>
class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
private:
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
public:
ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
void init() override;
void compute_array() override;
};
#endif
}
#endif
#endif
// The following will compile with the chunk in cpp file but we're gonna try wrapper like pair snap.
/*
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
#include "compute_sna_grid.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
//template<int CSTYLE, int NCOL>
//struct TagComputeCoordAtom{};
template<class DeviceType>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
private:
};
}
#endif
#endif
*/

View File

@ -0,0 +1,932 @@
// clang-format off
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
Evan Weinberg (NVIDIA)
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "pair_snap_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
//#include "sna_kokkos.h"
#include "domain.h"
#include "domain_kokkos.h"
#include "sna.h"
#include "update.h"
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iostream>
#define MAXLINE 1024
#define MAXWORD 3
namespace LAMMPS_NS {
// Constructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
{
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
domainKK = (DomainKokkos *) domain;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq;
host_flag = (execution_space == Host);
// TODO: Extract cutsq in double loop below, no need for cutsq_tmp
cutsq_tmp = cutsq[1][1];
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = 1; j <= atom->ntypes; j++){
k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
k_cutsq.template modify<LMPHostType>();
}
}
// Set up element lists
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
// test
MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
int n = atom->ntypes;
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
auto h_map = Kokkos::create_mirror_view(d_map);
// test
auto h_test = Kokkos::create_mirror_view(d_test);
h_test(0) = 2.0;
// start from index 1 because of how compute sna/grid is
for (int i = 1; i <= atom->ntypes; i++) {
h_radelem(i-1) = radelem[i];
h_wjelem(i-1) = wjelem[i];
if (switchinnerflag){
h_sinnerelem(i) = sinnerelem[i];
h_dinnerelem(i) = dinnerelem[i];
}
}
// In pair snap some things like `map` get allocated regardless of chem flag.
if (chemflag){
for (int i = 1; i <= atom->ntypes; i++) {
h_map(i) = map[i];
}
}
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_wjelem,h_wjelem);
if (switchinnerflag){
Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
}
if (chemflag){
Kokkos::deep_copy(d_map,h_map);
}
Kokkos::deep_copy(d_test,h_test);
double bytes = MemKK::memory_usage(d_wjelem);
snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
snaKK.grow_rij(0,0);
snaKK.init();
}
// Destructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
{
if (copymode) return;
memoryKK->destroy_kokkos(k_cutsq,cutsq);
//memoryKK->destroy_kokkos(k_grid,grid);
memoryKK->destroy_kokkos(k_gridall, gridall);
//memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
}
// Init
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
{
if (host_flag) {
return;
}
ComputeSNAGrid::init();
}
// Setup
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
{
// Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there.
// Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices.
ComputeGrid::set_grid_global();
ComputeGrid::set_grid_local();
// allocate arrays
//printf(">>> Allocating gridall.\n");
//printf(">>> %d %d\n", size_array_rows, size_array_cols);
//memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
//printf(">>> Allocated gridall.\n");
// do not use or allocate gridlocal for now
gridlocal_allocated = 0;
/*
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
gridlocal_allocated = 1;
memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo,
nyhi, nxlo, nxhi, "grid:gridlocal");
}
*/
array = gridall;
d_gridlocal = k_gridlocal.template view<DeviceType>();
//d_grid = k_grid.template view<DeviceType>();
d_gridall = k_gridall.template view<DeviceType>();
}
// Compute
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
{
if (host_flag) {
return;
}
copymode = 1;
zlen = nzhi-nzlo+1;
ylen = nyhi-nylo+1;
xlen = nxhi-nxlo+1;
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
type = atomKK->k_type.view<DeviceType>();
k_cutsq.template sync<DeviceType>();
// max_neighs is defined here - think of more elaborate methods.
max_neighs = 100;
// Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
// number of atoms.
ntotal = atomKK->nlocal + atomKK->nghost;
// Allocate view for number of neighbors per grid point
MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
// "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
// `total_range` is the number of grid points which may be larger than chunk size.
//printf(">>> total_range: %d\n", total_range);
chunk_size = MIN(chunksize, total_range);
chunk_offset = 0;
//snaKK.grow_rij(chunk_size, ntotal);
snaKK.grow_rij(chunk_size, max_neighs);
//chunk_size = total_range;
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
if (triclinic){
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
h0 = domain->h[0];
h1 = domain->h[1];
h2 = domain->h[2];
h3 = domain->h[3];
h4 = domain->h[4];
h5 = domain->h[5];
lo0 = domain->boxlo[0];
lo1 = domain->boxlo[1];
lo2 = domain->boxlo[2];
}
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
if (chunk_size > total_range - chunk_offset)
chunk_size = total_range - chunk_offset;
//printf(">>> chunk_offset: %d\n", chunk_offset);
//ComputeNeigh
{
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//PreUi
{
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
if (chunk_size < parallel_thresh)
{
// Version with parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * max_neighs;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// team_size_transform_ui is defined in `pair_snap_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
//Compute bispectrum in AoSoA data layout, transform Bi
//ComputeZi
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
//ComputeBi
const int idxb_max = snaKK.idxb_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
//Transform data layout of blist out of AoSoA
//We need this because `blist` gets used in ComputeForce which doesn't
//take advantage of AoSoA, which at best would only be beneficial on the margins
//NOTE: Do we need this in compute sna/grid/kk?
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
// Fill the grid array with bispectrum values
{
typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
Kokkos::parallel_for(policy_fill, *this);
}
// Proceed to the next chunk.
chunk_offset += chunk_size;
} // end while
k_gridlocal.template modify<DeviceType>();
k_gridlocal.template sync<LMPHostType>();
//k_grid.template modify<DeviceType>();
//k_grid.template sync<LMPHostType>();
k_gridall.template modify<DeviceType>();
k_gridall.template sync<LMPHostType>();
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
------------------------------------------------------------------------- */
/*
Simple team policy functor seeing how many layers deep we can go with the parallelism.
*/
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
// This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
// Main difference is that we don't use the neighbor class or neighbor variables here.
// This is because the grid points are not atoms and therefore do not get assigned
// neighbors in LAMMPS.
// TODO: If we did make a neighborlist for each grid point, we could use current
// routines and avoid having to loop over all atoms (which limits us to
// natoms = max team size).
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// basic quantities associated with this team:
// team_rank : rank of thread in this team
// league_rank : rank of team in this league
// team_size : number of threads in this team
// extract loop index
int ii = team.team_rank() + team.league_rank() * team.team_size();
if (ii >= chunk_size) return;
// extract grid index
int igrid = ii + chunk_offset;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is assigned to the atom type.
// If it's not, it's assigned to -1.
const int tile_size = ntotal; //max_neighs; // number of elements per thread
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index ii already captures the proper grid point
//int igrid = iz * (nx * ny) + iy * nx + ix;
//printf("%d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
// multiply grid integers by grid spacing delx, dely, delz
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Do a conversion on `xgrid` here like we do in the CPU version.
// Can't do this:
// domainKK->lamda2x(xgrid, xgrid);
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
int ielem = 0;
if (chemflag) ielem = d_map[itype];
const double radi = d_radelem[ielem];
// We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
// The purpose here is to transform for triclinic boxes.
/*
if (triclinic){
printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
}
*/
// Compute the number of neighbors, store rsq
int ninside = 0;
// Looping over ntotal for now.
for (int j = 0; j < ntotal; j++){
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
// don't include atoms that share location with grid point
if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
jtype = -1; // use -1 to signal it's outside the radius
}
if (jtype >= 0)
ninside++;
}
/*
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& count) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
// don't include atoms that share location with grid point
if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
jtype = -1; // use -1 to signal it's outside the radius
}
type_cache[j] = jtype;
if (jtype >= 0)
count++;
}, ninside);
*/
d_ninside(ii) = ninside;
// TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type.
int offset = 0;
for (int j = 0; j < ntotal; j++){
//const int jtype = type_cache[j];
//if (jtype >= 0) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
int jtype = type(j);
if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
int jelem = 0;
if (chemflag) jelem = d_map[jtype];
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
// actually since the views here have values starting at 0, let's use jelem
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
my_sna.inside(ii,offset) = j;
if (switchinnerflag) {
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
}
if (chemflag)
my_sna.element(ii,offset) = jelem;
else
my_sna.element(ii,offset) = 0;
offset++;
}
}
/*
int offset = 0;
for (int j = 0; j < ntotal; j++){
const int jtype = type_cache[j];
if (jtype >= 0) {
printf(">>> offset: %d\n", offset);
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
int jelem = 0;
if (chemflag) jelem = d_map[jtype];
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
// actually since the views here have values starting at 0, let's use jelem
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
my_sna.inside(ii,offset) = j;
if (switchinnerflag) {
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
}
if (chemflag)
my_sna.element(ii,offset) = jelem;
else
my_sna.element(ii,offset) = 0;
offset++;
}
}
*/
/*
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& offset, bool final) {
const int jtype = type_cache[j];
if (jtype >= 0) {
if (final) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
int jtype = type(j);
int jelem = 0;
if (chemflag) jelem = d_map[jtype];
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
// actually since the views here have values starting at 0, let's use jelem
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
my_sna.inside(ii,offset) = j;
if (switchinnerflag) {
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
}
if (chemflag)
my_sna.element(ii,offset) = jelem;
else
my_sna.element(ii,offset) = 0;
}
offset++;
}
});
*/
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jnbor >= ninside) return;
my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
int itype = type(ii);
// force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
int ielem = 0;
my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend_location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
const int jbend = jj_jbend / max_neighs;
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * max_neighs;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu > my_sna.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
for (int ielem = 0; ielem < elem_count; ielem++){
const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
if (mapper.flip_sign == 1){
utot_im = -utot_im;
} else if (mapper.flip_sign == -1){
utot_re = -utot_re;
}
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= my_sna.idxz_max) return;
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= my_sna.idxb_max) return;
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxb >= my_sna.idxb_max) return;
const int ntriples = my_sna.ntriples;
for (int itriple = 0; itriple < ntriples; itriple++) {
const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
my_sna.blist(iatom, itriple, idxb) = blocal;
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract grid index
int igrid = ii + chunk_offset;
// convert to grid indices
int iz = igrid/(xlen*ylen);
int i2 = igrid - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
double xgrid[3];
// index ii already captures the proper grid point
// int igrid = iz * (nx * ny) + iy * nx + ix;
// printf("ii igrid: %d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
if (triclinic) {
// Do a conversion on `xgrid` here like we do in the CPU version.
// Can't do this:
// domainKK->lamda2x(xgrid, xgrid);
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
/*
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
*/
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
xgrid[2] = h2*xgrid[2] + lo2;
}
const F_FLOAT xtmp = xgrid[0];
const F_FLOAT ytmp = xgrid[1];
const F_FLOAT ztmp = xgrid[2];
d_gridall(igrid,0) = xtmp;
d_gridall(igrid,1) = ytmp;
d_gridall(igrid,2) = ztmp;
const auto idxb_max = snaKK.idxb_max;
// linear contributions
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
const auto idxb = icoeff % idxb_max;
const auto idx_chem = icoeff / idxb_max;
d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
}
}
/* ----------------------------------------------------------------------
utility functions
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<typename scratch_type>
int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
return ScratchViewType::shmem_size(values_per_team);
}
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
routines used by template reference classes
------------------------------------------------------------------------- */
template<class DeviceType>
ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
template<class DeviceType>
void ComputeSNAGridKokkosDevice<DeviceType>::init()
{
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
{
Base::compute_array();
}
#ifdef LMP_KOKKOS_GPU
template<class DeviceType>
ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
template<class DeviceType>
void ComputeSNAGridKokkosHost<DeviceType>::init()
{
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
{
Base::compute_array();
}
#endif
}

View File

@ -769,6 +769,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;
//4d float array n
typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
typedef tdual_float_4d::t_dev t_float_4d;
typedef tdual_float_4d::t_dev_const t_float_4d_const;
typedef tdual_float_4d::t_dev_um t_float_4d_um;
typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;
#ifdef LMP_KOKKOS_NO_LEGACY
typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
#else
@ -1075,6 +1083,14 @@ typedef tdual_float_2d::t_host_um t_float_2d_um;
typedef tdual_float_2d::t_host_const_um t_float_2d_const_um;
typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread;
//4d float array n
typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
typedef tdual_float_4d::t_host t_float_4d;
typedef tdual_float_4d::t_host_const t_float_4d_const;
typedef tdual_float_4d::t_host_um t_float_4d_um;
typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;
#ifdef LMP_KOKKOS_NO_LEGACY
typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
#else

View File

@ -163,6 +163,7 @@ template <typename TYPE, typename HTYPE>
{
data = TYPE(std::string(name),n1,n2);
h_data = Kokkos::create_mirror_view(data);
//printf(">>> name: %s\n", name);
return data;
}
@ -173,6 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
data = TYPE(std::string(name),n1,n2);
bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
array = (typename TYPE::value_type **) smalloc(nbytes,name);
//printf(">>> name %s nbytes %d\n", name, nbytes);
for (int i = 0; i < n1; i++) {
if (n2 == 0)
@ -183,6 +185,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
return data;
}
/* ----------------------------------------------------------------------
create a 4d array with indices 2,3,4 offset, but not first
2nd index from n2lo to n2hi inclusive
3rd index from n3lo to n3hi inclusive
4th index from n4lo to n4hi inclusive
cannot grow it
------------------------------------------------------------------------- */
template <typename TYPE>
TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
const char *name)
{
//if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr;
printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
int n2 = n2hi - n2lo + 1;
int n3 = n3hi - n3lo + 1;
int n4 = n4hi - n4lo + 1;
data = TYPE(std::string(name),n1,n2,n3,n4);
bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
array = (typename TYPE::value_type ****) smalloc(nbytes,name);
for (int i = 0; i < n1; i++) {
if (n2 == 0) {
array[i] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
for (int j = 0; j < n2; j++){
if (n3 == 0){
array[i][j] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
for (int k = 0; k < n3; k++){
if (n4 == 0)
array[i][j][k] = nullptr;
else
array[i][j][k] = &data.h_view(i,j,k,0);
}
}
}
}
}
return data;
}
template <typename TYPE, typename HTYPE>
TYPE create_kokkos(TYPE &data, HTYPE &h_data,
typename TYPE::value_type **&array, int n1, int n2,

View File

@ -232,6 +232,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
// map[i] = which element the Ith atom type is, -1 if not mapped
// map[0] is not used
//printf(">>> ntypes: %d\n", atom->ntypes);
for (int i = 1; i <= atom->ntypes; i++) {
char* elemname = elemtypes[i-1];
int jelem;
@ -239,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
if (strcmp(elemname,descriptor->elements[jelem]) == 0)
break;
//printf(">>> nelements: %d\n", descriptor->nelements);
if (jelem < descriptor->nelements)
map[i] = jelem;
else if (strcmp(elemname,"NULL") == 0) map[i] = -1;

View File

@ -664,6 +664,8 @@ template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1));
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract atom number

View File

@ -643,7 +643,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
}
ulist_wrapper.set(ma, ulist_accum);
mb++;
}
@ -870,7 +869,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
int jju1 = idxu_block[j1] + (j1+1)*mb1min;
int jju2 = idxu_block[j2] + (j2+1)*mb2max;
int icgb = mb1min*(j2+1) + mb2max;
#ifdef LMP_KK_DEVICE_COMPILE
#pragma unroll
#endif
@ -2298,7 +2296,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
constexpr real_type zero = static_cast<real_type>(0.0);
constexpr real_type onehalf = static_cast<real_type>(0.5);
if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
if (switch_flag == 0) { sfac_outer = one; dsfac_outer = zero; }
else if (switch_flag == 1) {
if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
else if (r > rcut) { sfac = zero; dsfac = zero; return; }

View File

@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeGrid::~ComputeGrid()
{
if (copymode) return;
deallocate();
}
@ -87,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x)
x[2] = iz * delz;
if (triclinic) domain->lamda2x(x, x);
//printf(">>>>> ComputeGrid::grid2x\n");
}
/* ----------------------------------------------------------------------
@ -102,6 +104,7 @@ void ComputeGrid::assign_coords_all()
gridall[igrid][1] = x[1];
gridall[igrid][2] = x[2];
}
//printf(">>>>> ComputeGrid::assign_coords_all\n");
}
/* ----------------------------------------------------------------------
@ -110,8 +113,8 @@ void ComputeGrid::assign_coords_all()
void ComputeGrid::allocate()
{
//printf(">>> ComputeGrid::allocate\n");
// allocate arrays
memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {

View File

@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
// skip over arguments used by base class
// so that argument positions are identical to
// regular per-atom compute
arg += nargbase;
narg -= nargbase;
// begin code common to all SNAP computes
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
//double rfac0, rmin0;
//int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int ntypes = atom->ntypes;
int nargmin = 6 + 2 * ntypes;
@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
wselfallflag = 0;
switchinnerflag = 0;
nelements = 1;
chunksize = 32768;
parallel_thresh = 8192;
// process required arguments
@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
for (int i = 0; i < ntypes; i++)
for (int i = 0; i < ntypes; i++) {
wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
}
// construct cutsq
@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid::~ComputeSNAGrid()
{
if (copymode) return;
memory->destroy(radelem);
memory->destroy(wjelem);
memory->destroy(cutsq);
delete snaptr;
if (chemflag) memory->destroy(map);
}
@ -202,6 +205,7 @@ void ComputeSNAGrid::init()
void ComputeSNAGrid::compute_array()
{
invoked_array = update->ntimestep;
// compute sna for each gridpoint

View File

@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
void init() override;
void compute_array() override;
double memory_usage() override;
int ncoeff,nelements; // public for kokkos, but could go in the protected block now
private:
int ncoeff;
protected:
//int ncoeff;
double **cutsq;
double rcutfac;
double *radelem;
double *wjelem;
int *map; // map types to [0,nelements)
int nelements, chemflag;
int chemflag;
int switchinnerflag;
double *sinnerelem;
double *dinnerelem;
int parallel_thresh;
class SNA *snaptr;
double cutmax;
int quadraticflag;
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int chunksize;
};
} // namespace LAMMPS_NS