Add all SNAP computations and 4D view Kokkos memory allocator

rohskopf
2023-04-08 11:19:07 -06:00
parent de4dbec661
commit 212b864052
7 changed files with 1118 additions and 57 deletions

View File

@@ -23,3 +23,59 @@ template class ComputeSNAGridKokkosHost<LMPHostType>;
#endif
}
// The following chunk will compile, but we're going to try a wrapper approach like pair snap.
/*
#include "compute_sna_grid_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "sna_kokkos.h"
#include "update.h"
using namespace LAMMPS_NS;
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid(lmp, narg, arg)
{
printf("^^^ inside ComputeSNAGridKokkos constructor\n");
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
}
// ----------------------------------------------------------------------
template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
{
if (copymode) return;
}
namespace LAMMPS_NS {
template class ComputeSNAGridKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkos<LMPHostType>;
#endif
}
*/

View File

@@ -29,42 +29,233 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
#include "compute_sna_grid.h"
#include "kokkos_type.h"
//#include "pair_snap.h"
//#include "kokkos_type.h"
//#include "neigh_list_kokkos.h"
#include "sna_kokkos.h"
//#include "pair_kokkos.h"
namespace LAMMPS_NS {
// Routines for both the CPU and GPU backend
//template<int NEIGHFLAG, int EVFLAG>
//struct TagPairSNAPComputeForce{};
// GPU backend only
/*
struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPComputeCayleyKlein{};
struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagPairSNAPComputeZi{};
struct TagPairSNAPBeta{};
struct TagPairSNAPComputeBi{};
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeYiWithZlist{};
template<int dir>
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
template<int dir>
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
*/
//struct TagPairSNAPPreUi{};
struct TagCSNAGridComputeNeigh{};
struct TagCSNAGridComputeCayleyKlein{};
struct TagCSNAGridPreUi{};
struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
struct TagCSNAGridComputeZi{};
struct TagCSNAGridComputeBi{};
struct TagComputeSNAGridLoop{};
struct TagComputeSNAGrid3D{};
//struct TagCSNAGridTeam{};
// CPU backend only
/*
struct TagPairSNAPComputeNeighCPU{};
struct TagPairSNAPPreUiCPU{};
struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPTransformUiCPU{};
struct TagPairSNAPComputeZiCPU{};
struct TagPairSNAPBetaCPU{};
struct TagPairSNAPComputeBiCPU{};
struct TagPairSNAPZeroYiCPU{};
struct TagPairSNAPComputeYiCPU{};
struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrjCPU{};
*/
struct TagComputeSNAGridLoopCPU{};
//template<class DeviceType>
template<class DeviceType, typename real_type_, int vector_length_>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
//enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
//enum {COUL_FLAG=0};
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef EV_FLOAT value_type;
static constexpr int vector_length = vector_length_;
using real_type = real_type_;
//using complex = SNAComplex<real_type>;
using complex = SNAComplex<real_type>;
// Static team/tile sizes for device offload
#ifdef KOKKOS_ENABLE_HIP
static constexpr int team_size_compute_neigh = 2;
static constexpr int tile_size_compute_ck = 2;
static constexpr int tile_size_pre_ui = 2;
static constexpr int team_size_compute_ui = 2;
static constexpr int tile_size_transform_ui = 2;
static constexpr int tile_size_compute_zi = 2;
static constexpr int tile_size_compute_bi = 2;
static constexpr int tile_size_transform_bi = 2;
static constexpr int tile_size_compute_yi = 2;
static constexpr int team_size_compute_fused_deidrj = 2;
#else
static constexpr int team_size_compute_neigh = 4;
static constexpr int tile_size_compute_ck = 4;
static constexpr int tile_size_pre_ui = 4;
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
static constexpr int tile_size_transform_ui = 4;
static constexpr int tile_size_compute_zi = 8;
static constexpr int tile_size_compute_bi = 4;
static constexpr int tile_size_transform_bi = 4;
static constexpr int tile_size_compute_yi = 8;
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
#endif
// Custom MDRangePolicy, Rank<3>, to reduce the verbosity of kernel launches.
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...> boilerplate
// and reduces the verbosity of the LaunchBounds by hiding the explicit
// multiplication by vector_length
template <class Device, int num_tiles, class TagComputeSNAP>
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
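// For illustration only (this mirrors the launches in compute_array()):
//   Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
//     policy_compute_zi({0,0,0}, {vector_length, idxz_max, chunk_size_div},
//                       {vector_length, tile_size_compute_zi, 1});
//   Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this);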
// MDRangePolicy for the 3D grid loop:
template <class Device, class TagComputeSNAP>
using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
// Testing out team policies
template <class Device, int num_teams, class TagComputeSNAP>
using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
//using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
//using team_member = typename team_policy::member_type;
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches.
// This hides the LaunchBounds boilerplate and the explicit
// multiplication by vector_length
template <class Device, int num_teams, class TagComputeSNAP>
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
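// For illustration only (this mirrors ComputeNeigh in compute_array()): a league of
// chunk_size teams, num_teams threads per team, vector_length vector lanes, plus
// optional per-team scratch:
//   SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
//     policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
//   policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
//   Kokkos::parallel_for("ComputeNeigh", policy_neigh, *this);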
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
void init() override;
//void compute_array(int, int) override;
//double memory_usage() override;
void setup() override;
void compute_array() override;
// Utility functions for teams
template<class TagStyle>
void check_team_size_for(int, int&);
template<class TagStyle>
void check_team_size_reduce(int, int&);
// operator function for example team policy
//KOKKOS_INLINE_FUNCTION
//void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoop, const int& ) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagComputeSNAGridLoopCPU, const int&) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
protected:
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
int chunk_size, chunk_offset;
int host_flag;
int ntotal;
int total_range; // total number of loop iterations in grid
int zlen; //= nzhi-nzlo+1;
int ylen; //= nyhi-nylo+1;
int xlen; //= nxhi-nxlo+1;
using KKDeviceType = typename KKDevice<DeviceType>::value;
double cutsq_tmp; // temporary cutsq until we get a view
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
Kokkos::View<real_type*, DeviceType> d_wjelem; // elements weights
//Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
tdual_fparams k_cutsq;
typedef Kokkos::View<const F_FLOAT**, DeviceType,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
t_fparams_rnd rnd_cutsq;
typename AT::t_x_array_randomread x;
typename AT::t_int_1d_randomread type;
DAT::tdual_float_2d k_grid;
DAT::tdual_float_2d k_gridall;
typename AT::t_float_2d d_grid;
typename AT::t_float_2d d_gridall;
//DAT::tdual_float_4d k_gridlocal;
//typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
//typedef Kokkos::View<real_type****, DeviceType> t_4d;
typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
tdual_float_4d k_gridlocal;
tdual_float_4d d_gridlocal;
// Utility routine which wraps computing per-team scratch size requirements for
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
template <typename scratch_type>
int scratch_size_helper(int values_per_team);
};
// These wrapper classes exist to make the compute style factory happy/avoid having
// to extend the compute style factory to support Compute classes w/an arbitrary number
// of extra template parameters
template <class DeviceType>
@@ -76,10 +267,9 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_
public:
ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
//ComputeSNAGridKokkosDevice(class LAMMPS *);
void init() override;
//double memory_usage() override;
void compute_array() override;
};
@@ -93,10 +283,9 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
public:
ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
//ComputeSNAGridKokkosHost(class LAMMPS *);
void init();
//double memory_usage();
void init() override;
void compute_array() override;
};
#endif
@@ -105,3 +294,45 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KO
#endif
#endif
// The following will compile with the chunk in the cpp file, but we're going to try a wrapper approach like pair snap.
/*
#ifdef COMPUTE_CLASS
// clang-format off
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
// clang-format on
#else
// clang-format off
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
#include "compute_sna_grid.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
//template<int CSTYLE, int NCOL>
//struct TagComputeCoordAtom{};
template<class DeviceType>
class ComputeSNAGridKokkos : public ComputeSNAGrid {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
~ComputeSNAGridKokkos() override;
private:
};
}
#endif
#endif
*/

View File

@@ -3,12 +3,10 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
LAMMPS development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
@@ -18,17 +16,20 @@
------------------------------------------------------------------------- */
#include "compute_sna_grid_kokkos.h"
#include "pair_snap_kokkos.h"
#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neighbor_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
//#include "sna_kokkos.h"
#include "sna.h"
#include "update.h"
#include <cmath>
#include <cstdlib>
@@ -39,69 +40,757 @@
namespace LAMMPS_NS {
/* ---------------------------------------------------------------------- */
// Constructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg)
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
{
//respa_enable = 0;
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
//datamask_read = EMPTY_MASK;
//datamask_modify = EMPTY_MASK;
datamask_read = EMPTY_MASK;
datamask_modify = EMPTY_MASK;
//host_flag = (execution_space == Host);
k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
auto d_cutsq = k_cutsq.template view<DeviceType>();
rnd_cutsq = d_cutsq;
host_flag = (execution_space == Host);
// The ComputeSNAGrid constructor allocates `map`, so let's do the same here.
// Actually, let's move this down to init().
//int n = atom->ntypes;
//printf("^^^ realloc d_map\n");
//MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
printf("^^^^^ cutsq: %f\n", cutsq[1][1]);
cutsq_tmp = cutsq[1][1];
//memoryKK->create_kokkos(k_gridlocal,
//printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
}
/* ---------------------------------------------------------------------- */
// Destructor
template<class DeviceType, typename real_type, int vector_length>
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
{
if (copymode) return;
//memoryKK->destroy_kokkos(k_eatom,eatom);
//memoryKK->destroy_kokkos(k_vatom,vatom);
printf("^^^ Finish ComputeSNAGridKokkos destructor\n");
}
/* ---------------------------------------------------------------------- */
// Init
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
{
printf("^^^ Begin ComputeSNAGridKokkos init()\n");
// The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it
// calls the original coeff function. So let's do that here:
ComputeSNAGrid::init();
// Set up element lists
printf("^^^ Begin kokkos reallocs\n");
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements);
// pair snap kokkos uses `ncoeffall` in the following, inherited from the original class.
//MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff);
MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements);
MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements);
int n = atom->ntypes;
//printf("^^^ realloc d_map\n");
printf("^^^ n: %d\n", n);
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
printf("^^^ begin mirrow view creation\n");
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
//auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem);
auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
auto h_map = Kokkos::create_mirror_view(d_map);
printf("^^^ begin loop over elements, nelements = %d\n", nelements);
for (int ielem = 0; ielem < nelements; ielem++) {
printf("^^^^^ ielem %d\n", ielem);
h_radelem(ielem) = radelem[ielem];
printf("^^^^^ 1\n");
h_wjelem(ielem) = wjelem[ielem];
printf("^^^^^ 2\n");
if (switchinnerflag){
h_sinnerelem(ielem) = sinnerelem[ielem];
h_dinnerelem(ielem) = dinnerelem[ielem];
}
// pair snap kokkos uses `ncoeffall` in the following.
//for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
// h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff];
//}
}
printf("^^^ begin loop over map\n");
// NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where
// some things like `map` get allocated regardless of chem flag.
if (chemflag){
for (int i = 1; i <= atom->ntypes; i++) {
h_map(i) = map[i];
printf("%d\n", map[i]);
}
}
Kokkos::deep_copy(d_radelem,h_radelem);
Kokkos::deep_copy(d_wjelem,h_wjelem);
if (switchinnerflag){
Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
}
if (chemflag){
Kokkos::deep_copy(d_map,h_map);
}
snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
snaKK.grow_rij(0,0);
snaKK.init();
printf("^^^ inside ComputeSNAGridKokkos init\n");
// from pair_snap_kokkos_impl.h :
/*
if (host_flag) {
if (lmp->kokkos->nthreads > 1)
error->all(FLERR,"Pair style snap/kk can currently only run on a single "
"CPU thread");
PairSNAP::init_style();
// The following use of lmp->kokkos gives a compile error: pointer to incomplete class type not allowed.
//if (lmp->kokkos->nthreads > 1)
// error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single "
// "CPU thread");
ComputeSNAGrid::init();
return;
}
if (force->newton_pair == 0)
error->all(FLERR,"Pair style SNAP requires newton pair on");
printf("^^^ Finished ComputeSNAGridKokkos init\n");
// neighbor list request for KOKKOS
neighflag = lmp->kokkos->neighflag;
auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
!std::is_same<DeviceType,LMPDeviceType>::value);
request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
if (neighflag == FULL)
error->all(FLERR,"Must use half neighbor list style with pair snap/kk");
*/
}
// Setup
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
{
// Do not call ComputeGrid::setup(); we don't want to allocate the grid array there.
// Instead, call ComputeGrid::set_grid_global() and set_grid_local() to set the n indices.
//ComputeGrid::setup();
printf("^^^^^ SETUP!\n");
//printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]);
ComputeGrid::set_grid_global();
ComputeGrid::set_grid_local();
// allocate arrays
memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
gridlocal_allocated = 1;
memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo,
nyhi, nxlo, nxhi, "grid:gridlocal");
}
array = gridall;
}
// Compute
template<class DeviceType, typename real_type, int vector_length>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
{
printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n");
if (DeviceType::in_parallel()) {
printf("^^^ compute_array() is a host function\n");
} else {
printf("^^^ compute_array() is not a host function\n");
}
if (host_flag) {
/*
atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK);
PairSNAP::compute(eflag_in,vflag_in);
atomKK->modified(Host,F_MASK);
*/
return;
}
copymode = 1;
zlen = nzhi-nzlo+1;
ylen = nyhi-nylo+1;
xlen = nxhi-nxlo+1;
printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
x = atomKK->k_x.view<DeviceType>();
// This will error because trying to access host view on the device:
//printf("x(0,0): %f\n", x(0,0));
type = atomKK->k_type.view<DeviceType>();
k_cutsq.template sync<DeviceType>();
MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range);
//printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi);
// Pair snap/kk calls grow_rij with some max number of neighbors, but compute sna/grid uses the total
// number of atoms.
//const int ntotal = atomKK->nlocal + atomKK->nghost;
ntotal = atomKK->nlocal + atomKK->nghost;
//printf("^^^ ntotal: %d\n", ntotal);
// ensure rij, inside, and typej are of size jnum
// snaKK.grow_rij(int, int) requires 2 args where one is a chunksize.
chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user
//printf("^^^ chunk_size: %d\n", chunk_size);
snaKK.grow_rij(chunk_size, ntotal);
// Launch 3 teams of the maximum number of threads per team
//const int team_size_max = team_policy(3, 1).team_size_max(
// TagCSNAGridTeamPolicy, Kokkos::ParallelForTag());
//typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeamPolicy> team_policy_test(3,1);
// Using custom policy:
/*
CSNAGridTeamPolicy<DeviceType, team_size_compute_neigh ,TagCSNAGridTeam> team_policy(chunk_size,team_size_compute_neigh,vector_length);
//team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("TeamPolicy",team_policy,*this);
*/
chunk_size = total_range;
printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length);
// team_size_compute_neigh is defined in `compute_sna_grid_kokkos.h`
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
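// For example (illustrative values only): with vector_length = 32 and chunk_size = 100,
// chunk_size_div = 4, i.e. ceil(100/32); the partially filled final block is masked off
// inside the kernels by their `ii >= chunk_size` early returns.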
//ComputeNeigh
{
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * ntotal);
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
}
//ComputeCayleyKlein
{
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
}
//PreUi
{
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
Kokkos::parallel_for("PreUi",policy_preui,*this);
}
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
{
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
const int tile_size = vector_length * (twojmax + 1);
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
if (chunk_size < parallel_thresh)
{
// Version with parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations)
const int n_teams = chunk_size_div * ntotal * (twojmax + 1);
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
} else {
// Version w/out parallelism over j_bend
// total number of teams needed: (natoms / 32) * (ntotal)
const int n_teams = chunk_size_div * ntotal;
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
}
}
//TransformUi: un-"fold" ulisttot, zero ylist
{
// tile_size_transform_ui is defined in `compute_sna_grid_kokkos.h`
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
}
//Compute bispectrum in AoSoA data layout, transform Bi
//if (quadraticflag || eflag) {
//ComputeZi
const int idxz_max = snaKK.idxz_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
//ComputeBi
const int idxb_max = snaKK.idxb_max;
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
//Looks like best way to grab blist is in a parallel_for
//Transform data layout of blist out of AoSoA
//We need this because `blist` gets used in ComputeForce which doesn't
//take advantage of AoSoA, which at best would only be beneficial on the margins
//NOTE: Do we need this in compute sna/grid/kk?
/*
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagPairSNAPTransformBi>
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
*/
// let's try a simple parallel for loop
// NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this
// function, because this is a host-device function.
/*
typename Kokkos::RangePolicy<DeviceType,TagComputeSNAGridLoop> policy_loop(0,4);
Kokkos::parallel_for("Loop",policy_loop,*this);
*/
// Simple working loop:
/*
Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) {
printf("Greeting from iteration %i\n",i);
});
*/
/*
// NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this
// function, because this is a host-device function.
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1});
Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
*/
// Simple example of 3D MD range policy.
// Begin loop over grid points.
/*
// NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this
// function, but we get it in the above function.
int n = 3; // bounds for mdrange policy
typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n});
Kokkos::parallel_for("3D",policy_3d,*this);
*/
printf("^^^ End ComputeSNAGridKokkos compute_array()\n");
}
/* ----------------------------------------------------------------------
Begin routines that are unique to the GPU codepath. These take advantage
of AoSoA data layouts and scratch memory for recursive polynomials
------------------------------------------------------------------------- */
/*
Simple team policy functor seeing how many layers deep we can go with the parallelism.
*/
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
// this function is following the same procedure in ComputeNeigh of PairSNAPKokkos
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// basic quantities associated with this team:
// team_rank : rank of thread in this team
// league_rank : rank of team in this league
// team_size : number of threads in this team
//printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size());
// extract loop index
int ii = team.team_rank() + team.league_rank() * team.team_size();
if (ii >= chunk_size) return;
// get a pointer to scratch memory
// This is used to cache whether or not an atom is within the cutoff.
// If it is, type_cache is set to the atom type; if not, it is set to -1.
const int tile_size = ntotal; // number of elements per thread
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
//printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift);
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
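// Scratch layout, for reference: compute_array() sizes the team scratch for
// team_size_compute_neigh * ntotal ints via scratch_size_helper<int>, so each
// thread owns the ntotal-int slice starting at team_rank * ntotal.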
//printf("ii: %d\n", ii);
// convert to grid indices
int iz = ii/(xlen*ylen);
int i2 = ii - (iz*xlen*ylen);
int iy = i2/xlen;
int ix = i2 % xlen;
iz += nzlo;
iy += nylo;
ix += nxlo;
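// Worked example of this mapping (illustrative values only): total_range is flattened as
//   ii = (iz - nzlo)*xlen*ylen + (iy - nylo)*xlen + (ix - nxlo),
// so with xlen = ylen = 4, ii = 21 unpacks to iz = nzlo + 1, iy = nylo + 1, ix = nxlo + 1.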
double xgrid[3];
//int igrid = iz * (nx * ny) + iy * nx + ix;
// these end up being the same...?
//printf("ii igrid: %d %d\n", ii, igrid);
// grid2x converts igrid to ix,iy,iz like we've done before
//grid2x(igrid, xgrid);
xgrid[0] = ix * delx;
xgrid[1] = iy * dely;
xgrid[2] = iz * delz;
const double xtmp = xgrid[0];
const double ytmp = xgrid[1];
const double ztmp = xgrid[2];
// currently, all grid points are type 1
// not clear what a better choice would be
const int itype = 1;
const int ielem = d_map[itype];
const double radi = d_radelem[ielem];
// We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
if (triclinic){
printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
} else {
//printf("We are not triclinic\n");
}
// can check xgrid positions with original
//printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]);
// Compute the number of neighbors, store rsq
int ninside = 0;
// want to loop over ntotal... keep getting seg fault when accessing type_cache[j]?
//printf("ntotal: %d\n", ntotal);
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& count) {
// From pair snap/kk :
/*
T_INT j = d_neighbors(i,jj);
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
*/
// From compute sna/grid/kk :
/*
const double delx = xtmp - x[j][0];
const double dely = ytmp - x[j][1];
const double delz = ztmp - x[j][2];
*/
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
//printf("dx: %f\n", dx);
//const double rsq = delx * delx + dely * dely + delz * delz;
int jtype = type(j);
//printf("jtype: %d\n", jtype);
//int jelem = 0;
//if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
//if (rsq >= rnd_cutsq(itype,jtype)) {
if (rsq >= cutsq_tmp){
jtype = -1; // use -1 to signal it's outside the radius
}
//printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype));
if (j > 340){
printf("j: %d\n", j);
}
//printf("j: %d\n", j);
type_cache[j] = jtype;
if (jtype >= 0)
count++;
}, ninside);
//printf("ninside: %d\n", ninside);
d_ninside(ii) = ninside;
// TODO: Make sure itype is appropriate instead of ielem
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
[&] (const int j, int& offset, bool final) {
const int jtype = type_cache[j];
if (jtype >= 0) {
if (final) {
const F_FLOAT dx = x(j,0) - xtmp;
const F_FLOAT dy = x(j,1) - ytmp;
const F_FLOAT dz = x(j,2) - ztmp;
const int jelem = d_map[jtype];
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[jelem])*rcutfac);
my_sna.inside(ii,offset) = j;
if (switchinnerflag) {
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]);
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]);
}
if (chemflag)
my_sna.element(ii,offset) = jelem;
else
my_sna.element(ii,offset) = 0;
}
offset++;
}
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
//printf("^^^ ComputeCayleyKlein\n");
/*
if (DeviceType::in_parallel()) {
printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n");
} else {
printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n");
}
*/
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
const int ninside = ntotal; //d_ninside(ii);
if (jnbor >= ninside) return;
my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int ii = iatom_mod + iatom_div * vector_length;
if (ii >= chunk_size) return;
int itype = type(ii);
int ielem = d_map[itype];
my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend_location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug
const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1));
const int jbend = jj_jbend / ntotal;
int jj = jj_jbend - jbend * ntotal; // removed "const" to work around GCC 7 bug
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
// extract flattened atom_div / neighbor number / bend location
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
// extract neighbor index, iatom_div
int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug
int jj = flattened_idx - iatom_div * ntotal;
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
[&] (const int iatom_mod) {
const int ii = iatom_mod + vector_length * iatom_div;
if (ii >= chunk_size) return;
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
});
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (idxu > my_sna.idxu_max) return;
int elem_count = chemflag ? nelements : 1;
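// For reference, the loop below follows the same pattern as TagPairSNAPTransformUi:
// flip_sign == 1 negates the imaginary part, flip_sign == -1 negates the real part,
// and flip_sign == 0 marks entries whose ylist accumulators are zeroed here.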
for (int ielem = 0; ielem < elem_count; ielem++){
const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
if (mapper.flip_sign == 1){
utot_im = -utot_im;
} else if (mapper.flip_sign == -1){
utot_re = -utot_re;
}
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
if (mapper.flip_sign == 0) {
my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
}
}
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjz >= my_sna.idxz_max) return;
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
}
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
const int iatom = iatom_mod + iatom_div * vector_length;
if (iatom >= chunk_size) return;
if (jjb >= my_sna.idxb_max) return;
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
}
/*
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
}
*/
/* ----------------------------------------------------------------------
Begin routines that are unique to the CPU codepath. These do not take
advantage of AoSoA data layouts, but that could be a good point of
future optimization and unification with the above kernels. It's unlikely
that scratch memory optimizations will ever be useful for the CPU due to
different arithmetic intensity requirements for the CPU vs GPU.
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagComputeSNAGridLoopCPU,const int& ii) const {
}
/* ----------------------------------------------------------------------
utility functions
------------------------------------------------------------------------- */
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<class TagStyle>
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
int team_size_max;
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
}
template<class DeviceType, typename real_type, int vector_length>
template<typename scratch_type>
int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
return ScratchViewType::shmem_size(values_per_team);
}
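// For illustration only, mirroring the ComputeUi launch in compute_array():
//   scratch_size_helper<complex>(team_size_compute_ui * tile_size)
// returns the per-team shared-memory bytes needed to cache that many `complex` values.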
/* ---------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
routines used by template reference classes
------------------------------------------------------------------------- */
template<class DeviceType>
ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
@@ -112,6 +801,12 @@ void ComputeSNAGridKokkosDevice<DeviceType>::init()
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
{
Base::compute_array();
}
#ifdef LMP_KOKKOS_GPU
template<class DeviceType>
ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
@@ -122,6 +817,12 @@ void ComputeSNAGridKokkosHost<DeviceType>::init()
{
Base::init();
}
template<class DeviceType>
void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
{
Base::compute_array();
}
#endif
}

View File

@@ -183,6 +183,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
return data;
}
/* ----------------------------------------------------------------------
create a 4d array with indices 2, 3, and 4 offset, but not the first
2nd index from n2lo to n2hi inclusive
3rd index from n3lo to n3hi inclusive
4th index from n4lo to n4hi inclusive
cannot grow it
------------------------------------------------------------------------- */
template <typename TYPE>
TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
const char *name)
{
//if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr;
printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");
int n2 = n2hi - n2lo + 1;
int n3 = n3hi - n3lo + 1;
int n4 = n4hi - n4lo + 1;
data = TYPE(std::string(name),n1,n2,n3,n4);
bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
array = (typename TYPE::value_type ****) smalloc(nbytes,name);
for (int i = 0; i < n1; i++) {
if (n2 == 0) {
array[i] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
for (int j = 0; j < n2; j++){
if (n3 == 0){
array[i][j] = nullptr;
} else {
nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
for (int k = 0; k < n3; k++){
if (n4 == 0)
array[i][j][k] = nullptr;
else
array[i][j][k] = &data.h_view(i,j,k,0);
}
}
}
}
}
return data;
}
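// For illustration only -- this mirrors the call in ComputeSNAGridKokkos::setup():
//   memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols,
//                                    nzlo, nzhi, nylo, nyhi, nxlo, nxhi, "grid:gridlocal");
// which allocates the DualView and builds a host pointer structure whose innermost
// rows alias data.h_view(i,j,k,0).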
template <typename TYPE, typename HTYPE>
TYPE create_kokkos(TYPE &data, HTYPE &h_data,
typename TYPE::value_type **&array, int n1, int n2,

View File

@@ -57,6 +57,8 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeGrid::~ComputeGrid()
{
printf("^^^ begin ComputeGrid destructor\n");
if (copymode) return;
deallocate();
}
@@ -111,7 +113,7 @@ void ComputeGrid::assign_coords_all()
void ComputeGrid::allocate()
{
// allocate arrays
printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n");
memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {

View File

@@ -31,14 +31,14 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
// skip over arguments used by base class
// so that argument positions are identical to
// regular per-atom compute
printf("^^^ inside compute sna grid constructor\n");
arg += nargbase;
narg -= nargbase;
// begin code common to all SNAP computes
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
//double rfac0, rmin0;
//int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int ntypes = atom->ntypes;
int nargmin = 6 + 2 * ntypes;
@@ -56,6 +56,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
wselfallflag = 0;
switchinnerflag = 0;
nelements = 1;
chunksize = 32768;
parallel_thresh = 8192;
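// Note for the Kokkos version: compute_sna_grid_kokkos caps the number of grid points
// handled per pass via chunk_size = MIN(chunksize, total_range), and parallel_thresh
// selects between the ComputeUiSmall and ComputeUiLarge kernels.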
// process required arguments
@@ -112,6 +114,7 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
iarg += 2;
} else if (strcmp(arg[iarg], "chem") == 0) {
printf("^^^ chem flag, creating map\n");
if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
chemflag = 1;
memory->create(map, ntypes + 1, "compute_sna_grid:map");
@@ -181,11 +184,17 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
ComputeSNAGrid::~ComputeSNAGrid()
{
memory->destroy(radelem);
memory->destroy(wjelem);
memory->destroy(cutsq);
delete snaptr;
if (copymode) return;
printf("^^^ begin ComputeSNAGrid destructor\n");
memory->destroy(radelem);
printf("^^^^ CSG 1\n");
memory->destroy(wjelem);
printf("^^^^ CSG 2\n");
memory->destroy(cutsq);
printf("^^^^ CSG 3\n");
delete snaptr;
printf("^^^^ CSG 4\n");
if (chemflag) memory->destroy(map);
}
@@ -196,12 +205,16 @@ void ComputeSNAGrid::init()
if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0))
error->warning(FLERR, "More than one instance of compute sna/grid");
snaptr->init();
printf("^^^ finished ComputeSNAGrid init()\n");
}
/* ---------------------------------------------------------------------- */
void ComputeSNAGrid::compute_array()
{
printf("^^^ inside ComputeSNAGrid compute_array()\n");
invoked_array = update->ntimestep;
// compute sna for each gridpoint
@@ -211,6 +224,8 @@ void ComputeSNAGrid::compute_array()
int *const type = atom->type;
const int ntotal = atom->nlocal + atom->nghost;
printf("^^^ ntotal: %d\n", ntotal);
// ensure rij, inside, and typej are of size jnum
snaptr->grow_rij(ntotal);

View File

@@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
void init() override;
void compute_array() override;
double memory_usage() override;
int ncoeff,nelements; // public for kokkos, but could go in the protected block now
private:
int ncoeff;
protected:
//int ncoeff;
double **cutsq;
double rcutfac;
double *radelem;
double *wjelem;
int *map; // map types to [0,nelements)
int nelements, chemflag;
int chemflag;
int switchinnerflag;
double *sinnerelem;
double *dinnerelem;
int parallel_thresh;
class SNA *snaptr;
double cutmax;
int quadraticflag;
double rfac0, rmin0;
int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
int chunksize;
};
} // namespace LAMMPS_NS