Merge pull request #1 from rohskopf/sna-grid-kokkos
compute sna/grid/kokkos
src/KOKKOS/compute_sna_grid_kokkos.cpp (new file, 81 lines added)

@@ -0,0 +1,81 @@
// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#include "compute_sna_grid_kokkos.h"
#include "compute_sna_grid_kokkos_impl.h"

namespace LAMMPS_NS {

template class ComputeSNAGridKokkosDevice<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkosHost<LMPHostType>;
#endif

}

// The following chunk would compile, but we use a wrapper approach like pair snap/kk instead.
/*
#include "compute_sna_grid_kokkos.h"

#include "atom_kokkos.h"
#include "atom_masks.h"
#include "comm.h"
#include "error.h"
#include "memory_kokkos.h"
#include "modify.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "neighbor_kokkos.h"
#include "sna_kokkos.h"
#include "update.h"

using namespace LAMMPS_NS;

// ----------------------------------------------------------------------

template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) :
  ComputeSNAGrid(lmp, narg, arg)
{
  printf("^^^ inside ComputeSNAGridKokkos constructor\n");
  kokkosable = 1;
  atomKK = (AtomKokkos *) atom;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
  datamask_read = EMPTY_MASK;
  datamask_modify = EMPTY_MASK;
}

// ----------------------------------------------------------------------

template<class DeviceType>
ComputeSNAGridKokkos<DeviceType>::~ComputeSNAGridKokkos()
{
  if (copymode) return;
}

namespace LAMMPS_NS {
template class ComputeSNAGridKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class ComputeSNAGridKokkos<LMPHostType>;
#endif
}
*/
src/KOKKOS/compute_sna_grid_kokkos.h (new file, 366 lines added)

@@ -0,0 +1,366 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS development team: developers@lammps.org
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef COMPUTE_CLASS
|
||||
// clang-format off
|
||||
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice<LMPDeviceType>);
|
||||
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice<LMPDeviceType>);
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost<LMPHostType>);
|
||||
#else
|
||||
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice<LMPHostType>);
|
||||
#endif
|
||||
// clang-format on
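// Usage note (a sketch, not part of this patch): with the registrations above an input
// script can request the Kokkos variant through the usual style-suffix rules, e.g.
//   compute bgrid all sna/grid/kk <grid and SNAP arguments as documented for compute sna/grid>
// The argument list is assumed to match the non-Kokkos compute sna/grid and may differ.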
|
||||
#else
|
||||
|
||||
// clang-format off
|
||||
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
|
||||
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
|
||||
|
||||
#include "compute_sna_grid.h"
|
||||
#include "kokkos_type.h"
|
||||
//#include "pair_snap.h"
|
||||
//#include "kokkos_type.h"
|
||||
//#include "neigh_list_kokkos.h"
|
||||
#include "sna_kokkos.h"
|
||||
//#include "pair_kokkos.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// Routines for both the CPU and GPU backend
|
||||
//template<int NEIGHFLAG, int EVFLAG>
|
||||
//struct TagPairSNAPComputeForce{};
|
||||
|
||||
|
||||
// GPU backend only
|
||||
/*
|
||||
struct TagPairSNAPComputeNeigh{};
|
||||
struct TagPairSNAPComputeCayleyKlein{};
|
||||
struct TagPairSNAPPreUi{};
|
||||
struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence
|
||||
struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence
|
||||
struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagPairSNAPComputeZi{};
|
||||
struct TagPairSNAPBeta{};
|
||||
struct TagPairSNAPComputeBi{};
|
||||
struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagPairSNAPComputeYi{};
|
||||
struct TagPairSNAPComputeYiWithZlist{};
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence
|
||||
template<int dir>
|
||||
struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence
|
||||
*/
|
||||
//struct TagPairSNAPPreUi{};
|
||||
struct TagCSNAGridComputeNeigh{};
|
||||
struct TagCSNAGridComputeCayleyKlein{};
|
||||
struct TagCSNAGridPreUi{};
|
||||
struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence
|
||||
struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence
|
||||
struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
|
||||
struct TagCSNAGridComputeZi{};
|
||||
struct TagCSNAGridComputeBi{};
|
||||
struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS
|
||||
struct TagCSNAGridLocalFill{}; // fill the gridlocal array
|
||||
//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce
|
||||
|
||||
struct TagComputeSNAGridLoop{};
|
||||
struct TagComputeSNAGrid3D{};
|
||||
//struct TagCSNAGridTeam{};
|
||||
|
||||
// CPU backend only
|
||||
/*
|
||||
struct TagPairSNAPComputeNeighCPU{};
|
||||
struct TagPairSNAPPreUiCPU{};
|
||||
struct TagPairSNAPComputeUiCPU{};
|
||||
struct TagPairSNAPTransformUiCPU{};
|
||||
struct TagPairSNAPComputeZiCPU{};
|
||||
struct TagPairSNAPBetaCPU{};
|
||||
struct TagPairSNAPComputeBiCPU{};
|
||||
struct TagPairSNAPZeroYiCPU{};
|
||||
struct TagPairSNAPComputeYiCPU{};
|
||||
struct TagPairSNAPComputeDuidrjCPU{};
|
||||
struct TagPairSNAPComputeDeidrjCPU{};
|
||||
*/
|
||||
struct TagComputeSNAGridLoopCPU{};
|
||||
|
||||
//template<class DeviceType>
|
||||
template<class DeviceType, typename real_type_, int vector_length_>
|
||||
class ComputeSNAGridKokkos : public ComputeSNAGrid {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
static constexpr int vector_length = vector_length_;
|
||||
using real_type = real_type_;
|
||||
using complex = SNAComplex<real_type>;
|
||||
|
||||
// Static team/tile sizes for device offload
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
static constexpr int team_size_compute_neigh = 2;
|
||||
static constexpr int tile_size_compute_ck = 2;
|
||||
static constexpr int tile_size_pre_ui = 2;
|
||||
static constexpr int team_size_compute_ui = 2;
|
||||
static constexpr int tile_size_transform_ui = 2;
|
||||
static constexpr int tile_size_compute_zi = 2;
|
||||
static constexpr int tile_size_compute_bi = 2;
|
||||
static constexpr int tile_size_transform_bi = 2;
|
||||
static constexpr int tile_size_compute_yi = 2;
|
||||
static constexpr int team_size_compute_fused_deidrj = 2;
|
||||
#else
|
||||
static constexpr int team_size_compute_neigh = 4;
|
||||
static constexpr int tile_size_compute_ck = 4;
|
||||
static constexpr int tile_size_pre_ui = 4;
|
||||
static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
|
||||
static constexpr int tile_size_transform_ui = 4;
|
||||
static constexpr int tile_size_compute_zi = 8;
|
||||
static constexpr int tile_size_compute_bi = 4;
|
||||
static constexpr int tile_size_transform_bi = 4;
|
||||
static constexpr int tile_size_compute_yi = 8;
|
||||
static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
|
||||
#endif
|
||||
|
||||
// Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches
|
||||
// This hides the Kokkos::IndexType<int> and Kokkos::Rank<3...>
|
||||
// and reduces the verbosity of the LaunchBounds by hiding the explicit
|
||||
// multiplication by vector_length
|
||||
template <class Device, int num_tiles, class TagComputeSNAP>
|
||||
using Snap3DRangePolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds<vector_length * num_tiles>, TagComputeSNAP>;
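// Example launch with this alias (mirrors the kernel launches in compute_sna_grid_kokkos_impl.h):
//   Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
//     policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
//   Kokkos::parallel_for("PreUi", policy_preui, *this);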
|
||||
|
||||
// MDRangePolicy for the 3D grid loop:
|
||||
template <class Device, class TagComputeSNAP>
|
||||
using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy<Device, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>;
|
||||
|
||||
// Testing out team policies
|
||||
template <class Device, int num_teams, class TagComputeSNAP>
|
||||
using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
|
||||
//using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::IndexType<int>, Kokkos::IndexType<int>, Kokkos::IndexType<int>, TagComputeSNAP>;
|
||||
//using team_member = typename team_policy::member_type;
|
||||
|
||||
// Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches
|
||||
// This hides the LaunchBounds abstraction by hiding the explicit
|
||||
// multiplication by vector length
|
||||
template <class Device, int num_teams, class TagComputeSNAP>
|
||||
using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy<Device, Kokkos::LaunchBounds<vector_length * num_teams>, TagComputeSNAP>;
|
||||
|
||||
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
|
||||
~ComputeSNAGridKokkos() override;
|
||||
|
||||
void init() override;
|
||||
void setup() override;
|
||||
void compute_array() override;
|
||||
|
||||
// Utility functions for teams
|
||||
|
||||
template<class TagStyle>
|
||||
void check_team_size_for(int, int&);
|
||||
|
||||
template<class TagStyle>
|
||||
void check_team_size_reduce(int, int&);
|
||||
|
||||
// operator function for example team policy
|
||||
//KOKKOS_INLINE_FUNCTION
|
||||
//void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridTeam>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagComputeSNAGridLoop, const int& ) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagComputeSNAGridLoopCPU, const int&) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeNeigh>::member_type& team) const;
|
||||
|
||||
// PrintNeigh
|
||||
//void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPrintNeigh>::member_type& team) const;
|
||||
|
||||
// 3D case - used by parallel_for
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiSmall>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType, TagCSNAGridComputeUiLarge>::member_type& team) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (TagCSNAGridLocalFill,const int& ii) const;
|
||||
|
||||
protected:
|
||||
|
||||
SNAKokkos<DeviceType, real_type, vector_length> snaKK;
|
||||
|
||||
int max_neighs, chunk_size, chunk_offset;
|
||||
int host_flag;
|
||||
int ntotal;
|
||||
int total_range; // total number of loop iterations in grid
|
||||
int zlen; //= nzhi-nzlo+1;
|
||||
int ylen; //= nyhi-nylo+1;
|
||||
int xlen; //= nxhi-nxlo+1;
|
||||
|
||||
double cutsq_tmp; // temporary cutsq until we get a view
|
||||
|
||||
Kokkos::View<real_type*, DeviceType> d_radelem; // element radii
|
||||
Kokkos::View<real_type*, DeviceType> d_wjelem; // elements weights
|
||||
//Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem; // element bispectrum coefficients
|
||||
Kokkos::View<real_type*, DeviceType> d_sinnerelem; // element inner cutoff midpoint
|
||||
Kokkos::View<real_type*, DeviceType> d_dinnerelem; // element inner cutoff half-width
|
||||
Kokkos::View<T_INT*, DeviceType> d_ninside; // ninside for all atoms in list
|
||||
Kokkos::View<T_INT*, DeviceType> d_map; // mapping from atom types to elements
|
||||
Kokkos::View<real_type*, DeviceType> d_test; // test view
|
||||
|
||||
typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
|
||||
tdual_fparams k_cutsq;
|
||||
typedef Kokkos::View<const F_FLOAT**, DeviceType,
|
||||
Kokkos::MemoryTraits<Kokkos::RandomAccess> > t_fparams_rnd;
|
||||
t_fparams_rnd rnd_cutsq;
|
||||
|
||||
typename AT::t_x_array_randomread x;
|
||||
typename AT::t_int_1d_randomread type;
|
||||
DAT::tdual_float_2d k_grid;
|
||||
DAT::tdual_float_2d k_gridall;
|
||||
typename AT::t_float_2d d_grid;
|
||||
typename AT::t_float_2d d_gridall;
|
||||
|
||||
//DAT::tdual_float_4d k_gridlocal;
|
||||
//typedef Kokkos::DualView<real_type****, Kokkos::LayoutLeft, DeviceType> t_gridlocal_4d;
|
||||
//typedef Kokkos::View<real_type****, DeviceType> t_4d;
|
||||
// should we use LMPDeviceType below?
|
||||
//typedef Kokkos::DualView<LMP_FLOAT****, LMPDeviceType> tdual_float_4d;
|
||||
//typedef tdual_float_4d::t_dev tdev_float_4d;
|
||||
//tdual_float_4d k_gridlocal;
|
||||
//tdev_float_4d d_gridlocal;
|
||||
DAT::tdual_float_4d k_gridlocal;
|
||||
typename AT::t_float_4d d_gridlocal;
|
||||
|
||||
|
||||
// Utility routine which wraps computing per-team scratch size requirements for
|
||||
// ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
|
||||
template <typename scratch_type>
|
||||
int scratch_size_helper(int values_per_team);
|
||||
|
||||
class DomainKokkos *domainKK;
|
||||
|
||||
// triclinic vars
|
||||
/*
|
||||
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
|
||||
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
|
||||
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
|
||||
*/
|
||||
double h0, h1, h2, h3, h4, h5;
|
||||
double lo0, lo1, lo2;
|
||||
|
||||
};
|
||||
|
||||
// These wrapper classes exist so the compute style factory does not have to be
// extended to support Compute classes with an arbitrary number of extra
// template parameters
|
||||
|
||||
template <class DeviceType>
|
||||
class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
|
||||
|
||||
private:
|
||||
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
|
||||
|
||||
public:
|
||||
|
||||
ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **);
|
||||
|
||||
void init() override;
|
||||
void compute_array() override;
|
||||
|
||||
};
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template <class DeviceType>
|
||||
class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
|
||||
|
||||
private:
|
||||
using Base = ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
|
||||
|
||||
public:
|
||||
|
||||
ComputeSNAGridKokkosHost(class LAMMPS *, int, char **);
|
||||
|
||||
void init() override;
|
||||
void compute_array() override;
|
||||
|
||||
};
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// The following would compile together with the commented-out chunk in the cpp file, but we use the wrapper approach from pair snap/kk instead.
|
||||
/*
|
||||
#ifdef COMPUTE_CLASS
|
||||
// clang-format off
|
||||
ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos<LMPDeviceType>);
|
||||
ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos<LMPDeviceType>);
|
||||
ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos<LMPHostType>);
|
||||
// clang-format on
|
||||
#else
|
||||
|
||||
// clang-format off
|
||||
#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H
|
||||
#define LMP_COMPUTE_SNA_GRID_KOKKOS_H
|
||||
|
||||
#include "compute_sna_grid.h"
|
||||
#include "kokkos_type.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
//template<int CSTYLE, int NCOL>
|
||||
//struct TagComputeCoordAtom{};
|
||||
|
||||
template<class DeviceType>
|
||||
class ComputeSNAGridKokkos : public ComputeSNAGrid {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
ComputeSNAGridKokkos(class LAMMPS *, int, char **);
|
||||
~ComputeSNAGridKokkos() override;
|
||||
|
||||
private:
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
src/KOKKOS/compute_sna_grid_kokkos_impl.h (new file, 932 lines added)

@@ -0,0 +1,932 @@
|
||||
// clang-format off
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
https://www.lammps.org/, Sandia National Laboratories
|
||||
LAMMPS development team: developers@lammps.org
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Christian Trott (SNL), Stan Moore (SNL),
|
||||
Evan Weinberg (NVIDIA)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "compute_sna_grid_kokkos.h"
|
||||
#include "pair_snap_kokkos.h"
|
||||
|
||||
#include "atom_kokkos.h"
|
||||
#include "atom_masks.h"
|
||||
#include "comm.h"
|
||||
#include "error.h"
|
||||
#include "memory_kokkos.h"
|
||||
#include "modify.h"
|
||||
#include "neigh_list.h"
|
||||
#include "neigh_request.h"
|
||||
#include "neighbor_kokkos.h"
|
||||
//#include "sna_kokkos.h"
|
||||
#include "domain.h"
|
||||
#include "domain_kokkos.h"
|
||||
#include "sna.h"
|
||||
#include "update.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#define MAXLINE 1024
|
||||
#define MAXWORD 3
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// Constructor
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg)
|
||||
{
|
||||
kokkosable = 1;
|
||||
atomKK = (AtomKokkos *) atom;
|
||||
domainKK = (DomainKokkos *) domain;
|
||||
execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
|
||||
datamask_read = EMPTY_MASK;
|
||||
datamask_modify = EMPTY_MASK;
|
||||
|
||||
k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1);
|
||||
auto d_cutsq = k_cutsq.template view<DeviceType>();
|
||||
rnd_cutsq = d_cutsq;
|
||||
|
||||
host_flag = (execution_space == Host);
|
||||
|
||||
// TODO: Extract cutsq in double loop below, no need for cutsq_tmp
|
||||
|
||||
cutsq_tmp = cutsq[1][1];
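// NOTE: only cutsq[1][1] is used for every type pair below; the TODO above is to
// copy the full per-pair cutsq from the base class instead of this single value.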
|
||||
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
  for (int j = 1; j <= atom->ntypes; j++) {
    k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp;
  }
}
// mark the host view modified once, after the fill
k_cutsq.template modify<LMPHostType>();
|
||||
|
||||
// Set up element lists
|
||||
MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements);
|
||||
MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements);
|
||||
MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements);
|
||||
MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements);
|
||||
// test
|
||||
MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements);
|
||||
|
||||
int n = atom->ntypes;
|
||||
MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1);
|
||||
|
||||
auto h_radelem = Kokkos::create_mirror_view(d_radelem);
|
||||
auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
|
||||
auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem);
|
||||
auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem);
|
||||
auto h_map = Kokkos::create_mirror_view(d_map);
|
||||
// test
|
||||
auto h_test = Kokkos::create_mirror_view(d_test);
|
||||
h_test(0) = 2.0;
|
||||
|
||||
// radelem, wjelem, etc. are indexed from 1 by atom type in compute sna/grid,
// so shift down by one to fill the 0-indexed device views
for (int i = 1; i <= atom->ntypes; i++) {
  h_radelem(i-1) = radelem[i];
  h_wjelem(i-1) = wjelem[i];
  if (switchinnerflag) {
    h_sinnerelem(i-1) = sinnerelem[i];
    h_dinnerelem(i-1) = dinnerelem[i];
  }
}
|
||||
|
||||
// In pair snap some things like `map` get allocated regardless of chem flag.
|
||||
if (chemflag){
|
||||
for (int i = 1; i <= atom->ntypes; i++) {
|
||||
h_map(i) = map[i];
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::deep_copy(d_radelem,h_radelem);
|
||||
Kokkos::deep_copy(d_wjelem,h_wjelem);
|
||||
if (switchinnerflag){
|
||||
Kokkos::deep_copy(d_sinnerelem,h_sinnerelem);
|
||||
Kokkos::deep_copy(d_dinnerelem,h_dinnerelem);
|
||||
}
|
||||
if (chemflag){
|
||||
Kokkos::deep_copy(d_map,h_map);
|
||||
}
|
||||
Kokkos::deep_copy(d_test,h_test);
|
||||
|
||||
double bytes = MemKK::memory_usage(d_wjelem);
|
||||
|
||||
snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
|
||||
rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag);
|
||||
snaKK.grow_rij(0,0);
|
||||
snaKK.init();
|
||||
|
||||
}
|
||||
|
||||
// Destructor
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::~ComputeSNAGridKokkos()
|
||||
{
|
||||
if (copymode) return;
|
||||
|
||||
memoryKK->destroy_kokkos(k_cutsq,cutsq);
|
||||
//memoryKK->destroy_kokkos(k_grid,grid);
|
||||
memoryKK->destroy_kokkos(k_gridall, gridall);
|
||||
//memoryKK->destroy_kokkos(k_gridlocal, gridlocal);
|
||||
}
|
||||
|
||||
// Init
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::init()
|
||||
{
|
||||
if (host_flag) {
|
||||
return;
|
||||
}
|
||||
ComputeSNAGrid::init();
|
||||
|
||||
}
|
||||
|
||||
// Setup
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::setup()
|
||||
{
|
||||
|
||||
// Do not call ComputeGrid::setup(); we don't want to allocate the grid array there.
// Instead, call ComputeGrid::set_grid_global() and set_grid_local() to set the grid index bounds.
|
||||
|
||||
ComputeGrid::set_grid_global();
|
||||
ComputeGrid::set_grid_local();
|
||||
|
||||
// allocate arrays
|
||||
//printf(">>> Allocating gridall.\n");
|
||||
//printf(">>> %d %d\n", size_array_rows, size_array_cols);
|
||||
//memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid");
|
||||
memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall");
|
||||
//printf(">>> Allocated gridall.\n");
|
||||
|
||||
// do not use or allocate gridlocal for now
|
||||
|
||||
gridlocal_allocated = 0;
|
||||
/*
|
||||
if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
|
||||
gridlocal_allocated = 1;
|
||||
memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo,
|
||||
nyhi, nxlo, nxhi, "grid:gridlocal");
|
||||
}
|
||||
*/
|
||||
array = gridall;
|
||||
|
||||
d_gridlocal = k_gridlocal.template view<DeviceType>();
|
||||
//d_grid = k_grid.template view<DeviceType>();
|
||||
d_gridall = k_gridall.template view<DeviceType>();
|
||||
}
|
||||
|
||||
// Compute
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::compute_array()
|
||||
{
|
||||
if (host_flag) {
|
||||
return;
|
||||
}
|
||||
|
||||
copymode = 1;
|
||||
|
||||
zlen = nzhi-nzlo+1;
|
||||
ylen = nyhi-nylo+1;
|
||||
xlen = nxhi-nxlo+1;
|
||||
total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1);
|
||||
|
||||
atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
|
||||
x = atomKK->k_x.view<DeviceType>();
|
||||
type = atomKK->k_type.view<DeviceType>();
|
||||
k_cutsq.template sync<DeviceType>();
|
||||
|
||||
// max_neighs is hard-coded here for now; TODO: derive it from the actual neighbor counts.
|
||||
max_neighs = 100;
|
||||
|
||||
// Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total
|
||||
// number of atoms.
|
||||
|
||||
ntotal = atomKK->nlocal + atomKK->nghost;
|
||||
// Allocate view for number of neighbors per grid point
|
||||
MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range);
|
||||
|
||||
// "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user
|
||||
// `total_range` is the number of grid points which may be larger than chunk size.
|
||||
//printf(">>> total_range: %d\n", total_range);
|
||||
chunk_size = MIN(chunksize, total_range);
|
||||
chunk_offset = 0;
|
||||
//snaKK.grow_rij(chunk_size, ntotal);
|
||||
snaKK.grow_rij(chunk_size, max_neighs);
|
||||
|
||||
//chunk_size = total_range;
|
||||
|
||||
// Pre-compute ceil(chunk_size / vector_length) for code cleanliness
|
||||
const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
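// e.g. chunk_size = 100 and vector_length = 32 gives chunk_size_div = ceil(100/32) = 4;
// the AoSoA kernels then address a grid point as ii = iatom_mod + vector_length * iatom_div.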
|
||||
|
||||
if (triclinic){
|
||||
/*
|
||||
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
|
||||
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
|
||||
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
|
||||
*/
|
||||
h0 = domain->h[0];
|
||||
h1 = domain->h[1];
|
||||
h2 = domain->h[2];
|
||||
h3 = domain->h[3];
|
||||
h4 = domain->h[4];
|
||||
h5 = domain->h[5];
|
||||
lo0 = domain->boxlo[0];
|
||||
lo1 = domain->boxlo[1];
|
||||
lo2 = domain->boxlo[2];
|
||||
}
|
||||
|
||||
while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory
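// Each pass handles at most chunk_size grid points; within a chunk the kernels below
// use the local index ii in [0, chunk_size), and ii + chunk_offset recovers the
// global grid-point index (see ComputeNeigh and LocalFill).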
|
||||
|
||||
if (chunk_size > total_range - chunk_offset)
|
||||
chunk_size = total_range - chunk_offset;
|
||||
|
||||
//printf(">>> chunk_offset: %d\n", chunk_offset);
|
||||
|
||||
//ComputeNeigh
|
||||
{
|
||||
int scratch_size = scratch_size_helper<int>(team_size_compute_neigh * max_neighs); //ntotal);
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_neigh, TagCSNAGridComputeNeigh>
|
||||
policy_neigh(chunk_size, team_size_compute_neigh, vector_length);
|
||||
policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
|
||||
}
|
||||
|
||||
//ComputeCayleyKlein
|
||||
{
|
||||
// tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h`
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_ck, TagCSNAGridComputeCayleyKlein>
|
||||
policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1});
|
||||
Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this);
|
||||
}
|
||||
|
||||
//PreUi
|
||||
{
|
||||
// tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h`
|
||||
Snap3DRangePolicy<DeviceType, tile_size_pre_ui, TagCSNAGridPreUi>
|
||||
policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1});
|
||||
Kokkos::parallel_for("PreUi",policy_preui,*this);
|
||||
}
|
||||
|
||||
// ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot
|
||||
{
|
||||
// team_size_compute_ui is defined in `compute_sna_grid_kokkos.h`
|
||||
// scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
|
||||
const int tile_size = vector_length * (twojmax + 1);
|
||||
const int scratch_size = scratch_size_helper<complex>(team_size_compute_ui * tile_size);
|
||||
|
||||
if (chunk_size < parallel_thresh)
|
||||
{
|
||||
// Version with parallelism over j_bend
|
||||
|
||||
// total number of teams needed: (chunk_size / vector_length) * (max_neighs) * ("bend" locations)
|
||||
const int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
|
||||
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiSmall>
|
||||
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this);
|
||||
} else {
|
||||
// Version w/out parallelism over j_bend
|
||||
|
||||
// total number of teams needed: (chunk_size / vector_length) * (max_neighs)
|
||||
const int n_teams = chunk_size_div * max_neighs;
|
||||
const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui;
|
||||
|
||||
SnapAoSoATeamPolicy<DeviceType, team_size_compute_ui, TagCSNAGridComputeUiLarge>
|
||||
policy_ui(n_teams_div, team_size_compute_ui, vector_length);
|
||||
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
||||
Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this);
|
||||
}
|
||||
}
|
||||
|
||||
//TransformUi: un-"fold" ulisttot, zero ylist
|
||||
{
|
||||
// tile_size_transform_ui is defined in `compute_sna_grid_kokkos.h`
|
||||
Snap3DRangePolicy<DeviceType, tile_size_transform_ui, TagCSNAGridTransformUi>
|
||||
policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1});
|
||||
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
|
||||
}
|
||||
|
||||
//Compute bispectrum in AoSoA data layout, transform Bi
|
||||
|
||||
//ComputeZi
|
||||
const int idxz_max = snaKK.idxz_max;
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_zi, TagCSNAGridComputeZi>
|
||||
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1});
|
||||
Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
|
||||
|
||||
//ComputeBi
|
||||
const int idxb_max = snaKK.idxb_max;
|
||||
Snap3DRangePolicy<DeviceType, tile_size_compute_bi, TagCSNAGridComputeBi>
|
||||
policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1});
|
||||
Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
|
||||
|
||||
//Transform data layout of blist out of AoSoA
|
||||
//We need this because `blist` gets used in ComputeForce which doesn't
|
||||
//take advantage of AoSoA, which at best would only be beneficial on the margins
|
||||
//NOTE: Do we need this in compute sna/grid/kk?
|
||||
Snap3DRangePolicy<DeviceType, tile_size_transform_bi, TagCSNAGridTransformBi>
|
||||
policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1});
|
||||
Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
|
||||
|
||||
// Fill the grid array with bispectrum values
|
||||
{
|
||||
typename Kokkos::RangePolicy<DeviceType,TagCSNAGridLocalFill> policy_fill(0,chunk_size);
|
||||
Kokkos::parallel_for(policy_fill, *this);
|
||||
}
|
||||
|
||||
// Proceed to the next chunk.
|
||||
chunk_offset += chunk_size;
|
||||
|
||||
} // end while
|
||||
|
||||
k_gridlocal.template modify<DeviceType>();
|
||||
k_gridlocal.template sync<LMPHostType>();
|
||||
|
||||
//k_grid.template modify<DeviceType>();
|
||||
//k_grid.template sync<LMPHostType>();
|
||||
|
||||
k_gridall.template modify<DeviceType>();
|
||||
k_gridall.template sync<LMPHostType>();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Begin routines that are unique to the GPU codepath. These take advantage
|
||||
of AoSoA data layouts and scratch memory for recursive polynomials
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
Simple team policy functor seeing how many layers deep we can go with the parallelism.
|
||||
*/
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeNeigh>::member_type& team) const {
|
||||
|
||||
// This function follows similar procedure as ComputeNeigh of PairSNAPKokkos.
|
||||
// Main difference is that we don't use the neighbor class or neighbor variables here.
|
||||
// This is because the grid points are not atoms and therefore do not get assigned
|
||||
// neighbors in LAMMPS.
|
||||
// TODO: If we did make a neighborlist for each grid point, we could use current
|
||||
// routines and avoid having to loop over all atoms (which limits us to
|
||||
// natoms = max team size).
|
||||
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// basic quantities associated with this team:
|
||||
// team_rank : rank of thread in this team
|
||||
// league_rank : rank of team in this league
|
||||
// team_size : number of threads in this team
|
||||
|
||||
// extract loop index
|
||||
int ii = team.team_rank() + team.league_rank() * team.team_size();
|
||||
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// extract grid index
|
||||
int igrid = ii + chunk_offset;
|
||||
|
||||
// get a pointer to scratch memory
|
||||
// This is used to cache whether or not an atom is within the cutoff.
|
||||
// If it is, type_cache is assigned to the atom type.
|
||||
// If it's not, it's assigned to -1.
|
||||
const int tile_size = ntotal; // number of elements per thread; NOTE: scratch in compute_array() is sized with max_neighs, and type_cache is only used by the commented-out code below
|
||||
const int team_rank = team.team_rank();
|
||||
const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
|
||||
int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
|
||||
|
||||
// convert to grid indices
|
||||
|
||||
int iz = igrid/(xlen*ylen);
|
||||
int i2 = igrid - (iz*xlen*ylen);
|
||||
int iy = i2/xlen;
|
||||
int ix = i2 % xlen;
|
||||
iz += nzlo;
|
||||
iy += nylo;
|
||||
ix += nxlo;
|
||||
|
||||
double xgrid[3];
|
||||
|
||||
// index ii already captures the proper grid point
|
||||
//int igrid = iz * (nx * ny) + iy * nx + ix;
|
||||
//printf("%d %d\n", ii, igrid);
|
||||
|
||||
// grid2x converts igrid to ix,iy,iz like we've done before
|
||||
// multiply grid integers by grid spacing delx, dely, delz
|
||||
//grid2x(igrid, xgrid);
|
||||
xgrid[0] = ix * delx;
|
||||
xgrid[1] = iy * dely;
|
||||
xgrid[2] = iz * delz;
|
||||
|
||||
if (triclinic) {
|
||||
|
||||
// Do a conversion on `xgrid` here like we do in the CPU version.
|
||||
|
||||
// Can't do this:
|
||||
// domainKK->lamda2x(xgrid, xgrid);
|
||||
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
|
||||
|
||||
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
|
||||
/*
|
||||
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
|
||||
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
|
||||
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
|
||||
*/
|
||||
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
|
||||
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
|
||||
xgrid[2] = h2*xgrid[2] + lo2;
|
||||
}
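// h0..h5 and lo0..lo2 are the box matrix entries and box origin cached from Domain in
// compute_array(); the three lines above apply the same lamda->x style conversion that
// the CPU compute sna/grid performs for triclinic boxes (see the lamda2x note above).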
|
||||
|
||||
const F_FLOAT xtmp = xgrid[0];
|
||||
const F_FLOAT ytmp = xgrid[1];
|
||||
const F_FLOAT ztmp = xgrid[2];
|
||||
|
||||
// currently, all grid points are type 1
|
||||
// not clear what a better choice would be
|
||||
|
||||
const int itype = 1;
|
||||
int ielem = 0;
|
||||
if (chemflag) ielem = d_map[itype];
|
||||
const double radi = d_radelem[ielem];
|
||||
|
||||
// We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now.
|
||||
// The purpose here is to transform for triclinic boxes.
|
||||
/*
|
||||
if (triclinic){
|
||||
printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp);
|
||||
}
|
||||
*/
|
||||
|
||||
// Compute the number of neighbors, store rsq
|
||||
int ninside = 0;
|
||||
|
||||
// Looping over ntotal for now.
|
||||
for (int j = 0; j < ntotal; j++){
|
||||
const F_FLOAT dx = x(j,0) - xtmp;
|
||||
const F_FLOAT dy = x(j,1) - ytmp;
|
||||
const F_FLOAT dz = x(j,2) - ztmp;
|
||||
int jtype = type(j);
|
||||
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
|
||||
|
||||
// don't include atoms that share location with grid point
|
||||
if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
|
||||
jtype = -1; // use -1 to signal it's outside the radius
|
||||
}
|
||||
|
||||
if (jtype >= 0)
|
||||
ninside++;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal),
|
||||
[&] (const int j, int& count) {
|
||||
const F_FLOAT dx = x(j,0) - xtmp;
|
||||
const F_FLOAT dy = x(j,1) - ytmp;
|
||||
const F_FLOAT dz = x(j,2) - ztmp;
|
||||
|
||||
int jtype = type(j);
|
||||
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
|
||||
|
||||
// don't include atoms that share location with grid point
|
||||
if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) {
|
||||
jtype = -1; // use -1 to signal it's outside the radius
|
||||
}
|
||||
|
||||
type_cache[j] = jtype;
|
||||
|
||||
if (jtype >= 0)
|
||||
count++;
|
||||
|
||||
}, ninside);
|
||||
*/
|
||||
|
||||
d_ninside(ii) = ninside;
|
||||
|
||||
// TODO: Adjust for multi-element; jelem stays 0 unless chemflag is set, in which case jelem = d_map[jtype] below.
|
||||
int offset = 0;
|
||||
for (int j = 0; j < ntotal; j++){
|
||||
//const int jtype = type_cache[j];
|
||||
//if (jtype >= 0) {
|
||||
const F_FLOAT dx = x(j,0) - xtmp;
|
||||
const F_FLOAT dy = x(j,1) - ytmp;
|
||||
const F_FLOAT dz = x(j,2) - ztmp;
|
||||
const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
|
||||
int jtype = type(j);
|
||||
if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) {
|
||||
int jelem = 0;
|
||||
if (chemflag) jelem = d_map[jtype];
|
||||
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
|
||||
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
|
||||
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
|
||||
// pair snap uses jelem here while compute_sna_grid.cpp uses jtype;
// since the views here are 0-indexed we follow pair snap and use jelem
|
||||
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
|
||||
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
|
||||
my_sna.inside(ii,offset) = j;
|
||||
if (switchinnerflag) {
|
||||
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
|
||||
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
|
||||
}
|
||||
if (chemflag)
|
||||
my_sna.element(ii,offset) = jelem;
|
||||
else
|
||||
my_sna.element(ii,offset) = 0;
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
int offset = 0;
|
||||
for (int j = 0; j < ntotal; j++){
|
||||
const int jtype = type_cache[j];
|
||||
if (jtype >= 0) {
|
||||
printf(">>> offset: %d\n", offset);
|
||||
const F_FLOAT dx = x(j,0) - xtmp;
|
||||
const F_FLOAT dy = x(j,1) - ytmp;
|
||||
const F_FLOAT dz = x(j,2) - ztmp;
|
||||
int jtype = type(j);
|
||||
int jelem = 0;
|
||||
if (chemflag) jelem = d_map[jtype];
|
||||
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
|
||||
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
|
||||
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
|
||||
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
|
||||
// actually since the views here have values starting at 0, let's use jelem
|
||||
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
|
||||
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
|
||||
my_sna.inside(ii,offset) = j;
|
||||
if (switchinnerflag) {
|
||||
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
|
||||
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
|
||||
}
|
||||
if (chemflag)
|
||||
my_sna.element(ii,offset) = jelem;
|
||||
else
|
||||
my_sna.element(ii,offset) = 0;
|
||||
offset++;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal),
|
||||
[&] (const int j, int& offset, bool final) {
|
||||
|
||||
const int jtype = type_cache[j];
|
||||
|
||||
if (jtype >= 0) {
|
||||
if (final) {
|
||||
const F_FLOAT dx = x(j,0) - xtmp;
|
||||
const F_FLOAT dy = x(j,1) - ytmp;
|
||||
const F_FLOAT dz = x(j,2) - ztmp;
|
||||
int jtype = type(j);
|
||||
int jelem = 0;
|
||||
if (chemflag) jelem = d_map[jtype];
|
||||
my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
|
||||
my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
|
||||
my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
|
||||
// pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp
|
||||
// actually since the views here have values starting at 0, let's use jelem
|
||||
my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[jelem]);
|
||||
my_sna.rcutij(ii,offset) = static_cast<real_type>((2.0 * d_radelem[jelem])*rcutfac);
|
||||
my_sna.inside(ii,offset) = j;
|
||||
if (switchinnerflag) {
|
||||
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]);
|
||||
my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]);
|
||||
}
|
||||
if (chemflag)
|
||||
my_sna.element(ii,offset) = jelem;
|
||||
else
|
||||
my_sna.element(ii,offset) = 0;
|
||||
}
|
||||
offset++;
|
||||
}
|
||||
});
|
||||
*/
|
||||
}
|
||||
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int ii = iatom_mod + iatom_div * vector_length;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jnbor >= ninside) return;
|
||||
|
||||
my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int ii = iatom_mod + iatom_div * vector_length;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
// grid points have no atom type; force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp`
int ielem = 0;
|
||||
|
||||
my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiSmall>::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend_location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug
|
||||
const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
|
||||
const int jbend = jj_jbend / max_neighs;
|
||||
int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy<DeviceType,TagCSNAGridComputeUiLarge>::member_type& team) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
// extract flattened atom_div / neighbor number / bend location
|
||||
int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
|
||||
|
||||
// extract neighbor index, iatom_div
|
||||
int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug
|
||||
int jj = flattened_idx - iatom_div * max_neighs;
|
||||
|
||||
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
|
||||
[&] (const int iatom_mod) {
|
||||
const int ii = iatom_mod + vector_length * iatom_div;
|
||||
if (ii >= chunk_size) return;
|
||||
|
||||
const int ninside = d_ninside(ii);
|
||||
if (jj >= ninside) return;
|
||||
|
||||
my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div);
|
||||
});
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxu >= my_sna.idxu_max) return;
|
||||
|
||||
int elem_count = chemflag ? nelements : 1;
|
||||
|
||||
for (int ielem = 0; ielem < elem_count; ielem++){
|
||||
|
||||
const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
|
||||
|
||||
auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
|
||||
auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
|
||||
|
||||
if (mapper.flip_sign == 1){
|
||||
utot_im = -utot_im;
|
||||
} else if (mapper.flip_sign == -1){
|
||||
utot_re = -utot_re;
|
||||
}
|
||||
|
||||
my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
|
||||
|
||||
if (mapper.flip_sign == 0) {
|
||||
my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
|
||||
my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjz >= my_sna.idxz_max) return;
|
||||
|
||||
my_sna.compute_zi(iatom_mod,jjz,iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (jjb >= my_sna.idxb_max) return;
|
||||
|
||||
my_sna.compute_bi(iatom_mod,jjb,iatom_div);
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
const int iatom = iatom_mod + iatom_div * vector_length;
|
||||
if (iatom >= chunk_size) return;
|
||||
|
||||
if (idxb >= my_sna.idxb_max) return;
|
||||
|
||||
const int ntriples = my_sna.ntriples;
|
||||
|
||||
for (int itriple = 0; itriple < ntriples; itriple++) {
|
||||
|
||||
const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
|
||||
|
||||
my_sna.blist(iatom, itriple, idxb) = blocal;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::operator() (TagCSNAGridLocalFill, const int& ii) const {
|
||||
SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
|
||||
|
||||
|
||||
// extract grid index
|
||||
int igrid = ii + chunk_offset;
|
||||
|
||||
// convert to grid indices
|
||||
|
||||
int iz = igrid/(xlen*ylen);
|
||||
int i2 = igrid - (iz*xlen*ylen);
|
||||
int iy = i2/xlen;
|
||||
int ix = i2 % xlen;
|
||||
iz += nzlo;
|
||||
iy += nylo;
|
||||
ix += nxlo;
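// same igrid -> (ix,iy,iz) decomposition as in ComputeNeigh above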
|
||||
|
||||
double xgrid[3];
|
||||
|
||||
// index ii already captures the proper grid point
|
||||
// int igrid = iz * (nx * ny) + iy * nx + ix;
|
||||
// printf("ii igrid: %d %d\n", ii, igrid);
|
||||
|
||||
// grid2x converts igrid to ix,iy,iz like we've done before
|
||||
//grid2x(igrid, xgrid);
|
||||
xgrid[0] = ix * delx;
|
||||
xgrid[1] = iy * dely;
|
||||
xgrid[2] = iz * delz;
|
||||
if (triclinic) {
|
||||
|
||||
// Do a conversion on `xgrid` here like we do in the CPU version.
|
||||
|
||||
// Can't do this:
|
||||
// domainKK->lamda2x(xgrid, xgrid);
|
||||
// Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed
|
||||
|
||||
// Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats.
|
||||
/*
|
||||
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0];
|
||||
xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1];
|
||||
xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2];
|
||||
*/
|
||||
xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0;
|
||||
xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1;
|
||||
xgrid[2] = h2*xgrid[2] + lo2;
|
||||
}
|
||||
|
||||
const F_FLOAT xtmp = xgrid[0];
|
||||
const F_FLOAT ytmp = xgrid[1];
|
||||
const F_FLOAT ztmp = xgrid[2];
|
||||
d_gridall(igrid,0) = xtmp;
|
||||
d_gridall(igrid,1) = ytmp;
|
||||
d_gridall(igrid,2) = ztmp;
|
||||
|
||||
const auto idxb_max = snaKK.idxb_max;
|
||||
|
||||
// linear contributions
|
||||
|
||||
for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
|
||||
const auto idxb = icoeff % idxb_max;
|
||||
const auto idx_chem = icoeff / idxb_max;
|
||||
d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb);
|
||||
}
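// e.g. with idxb_max = 5, icoeff = 7 maps to idx_chem = 1, idxb = 2; columns 3 and up of
// gridall therefore hold the ncoeff bispectrum components for this grid point, after the
// x,y,z coordinates stored in columns 0-2.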
|
||||
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
utility functions
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<class TagStyle>
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
|
||||
int team_size_max;
|
||||
|
||||
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
|
||||
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<class TagStyle>
|
||||
void ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
|
||||
int team_size_max;
|
||||
|
||||
team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
|
||||
|
||||
if (team_size*vector_length > team_size_max)
|
||||
team_size = team_size_max/vector_length;
|
||||
}
|
||||
|
||||
template<class DeviceType, typename real_type, int vector_length>
|
||||
template<typename scratch_type>
|
||||
int ComputeSNAGridKokkos<DeviceType, real_type, vector_length>::scratch_size_helper(int values_per_team) {
|
||||
typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
|
||||
|
||||
return ScratchViewType::shmem_size(values_per_team);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
routines used by template reference classes
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
template<class DeviceType>
|
||||
ComputeSNAGridKokkosDevice<DeviceType>::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg)
|
||||
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp, narg, arg) { ; }
|
||||
|
||||
template<class DeviceType>
|
||||
void ComputeSNAGridKokkosDevice<DeviceType>::init()
|
||||
{
|
||||
Base::init();
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
void ComputeSNAGridKokkosDevice<DeviceType>::compute_array()
|
||||
{
|
||||
Base::compute_array();
|
||||
}
|
||||
|
||||
#ifdef LMP_KOKKOS_GPU
|
||||
template<class DeviceType>
|
||||
ComputeSNAGridKokkosHost<DeviceType>::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg)
|
||||
: ComputeSNAGridKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp, narg, arg) { ; }
|
||||
|
||||
template<class DeviceType>
|
||||
void ComputeSNAGridKokkosHost<DeviceType>::init()
|
||||
{
|
||||
Base::init();
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
void ComputeSNAGridKokkosHost<DeviceType>::compute_array()
|
||||
{
|
||||
Base::compute_array();
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
@@ -769,6 +769,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um;
typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um;
typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread;

//4d float array n
typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
typedef tdual_float_4d::t_dev t_float_4d;
typedef tdual_float_4d::t_dev_const t_float_4d_const;
typedef tdual_float_4d::t_dev_um t_float_4d_um;
typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um;
typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread;

#ifdef LMP_KOKKOS_NO_LEGACY
typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
#else
@@ -1075,6 +1083,14 @@ typedef tdual_float_2d::t_host_um t_float_2d_um;
typedef tdual_float_2d::t_host_const_um t_float_2d_const_um;
typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread;

//4d float array n
typedef Kokkos::DualView<LMP_FLOAT****, Kokkos::LayoutRight, LMPDeviceType> tdual_float_4d;
typedef tdual_float_4d::t_host t_float_4d;
typedef tdual_float_4d::t_host_const t_float_4d_const;
typedef tdual_float_4d::t_host_um t_float_4d_um;
typedef tdual_float_4d::t_host_const_um t_float_4d_const_um;
typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread;

#ifdef LMP_KOKKOS_NO_LEGACY
typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_float_1d_4;
#else
@@ -163,6 +163,7 @@ template <typename TYPE, typename HTYPE>
{
  data = TYPE(std::string(name),n1,n2);
  h_data = Kokkos::create_mirror_view(data);
  //printf(">>> name: %s\n", name);
  return data;
}
@@ -173,6 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  data = TYPE(std::string(name),n1,n2);
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1;
  array = (typename TYPE::value_type **) smalloc(nbytes,name);
  //printf(">>> name %s nbytes %d\n", name, nbytes);

  for (int i = 0; i < n1; i++) {
    if (n2 == 0)
@@ -183,6 +185,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array,
  return data;
}

/* ----------------------------------------------------------------------
   create a 4d array with indices 2,3,4 offset, but not first
   2nd index from n2lo to n2hi inclusive
   3rd index from n3lo to n3hi inclusive
   4th index from n4lo to n4hi inclusive
   cannot grow it
------------------------------------------------------------------------- */

template <typename TYPE>
TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array,
                            int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi,
                            const char *name)
{
  //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr;

  printf("^^^^^ memoryKK->create_4d_offset_kokkos\n");

  int n2 = n2hi - n2lo + 1;
  int n3 = n3hi - n3lo + 1;
  int n4 = n4hi - n4lo + 1;
  data = TYPE(std::string(name),n1,n2,n3,n4);
  bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1;
  array = (typename TYPE::value_type ****) smalloc(nbytes,name);

  for (int i = 0; i < n1; i++) {
    if (n2 == 0) {
      array[i] = nullptr;
    } else {
      nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2;
      array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name);
      for (int j = 0; j < n2; j++) {
        if (n3 == 0) {
          array[i][j] = nullptr;
        } else {
          nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3;
          array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name);
          for (int k = 0; k < n3; k++) {
            if (n4 == 0)
              array[i][j][k] = nullptr;
            else
              array[i][j][k] = &data.h_view(i,j,k,0);
          }
        }
      }
    }
  }

  return data;
}

template <typename TYPE, typename HTYPE>
TYPE create_kokkos(TYPE &data, HTYPE &h_data,
                   typename TYPE::value_type **&array, int n1, int n2,
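/* ----------------------------------------------------------------------
   a minimal standalone sketch of the pattern create4d_offset_kokkos
   builds above: a 4d DualView plus a nested host pointer cage whose
   innermost pointers alias rows of h_view, so legacy array[i][j][k][l]
   access writes straight into the host view. All names and sizes are
   illustrative assumptions; the LAMMPS smalloc/offset bookkeeping is
   omitted for brevity.
------------------------------------------------------------------------- */
/*
#include <Kokkos_Core.hpp>
#include <cstdio>
#include <cstdlib>

int main(int argc, char **argv)
{
  Kokkos::initialize(argc, argv);
  {
    const int n1 = 2, n2 = 3, n3 = 4, n4 = 5;
    Kokkos::DualView<double****, Kokkos::LayoutRight> data("data", n1, n2, n3, n4);

    // build the pointer cage on the host; leaves alias rows of h_view
    double ****array = (double ****) malloc(sizeof(double ***) * n1);
    for (int i = 0; i < n1; i++) {
      array[i] = (double ***) malloc(sizeof(double **) * n2);
      for (int j = 0; j < n2; j++) {
        array[i][j] = (double **) malloc(sizeof(double *) * n3);
        for (int k = 0; k < n3; k++)
          array[i][j][k] = &data.h_view(i, j, k, 0);  // LayoutRight: last index is contiguous
      }
    }

    array[1][2][3][4] = 42.0;                 // writes into data.h_view(1,2,3,4)
    printf("%g\n", data.h_view(1, 2, 3, 4));  // prints 42

    // free only the pointer cage; the DualView owns the actual storage
    for (int i = 0; i < n1; i++) {
      for (int j = 0; j < n2; j++) free(array[i][j]);
      free(array[i]);
    }
    free(array);
  }
  Kokkos::finalize();
  return 0;
}
*/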
@@ -232,6 +232,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
  // map[i] = which element the Ith atom type is, -1 if not mapped
  // map[0] is not used

  //printf(">>> ntypes: %d\n", atom->ntypes);
  for (int i = 1; i <= atom->ntypes; i++) {
    char* elemname = elemtypes[i-1];
    int jelem;
@@ -239,6 +240,7 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
      if (strcmp(elemname,descriptor->elements[jelem]) == 0)
        break;

    //printf(">>> nelements: %d\n", descriptor->nelements);
    if (jelem < descriptor->nelements)
      map[i] = jelem;
    else if (strcmp(elemname,"NULL") == 0) map[i] = -1;
@@ -664,6 +664,8 @@ template<class DeviceType, typename real_type, int vector_length>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {

  printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1));
  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;

  // extract atom number
@@ -643,7 +643,6 @@ void SNAKokkos<DeviceType, real_type, vector_length>::evaluate_ui_jbend(const Wi
    }

    ulist_wrapper.set(ma, ulist_accum);

    mb++;
  }
@@ -870,7 +869,6 @@ typename SNAKokkos<DeviceType, real_type, vector_length>::complex SNAKokkos<Devi
  int jju1 = idxu_block[j1] + (j1+1)*mb1min;
  int jju2 = idxu_block[j2] + (j2+1)*mb2max;
  int icgb = mb1min*(j2+1) + mb2max;

#ifdef LMP_KK_DEVICE_COMPILE
#pragma unroll
#endif
@@ -2298,7 +2296,7 @@ void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real
  constexpr real_type zero = static_cast<real_type>(0.0);
  constexpr real_type onehalf = static_cast<real_type>(0.5);

  if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; }
  if (switch_flag == 0) { sfac_outer = one; dsfac_outer = zero; }
  else if (switch_flag == 1) {
    if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; }
    else if (r > rcut) { sfac = zero; dsfac = zero; return; }
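/* ----------------------------------------------------------------------
   a hedged sketch of the radial switching function this hunk touches,
   assuming the standard SNAP cutoff form documented for LAMMPS: with
   switching disabled the factor is 1 everywhere, otherwise it ramps from
   1 at rmin0 down to 0 at rcut. sfac_sketch and its arguments are
   illustrative names only.
------------------------------------------------------------------------- */
/*
#include <cmath>

// fc(r) = 0.5*(cos(pi*(r - rmin0)/(rcut - rmin0)) + 1) for rmin0 < r <= rcut,
// fc(r) = 1 for r <= rmin0, fc(r) = 0 for r > rcut; fc = 1 if switching is off.
double sfac_sketch(double r, double rmin0, double rcut, int switch_flag)
{
  if (switch_flag == 0) return 1.0;
  if (r <= rmin0) return 1.0;
  if (r > rcut) return 0.0;
  const double pi = 3.14159265358979323846;
  return 0.5 * (std::cos(pi * (r - rmin0) / (rcut - rmin0)) + 1.0);
}
*/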
@@ -57,6 +57,7 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :

ComputeGrid::~ComputeGrid()
{
  if (copymode) return;
  deallocate();
}
@@ -87,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x)
  x[2] = iz * delz;

  if (triclinic) domain->lamda2x(x, x);
  //printf(">>>>> ComputeGrid::grid2x\n");
}

/* ----------------------------------------------------------------------
@@ -102,6 +104,7 @@ void ComputeGrid::assign_coords_all()
    gridall[igrid][1] = x[1];
    gridall[igrid][2] = x[2];
  }
  //printf(">>>>> ComputeGrid::assign_coords_all\n");
}

/* ----------------------------------------------------------------------
@@ -110,8 +113,8 @@ void ComputeGrid::assign_coords_all()

void ComputeGrid::allocate()
{
  //printf(">>> ComputeGrid::allocate\n");
  // allocate arrays

  memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
  memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
  if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) {
@@ -31,14 +31,13 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  // skip over arguments used by base class
  // so that argument positions are identical to
  // regular per-atom compute

  arg += nargbase;
  narg -= nargbase;

  // begin code common to all SNAP computes

  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  //double rfac0, rmin0;
  //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
  int nargmin = 6 + 2 * ntypes;
@@ -56,6 +55,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  wselfallflag = 0;
  switchinnerflag = 0;
  nelements = 1;
  chunksize = 32768;
  parallel_thresh = 8192;

  // process required arguments
@@ -67,8 +68,9 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);

  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
  for (int i = 0; i < ntypes; i++) {
    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
  }

  // construct cutsq
@@ -181,11 +183,12 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :

ComputeSNAGrid::~ComputeSNAGrid()
{
  if (copymode) return;

  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
  delete snaptr;

  if (chemflag) memory->destroy(map);
}
@@ -202,6 +205,7 @@ void ComputeSNAGrid::init()

void ComputeSNAGrid::compute_array()
{

  invoked_array = update->ntimestep;

  // compute sna for each gridpoint
@@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid {
  void init() override;
  void compute_array() override;
  double memory_usage() override;
  int ncoeff, nelements;    // public for kokkos, but could go in the protected block now

 private:
  int ncoeff;
 protected:
  //int ncoeff;
  double **cutsq;
  double rcutfac;
  double *radelem;
  double *wjelem;
  int *map;                 // map types to [0,nelements)
  int nelements, chemflag;
  int chemflag;
  int switchinnerflag;
  double *sinnerelem;
  double *dinnerelem;
  int parallel_thresh;
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
  int chunksize;

};

} // namespace LAMMPS_NS