diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp index 197234cf1d..8a05ba7901 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -23,3 +23,59 @@ template class ComputeSNAGridKokkosHost; #endif } + + + + +// The following chunk will compile but we're gonna try a wrapper approach like pair snap. +/* +#include "compute_sna_grid_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "sna_kokkos.h" +#include "update.h" + +using namespace LAMMPS_NS; + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeSNAGrid(lmp, narg, arg) +{ + + printf("^^^ inside ComputeSNAGridKokkos constructor\n"); + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + +} + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + + +} + +namespace LAMMPS_NS { +template class ComputeSNAGridKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkos; +#endif +} +*/ + diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 9ab23f5bd2..b461f755b8 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -29,42 +29,233 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice); #include "compute_sna_grid.h" #include "kokkos_type.h" +//#include "pair_snap.h" +//#include "kokkos_type.h" //#include "neigh_list_kokkos.h" #include "sna_kokkos.h" //#include "pair_kokkos.h" namespace LAMMPS_NS { +// Routines for both the CPU and GPU backend +//template +//struct TagPairSNAPComputeForce{}; + + +// GPU backend only +/* +struct TagPairSNAPComputeNeigh{}; +struct TagPairSNAPComputeCayleyKlein{}; +struct TagPairSNAPPreUi{}; +struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence +struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; +struct TagPairSNAPComputeBi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS +struct TagPairSNAPComputeYi{}; +struct TagPairSNAPComputeYiWithZlist{}; +template +struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence +template +struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence +*/ +//struct TagPairSNAPPreUi{}; +struct TagCSNAGridComputeNeigh{}; +struct TagCSNAGridComputeCayleyKlein{}; +struct TagCSNAGridPreUi{}; +struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence +struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence +struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagCSNAGridComputeZi{}; +struct TagCSNAGridComputeBi{}; + +struct TagComputeSNAGridLoop{}; +struct TagComputeSNAGrid3D{}; +//struct TagCSNAGridTeam{}; + +// CPU backend only +/* +struct TagPairSNAPComputeNeighCPU{}; +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct 
TagPairSNAPTransformUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; +struct TagPairSNAPComputeDuidrjCPU{}; +struct TagPairSNAPComputeDeidrjCPU{}; +*/ +struct TagComputeSNAGridLoopCPU{}; + +//template template class ComputeSNAGridKokkos : public ComputeSNAGrid { public: - //enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD}; - //enum {COUL_FLAG=0}; typedef DeviceType device_type; typedef ArrayTypes AT; - typedef EV_FLOAT value_type; static constexpr int vector_length = vector_length_; using real_type = real_type_; - //using complex = SNAComplex; + using complex = SNAComplex; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int tile_size_compute_bi = 2; + static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 
4 : 2; +#endif + + // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNAP>; + + // MDRangePolicy for the 3D grid loop: + template + using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; + + // Testing out team policies + template + using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, Kokkos::IndexType, Kokkos::IndexType, TagComputeSNAP>; + //using team_member = typename team_policy::member_type; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; void init() override; - //void compute_array(int, int) override; - //double memory_usage() override; - + void setup() override; + void compute_array() override; + + // Utility functions for teams + + template + void check_team_size_for(int, int&); + + template + void check_team_size_reduce(int, int&); + + // operator function for example team policy + //KOKKOS_INLINE_FUNCTION + //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoop, const int& ) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoopCPU, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + protected: + SNAKokkos snaKK; + int chunk_size, chunk_offset; + int host_flag; + int ntotal; + int total_range; // total number of loop iterations in grid + int zlen; //= nzhi-nzlo+1; + int ylen; //= nyhi-nylo+1; + int xlen; //= nxhi-nxlo+1; - using KKDeviceType = typename KKDevice::value; + double cutsq_tmp; // temporary cutsq until we get a view + + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; 
// elements weights + //Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + DAT::tdual_float_2d k_grid; + DAT::tdual_float_2d k_gridall; + typename AT::t_float_2d d_grid; + typename AT::t_float_2d d_gridall; + + //DAT::tdual_float_4d k_gridlocal; + //typedef Kokkos::DualView t_gridlocal_4d; + //typedef Kokkos::View t_4d; + typedef Kokkos::DualView tdual_float_4d; + tdual_float_4d k_gridlocal; + tdual_float_4d d_gridlocal; + + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); }; // These wrapper classes exist to make the compute style factory happy/avoid having -// to extend the compute style factory to support Compute classes w/an arbitrary number +// to extend the compute style factory to support Compute classes w/an arbitrary number // of extra template parameters template @@ -76,10 +267,9 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_KOKKOS_H + +#include "compute_sna_grid.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +//template +//struct TagComputeCoordAtom{}; + +template +class ComputeSNAGridKokkos : public ComputeSNAGrid { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + ComputeSNAGridKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridKokkos() override; + + private: + +}; + +} + +#endif +#endif +*/ + diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index e958fcdb45..b0cf30d070 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -3,12 +3,10 @@ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories LAMMPS development team: developers@lammps.org - Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - See the README file in the top-level LAMMPS directory. 
------------------------------------------------------------------------- */ @@ -18,17 +16,20 @@ ------------------------------------------------------------------------- */ #include "compute_sna_grid_kokkos.h" +#include "pair_snap_kokkos.h" #include "atom_kokkos.h" #include "atom_masks.h" #include "comm.h" #include "error.h" -#include "force.h" -#include "kokkos.h" #include "memory_kokkos.h" -#include "neighbor_kokkos.h" +#include "modify.h" +#include "neigh_list.h" #include "neigh_request.h" +#include "neighbor_kokkos.h" +//#include "sna_kokkos.h" #include "sna.h" +#include "update.h" #include #include @@ -39,69 +40,757 @@ namespace LAMMPS_NS { -/* ---------------------------------------------------------------------- */ +// Constructor template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg) +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) { //respa_enable = 0; kokkosable = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; - //datamask_read = EMPTY_MASK; - //datamask_modify = EMPTY_MASK; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; - //host_flag = (execution_space == Host); + k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // ComputeSNAGrid constructor allocates `map` so let's do same here. + // actually, let's move this down to init + //int n = atom->ntypes; + //printf("^^^ realloc d_map\n"); + //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + + printf("^^^^^ cutsq: %f\n", cutsq[1][1]); + + cutsq_tmp = cutsq[1][1]; + + //memoryKK->create_kokkos(k_gridlocal, + //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); } -/* ---------------------------------------------------------------------- */ +// Destructor template ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { if (copymode) return; + //memoryKK->destroy_kokkos(k_eatom,eatom); + //memoryKK->destroy_kokkos(k_vatom,vatom); + printf("^^^ Finish ComputeSNAGridKokkos destructor\n"); } -/* ---------------------------------------------------------------------- */ +// Init template void ComputeSNAGridKokkos::init() { + printf("^^^ Begin ComputeSNAGridKokkos init()\n"); + // The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it + // calls the original coeff function. So let's do that here: + + ComputeSNAGrid::init(); + + // Set up element lists + printf("^^^ Begin kokkos reallocs\n"); + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements); + // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
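+  // (Unlike pair snap, compute sna/grid only outputs raw bispectrum components and never
+  //  reads a SNAP coefficient file, so d_coeffelem can stay disabled here.)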
+ //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); + MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements); + int n = atom->ntypes; + //printf("^^^ realloc d_map\n"); + printf("^^^ n: %d\n", n); + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + printf("^^^ begin mirrow view creation\n"); + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + + printf("^^^ begin loop over elements, nelements = %d\n", nelements); + for (int ielem = 0; ielem < nelements; ielem++) { + printf("^^^^^ ielem %d\n", ielem); + h_radelem(ielem) = radelem[ielem]; + printf("^^^^^ 1\n"); + h_wjelem(ielem) = wjelem[ielem]; + printf("^^^^^ 2\n"); + if (switchinnerflag){ + h_sinnerelem(ielem) = sinnerelem[ielem]; + h_dinnerelem(ielem) = dinnerelem[ielem]; + } + // pair snap kokkos uses `ncoeffall` in the following. + //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { + // h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff]; + //} + } + + printf("^^^ begin loop over map\n"); + // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where + // some things like `map` get allocated regardless of chem flag. + if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + printf("%d\n", map[i]); + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + + snaKK = SNAKokkos(rfac0,twojmax, + rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK.grow_rij(0,0); + snaKK.init(); - printf("^^^ inside ComputeSNAGridKokkos init\n"); - // from pair_snap_kokkos_impl.h : - /* if (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style snap/kk can currently only run on a single " - "CPU thread"); - PairSNAP::init_style(); + // The following lmp->kokkos will compile error with pointer to incomplete class type not allowed. + //if (lmp->kokkos->nthreads > 1) + // error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single " + // "CPU thread"); + + ComputeSNAGrid::init(); return; } - if (force->newton_pair == 0) - error->all(FLERR,"Pair style SNAP requires newton pair on"); + printf("^^^ Finished ComputeSNAGridKokkos init\n"); - // neighbor list request for KOKKOS - - neighflag = lmp->kokkos->neighflag; - - auto request = neighbor->add_request(this, NeighConst::REQ_FULL); - request->set_kokkos_host(std::is_same::value && - !std::is_same::value); - request->set_kokkos_device(std::is_same::value); - if (neighflag == FULL) - error->all(FLERR,"Must use half neighbor list style with pair snap/kk"); - */ } +// Setup + +template +void ComputeSNAGridKokkos::setup() +{ + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. + // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
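+  // (ComputeGrid::setup() would also call allocate(), which builds grid, gridall and
+  //  gridlocal with plain memory->create / create4d_offset; the same arrays are allocated
+  //  below through memoryKK instead so they are backed by DualViews.)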
+ //ComputeGrid::setup(); + printf("^^^^^ SETUP!\n"); + //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); + ComputeGrid::set_grid_global(); + ComputeGrid::set_grid_local(); + + // allocate arrays + + memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); + memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { + gridlocal_allocated = 1; + memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, + nyhi, nxlo, nxhi, "grid:gridlocal"); + } + array = gridall; +} + +// Compute + +template +void ComputeSNAGridKokkos::compute_array() +{ + printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n"); + + if (DeviceType::in_parallel()) { + printf("^^^ compute_array() is a host function\n"); + } else { + printf("^^^ compute_array() is not a host function\n"); + } + + if (host_flag) { + /* + atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); + PairSNAP::compute(eflag_in,vflag_in); + atomKK->modified(Host,F_MASK); + */ + return; + } + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + // This will error because trying to access host view on the device: + //printf("x(0,0): %f\n", x(0,0)); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range); + + //printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + + //const int ntotal = atomKK->nlocal + atomKK->nghost; + ntotal = atomKK->nlocal + atomKK->nghost; + //printf("^^^ ntotal: %d\n", ntotal); + + // ensure rij, inside, and typej are of size jnum + // snaKK.grow_rij(int, int) requires 2 args where one is a chunksize. 
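+  // Chunking note: pair snap/kk walks its atom list in chunks of at most "chunksize",
+  // advancing chunk_offset after each pass, roughly (sketch only, not active code):
+  //
+  //   chunk_offset = 0;
+  //   while (chunk_offset < total_range) {
+  //     chunk_size = MIN(chunksize, total_range - chunk_offset);
+  //     // ... launch the kernels below for grid points [chunk_offset, chunk_offset + chunk_size)
+  //     chunk_offset += chunk_size;
+  //   }
+  //
+  // For now the whole grid is treated as a single chunk (chunk_size is reset to total_range
+  // further down), so chunk_offset is declared but not yet used.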
+ + chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user + //printf("^^^ chunk_size: %d\n", chunk_size); + snaKK.grow_rij(chunk_size, ntotal); + + // Launch 3 teams of the maximum number of threads per team + //const int team_size_max = team_policy(3, 1).team_size_max( + // TagCSNAGridTeamPolicy, Kokkos::ParallelForTag()); + //typename Kokkos::TeamPolicy team_policy_test(3,1); + + // Using custom policy: + /* + CSNAGridTeamPolicy team_policy(chunk_size,team_size_compute_neigh,vector_length); + //team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("TeamPolicy",team_policy,*this); + */ + + + chunk_size = total_range; + printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length); + // team_size_compute_neigh is defined in `pair_snap_kokkos.h` + + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + //ComputeNeigh + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * ntotal); + + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + //PreUi + { + // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * ntotal * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * ntotal; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // team_size_transform_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); + 
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } + + //Compute bispectrum in AoSoA data layout, transform Bi + //if (quadraticflag || eflag) { + + //ComputeZi + const int idxz_max = snaKK.idxz_max; + Snap3DRangePolicy + policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + + //ComputeBi + const int idxb_max = snaKK.idxb_max; + Snap3DRangePolicy + policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + + //Looks like best way to grab blist is in a parallel_for + + //Transform data layout of blist out of AoSoA + //We need this because `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA, which at best would only be beneficial on the margins + //NOTE: Do we need this in compute sna/grid/kk? + /* + Snap3DRangePolicy + policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + */ + + + + // let's try a simple parallel for loop + // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this + // function, because this is a host-device function. + /* + typename Kokkos::RangePolicy policy_loop(0,4); + Kokkos::parallel_for("Loop",policy_loop,*this); + */ + + + // Simple working loop: + /* + Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) { + printf("Greeting from iteration %i\n",i); + }); + */ + + /* + // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this + // function, because this is a host-device function. + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + Snap3DRangePolicy + policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1}); + Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + */ + + // Simple example of 3D MD range policy. + // Begin loop over grid points. + /* + // NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this + // function, but we get it in the above function. + int n = 3; // bounds for mdrange policy + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n}); + Kokkos::parallel_for("3D",policy_3d,*this); + */ + + printf("^^^ End ComputeSNAGridKokkos compute_array()\n"); +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +/* + Simple team policy functor seeing how many layers deep we can go with the parallelism. 
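+   Unlike pair snap/kk, which reads the neighbors of each atom from a neighbor list, this
+   version screens every owned+ghost atom (ntotal of them) as a candidate neighbor of the
+   grid point, so the scratch cache and the vector loops below all run over ntotal.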
+ */ +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos + + SNAKokkos my_sna = snaKK; + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size()); + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + const int tile_size = ntotal; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift); + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + //printf("ii: %d\n", ii); + + // convert to grid indices + + int iz = ii/(xlen*ylen); + int i2 = ii - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + //int igrid = iz * (nx * ny) + iy * nx + ix; + + // these end up being the same...? + //printf("ii igrid: %d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + const double xtmp = xgrid[0]; + const double ytmp = xgrid[1]; + const double ztmp = xgrid[2]; + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. + if (triclinic){ + printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); + } else { + //printf("We are not triclinic\n"); + } + + // can check xgrid positions with original + //printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]); + + // Compute the number of neighbors, store rsq + int ninside = 0; + // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]? 
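+  // Scratch layout reminder: compute_array() requests team_size_compute_neigh * ntotal ints of
+  // team scratch via scratch_size_helper, and the get_shmem() call above hands this thread the
+  // slice [team_rank*ntotal, (team_rank+1)*ntotal), so type_cache[j] is only in bounds while
+  // the launch-side count and the per-thread tile_size here are both ntotal.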
+ //printf("ntotal: %d\n", ntotal); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& count) { + + // From pair snap/kk : + /* + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + */ + // From compute sna/grid/kk : + /* + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + */ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + //printf("dx: %f\n", dx); + + //const double rsq = delx * delx + dely * dely + delz * delz; + int jtype = type(j); + //printf("jtype: %d\n", jtype); + //int jelem = 0; + //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) { + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + //if (rsq >= rnd_cutsq(itype,jtype)) { + if (rsq >= cutsq_tmp){ + jtype = -1; // use -1 to signal it's outside the radius + } + //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); + + if (j > 340){ + printf("j: %d\n", j); + } + + //printf("j: %d\n", j); + type_cache[j] = jtype; + + if (jtype >= 0) + count++; + + }, ninside); + + //printf("ninside: %d\n", ninside); + + d_ninside(ii) = ninside; + + // TODO: Make sure itype is appropriate instead of ielem + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& offset, bool final) { + + const int jtype = type_cache[j]; + + if (jtype >= 0) { + if (final) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const int jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); +} + + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + //printf("^^^ ComputeCayleyKlein\n"); + + /* + if (DeviceType::in_parallel()) { + printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n"); + } else { + printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n"); + } + */ + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + const int ninside = ntotal; //d_ninside(ii); + if (jnbor >= ninside) return; + + my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // 
extract flattened atom_div / neighbor number / bend_location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1)); + const int jbend = jj_jbend / ntotal; + int jj = jj_jbend - jbend * ntotal; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * ntotal; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxu > my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++){ + + const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + + auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + + if (mapper.flip_sign == 1){ + utot_im = -utot_im; + } else if (mapper.flip_sign == -1){ + utot_re = -utot_re; + } + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +/* +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + +} +*/ + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. 
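+   At the moment the CPU functor below is an empty stub, and compute_array() returns early
+   when host_flag is set, so bispectrum values are only produced by the device codepath.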
+------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagComputeSNAGridLoopCPU,const int& ii) const { + +} + +/* ---------------------------------------------------------------------- + utility functions +------------------------------------------------------------------------- */ + +template +template +void ComputeSNAGridKokkos::check_team_size_for(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +void ComputeSNAGridKokkos::check_team_size_reduce(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int ComputeSNAGridKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- routines used by template reference classes ------------------------------------------------------------------------- */ + template ComputeSNAGridKokkosDevice::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridKokkos(lmp, narg, arg) { ; } @@ -112,6 +801,12 @@ void ComputeSNAGridKokkosDevice::init() Base::init(); } +template +void ComputeSNAGridKokkosDevice::compute_array() +{ + Base::compute_array(); +} + #ifdef LMP_KOKKOS_GPU template ComputeSNAGridKokkosHost::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg) @@ -122,6 +817,12 @@ void ComputeSNAGridKokkosHost::init() { Base::init(); } + +template +void ComputeSNAGridKokkosHost::compute_array() +{ + Base::compute_array(); +} #endif } diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 9d894a344a..35a7ceaeb4 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -183,6 +183,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, return data; } +/* ---------------------------------------------------------------------- + create a 4d array with indices 2,3,4 offset, but not first + 2nd index from n2lo to n2hi inclusive + 3rd index from n3lo to n3hi inclusive + 4th index from n4lo to n4hi inclusive + cannot grow it +------------------------------------------------------------------------- */ + +template +TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array, + int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi, + const char *name) +{ + //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr; + + printf("^^^^^ memoryKK->create_4d_offset_kokkos\n"); + + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + int n4 = n4hi - n4lo + 1; + data = TYPE(std::string(name),n1,n2,n3,n4); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1; + array = (typename TYPE::value_type ****) smalloc(nbytes,name); + + for (int i = 0; i < n1; i++) { + if (n2 == 0) { + array[i] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2; + array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name); + 
for (int j = 0; j < n2; j++){ + if (n3 == 0){ + array[i][j] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3; + array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name); + for (int k = 0; k < n3; k++){ + if (n4 == 0) + array[i][j][k] = nullptr; + else + array[i][j][k] = &data.h_view(i,j,k,0); + } + } + } + } + } + + return data; +} + template TYPE create_kokkos(TYPE &data, HTYPE &h_data, typename TYPE::value_type **&array, int n1, int n2, diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index 2179bb8ebd..ad70df30e8 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -57,6 +57,8 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) : ComputeGrid::~ComputeGrid() { + printf("^^^ begin ComputeGrid destructor\n"); + if (copymode) return; deallocate(); } @@ -111,7 +113,7 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { // allocate arrays - + printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n"); memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp index 4243202545..36780213f2 100644 --- a/src/ML-SNAP/compute_sna_grid.cpp +++ b/src/ML-SNAP/compute_sna_grid.cpp @@ -31,14 +31,14 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : // skip over arguments used by base class // so that argument positions are identical to // regular per-atom compute - + printf("^^^ inside compute sna grid constructor\n"); arg += nargbase; narg -= nargbase; // begin code common to all SNAP computes - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 6 + 2 * ntypes; @@ -56,6 +56,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : wselfallflag = 0; switchinnerflag = 0; nelements = 1; + chunksize = 32768; + parallel_thresh = 8192; // process required arguments @@ -112,6 +114,7 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp); iarg += 2; } else if (strcmp(arg[iarg], "chem") == 0) { + printf("^^^ chem flag, creating map\n"); if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style); chemflag = 1; memory->create(map, ntypes + 1, "compute_sna_grid:map"); @@ -181,11 +184,17 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid::~ComputeSNAGrid() { - memory->destroy(radelem); - memory->destroy(wjelem); - memory->destroy(cutsq); - delete snaptr; + if (copymode) return; + printf("^^^ begin ComputeSNAGrid destructor\n"); + memory->destroy(radelem); + printf("^^^^ CSG 1\n"); + memory->destroy(wjelem); + printf("^^^^ CSG 2\n"); + memory->destroy(cutsq); + printf("^^^^ CSG 3\n"); + delete snaptr; + printf("^^^^ CSG 4\n"); if (chemflag) memory->destroy(map); } @@ -196,12 +205,16 @@ void ComputeSNAGrid::init() if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0)) error->warning(FLERR, "More than one instance of compute sna/grid"); snaptr->init(); + + printf("^^^ finished ComputeSNAGrid init()\n"); } /* ---------------------------------------------------------------------- */ void ComputeSNAGrid::compute_array() { + printf("^^^ 
inside ComputeSNAGrid compute_array()\n"); + invoked_array = update->ntimestep; // compute sna for each gridpoint @@ -211,6 +224,8 @@ void ComputeSNAGrid::compute_array() int *const type = atom->type; const int ntotal = atom->nlocal + atom->nghost; + printf("^^^ ntotal: %d\n", ntotal); + // ensure rij, inside, and typej are of size jnum snaptr->grow_rij(ntotal); diff --git a/src/ML-SNAP/compute_sna_grid.h b/src/ML-SNAP/compute_sna_grid.h index 3a5a373826..a158c2342f 100644 --- a/src/ML-SNAP/compute_sna_grid.h +++ b/src/ML-SNAP/compute_sna_grid.h @@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid { void init() override; void compute_array() override; double memory_usage() override; + int ncoeff,nelements; // public for kokkos, but could go in the protected block now - private: - int ncoeff; + protected: + //int ncoeff; double **cutsq; double rcutfac; double *radelem; double *wjelem; int *map; // map types to [0,nelements) - int nelements, chemflag; + int chemflag; int switchinnerflag; double *sinnerelem; double *dinnerelem; + int parallel_thresh; class SNA *snaptr; double cutmax; int quadraticflag; + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + int chunksize; + }; } // namespace LAMMPS_NS
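Side note on the index bookkeeping in TagCSNAGridComputeNeigh: the flattened grid-point id ii is unpacked with iz = ii/(xlen*ylen), iy = (ii - iz*xlen*ylen)/xlen, ix = ii % xlen, which is the inverse of the commented-out igrid = iz*(nx*ny) + iy*nx + ix flattening whenever the local grid starts at the origin and spans the whole box, presumably why "these end up being the same" in the kernel comment. Below is a minimal standalone sketch of that round trip, not part of the patch: the names mirror the members above, but the grid bounds and spacings are made-up illustrative values.

#include <cstdio>

int main() {
  // hypothetical local grid bounds and spacings, for illustration only
  const int nxlo = 0, nxhi = 3, nylo = 0, nyhi = 2, nzlo = 0, nzhi = 1;
  const double delx = 0.5, dely = 0.5, delz = 0.5;

  const int xlen = nxhi - nxlo + 1;
  const int ylen = nyhi - nylo + 1;
  const int zlen = nzhi - nzlo + 1;
  const int total_range = xlen * ylen * zlen;

  for (int ii = 0; ii < total_range; ii++) {
    // same decomposition as the ComputeNeigh functor: x fastest, z slowest
    int iz = ii / (xlen * ylen);
    const int i2 = ii - iz * (xlen * ylen);
    int iy = i2 / xlen;
    int ix = i2 % xlen;
    iz += nzlo; iy += nylo; ix += nxlo;

    // grid point position, matching xgrid[] in the kernel (orthogonal box assumed)
    const double xgrid[3] = {ix * delx, iy * dely, iz * delz};

    // re-flatten to confirm the mapping round-trips to ii
    const int igrid = (iz - nzlo) * (xlen * ylen) + (iy - nylo) * xlen + (ix - nxlo);
    printf("ii=%2d (ix,iy,iz)=(%d,%d,%d) x=(%.2f,%.2f,%.2f) igrid=%2d\n",
           ii, ix, iy, iz, xgrid[0], xgrid[1], xgrid[2], igrid);
  }
  return 0;
}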