From be94176c034234ad4280420ca5d38cafb776c7ce Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 12 Jan 2023 15:44:04 +0100 Subject: [PATCH 01/51] Re-starting MALA branch in MALA fork --- examples/snap/in.grid.gaussian | 66 ++++++++ src/ML-SNAP/compute_gaussian_grid_local.cpp | 167 ++++++++++++++++++++ src/ML-SNAP/compute_gaussian_grid_local.h | 51 ++++++ src/ML-SNAP/compute_sna_grid_local.cpp | 2 +- 4 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 examples/snap/in.grid.gaussian create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.cpp create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.h diff --git a/examples/snap/in.grid.gaussian b/examples/snap/in.grid.gaussian new file mode 100644 index 0000000000..9caa61e455 --- /dev/null +++ b/examples/snap/in.grid.gaussian @@ -0,0 +1,66 @@ +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. +# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable ny equal ${nrep} +variable nz equal ${nrep} + +boundary p p p + +lattice custom $a & + a1 1 0 0 & + a2 0 1 0 & + a3 0 0 1 & + basis 0 0 0 & + basis 0.5 0.5 0.5 & + +region box block 0 ${nx} 0 ${ny} 0 ${nz} +create_box 2 box +create_atoms 1 box basis 1 1 basis 2 2 + +mass * 180.88 + +# define atom compute and grid compute + +group snapgroup type 1 +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string & + "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}" + +# build zero potential to force ghost atom 
creation + +pair_style zero ${rcutfac} +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} & + ${gaussian_options} + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z + +# run + +run 0 + diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp new file mode 100644 index 0000000000..ec75563bcf --- /dev/null +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -0,0 +1,167 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "compute_gaussian_grid_local.h" + +#include "atom.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" +#include "modify.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; +using MathConst::MY_2PI; +using MathSpecial::powint; + +ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) : + ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr), + sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr) +{ + // skip over arguments used by base class + // so that argument positions are identical to + // regular per-atom compute + + arg += nargbase; + narg -= nargbase; + + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + + int ntypes = atom->ntypes; + int nargmin = 4 + 2 * ntypes; + + if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style); + + // process required arguments + + memory->create(radelem, ntypes + 1, "gaussian/atom:radelem"); // offset by 1 to match up with types + memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem"); + memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem"); + memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem"); + + rcutfac = utils::numeric(FLERR, arg[3], false, lmp); + + for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp); + for (int i = 0; i < ntypes; i++) + sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp); + + // construct cutsq + + double cut; + cutmax = 0.0; + memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq"); + for (int i = 1; i <= ntypes; i++) { + cut = 2.0 * radelem[i] * rcutfac; + if (cut > cutmax) cutmax = cut; + cutsq[i][i] = cut * cut; + for (int j = i + 1; j <= ntypes; j++) { + cut = (radelem[i] + radelem[j]) 
* rcutfac; + cutsq[i][j] = cutsq[j][i] = cut * cut; + } + } + + size_local_cols = size_local_cols_base + ntypes; + + // pre-compute coefficients + + for (int i = 0; i < ntypes; i++) { + prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3); + argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]); + } +} + +/* ---------------------------------------------------------------------- */ + +ComputeGaussianGridLocal::~ComputeGaussianGridLocal() +{ + memory->destroy(radelem); + memory->destroy(sigmaelem); + memory->destroy(prefacelem); + memory->destroy(argfacelem); + memory->destroy(cutsq); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::init() +{ + if ((modify->get_compute_by_style("^gaussian/grid/local$").size() > 1) && (comm->me == 0)) + error->warning(FLERR, "More than one instance of compute gaussian/grid/local"); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::compute_local() +{ + invoked_local = update->ntimestep; + + // compute gaussian for each gridpoint + + double **const x = atom->x; + const int *const mask = atom->mask; + int *const type = atom->type; + const int ntotal = atom->nlocal + atom->nghost; + + int igrid = 0; + for (int iz = nzlo; iz <= nzhi; iz++) + for (int iy = nylo; iy <= nyhi; iy++) + for (int ix = nxlo; ix <= nxhi; ix++) { + double xgrid[3]; + grid2x(ix, iy, iz, xgrid); + const double xtmp = xgrid[0]; + const double ytmp = xgrid[1]; + const double ztmp = xgrid[2]; + + // Zeroing out the components, which are filled as a sum. 
+ for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + alocal[igrid][icol] = 0.0; + } + + for (int j = 0; j < ntotal; j++) { + + // check that j is in compute group + + if (!(mask[j] & groupbit)) continue; + + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + const double rsq = delx * delx + dely * dely + delz * delz; + int jtype = type[j]; + if (rsq < cutsq[jtype][jtype]) { + int icol = size_local_cols_base + jtype - 1; + alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]); + } + } + igrid++; + } +} + +/* ---------------------------------------------------------------------- + memory usage +------------------------------------------------------------------------- */ + +double ComputeGaussianGridLocal::memory_usage() +{ + int n = atom->ntypes + 1; + int nbytes = (double) n * sizeof(int); // map + + return nbytes; +} diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h new file mode 100644 index 0000000000..cfab841a6e --- /dev/null +++ b/src/ML-SNAP/compute_gaussian_grid_local.h @@ -0,0 +1,51 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal); +// clang-format on +#else + +#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H +#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H + +#include "compute_grid_local.h" + +namespace LAMMPS_NS { + +class ComputeGaussianGridLocal : public ComputeGridLocal { + public: + ComputeGaussianGridLocal(class LAMMPS *, int, char **); + ~ComputeGaussianGridLocal() override; + void init() override; + void compute_local() override; + double memory_usage() override; + + private: + int ncoeff; + double **cutsq; + double rcutfac; // global cut-off scale + double *radelem; // cut-off radius of each atom type + double *sigmaelem; // Gaussian width of each atom type + double *prefacelem; // Gaussian prefactor of each atom type + double *argfacelem; // Gaussian argument factor of each atom type + int *map; // map types to [0,nelements) + int nelements; + double cutmax; +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp index 80a1baddab..76fe03a03b 100644 --- a/src/ML-SNAP/compute_sna_grid_local.cpp +++ b/src/ML-SNAP/compute_sna_grid_local.cpp @@ -203,7 +203,7 @@ void ComputeSNAGridLocal::init() void ComputeSNAGridLocal::compute_local() { - invoked_array = update->ntimestep; + invoked_local = update->ntimestep; // compute sna for each gridpoint From e1e7984822ef494e23bd67e6770398830bbfebba Mon Sep 17 00:00:00 2001 From: rohskopf Date: Fri, 10 Mar 2023 16:19:24 -0700 Subject: [PATCH 02/51] Start ComputeSNAGridKokkos implementation --- src/KOKKOS/compute_sna_grid_kokkos.cpp | 64 +++++++++++++++++++++ src/KOKKOS/compute_sna_grid_kokkos.h | 80 ++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 src/KOKKOS/compute_sna_grid_kokkos.cpp create mode 100644 src/KOKKOS/compute_sna_grid_kokkos.h diff --git 
a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp new file mode 100644 index 0000000000..0eb6e1767c --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -0,0 +1,64 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "sna_kokkos.h" +#include "update.h" + +using namespace LAMMPS_NS; + + +/* ---------------------------------------------------------------------- */ + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeSNAGrid(lmp, narg, arg) +{ + + printf("^^^ inside ComputeSNAGridKokkos constructor\n"); + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + +} + +/* ---------------------------------------------------------------------- */ + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + + +} + +namespace LAMMPS_NS { +template class ComputeSNAGridKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkos; +#endif +} + diff --git 
a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h new file mode 100644 index 0000000000..ad365fca43 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -0,0 +1,80 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_KOKKOS_H + +#include "compute_sna_grid.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +//template +//struct TagComputeCoordAtom{}; + +template +class ComputeSNAGridKokkos : public ComputeSNAGrid { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + ComputeSNAGridKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridKokkos() override; + //void init() override; + //void compute_peratom() override; + //enum {NONE,CUTOFF,ORIENT}; + + //template + //KOKKOS_INLINE_FUNCTION + //void operator()(TagComputeCoordAtom, const int&) const; + + private: + + + /* + int inum; + + typename AT::t_x_array_randomread x; + typename ArrayTypes::t_int_1d_randomread type; + typename ArrayTypes::t_int_1d mask; + + typename AT::t_neighbors_2d d_neighbors; + typename AT::t_int_1d_randomread d_ilist; + typename 
AT::t_int_1d_randomread d_numneigh; + + typename AT::t_int_1d d_typelo; + typename AT::t_int_1d d_typehi; + + DAT::tdual_float_1d k_cvec; + typename AT::t_float_1d d_cvec; + DAT::tdual_float_2d k_carray; + typename AT::t_float_2d d_carray; + + typename AT::t_float_2d d_normv; + */ +}; + +} + +#endif +#endif + From 234346c37d44d3a75255f8e0583381fea69eed07 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 12 Mar 2023 17:00:48 -0600 Subject: [PATCH 03/51] Experiment with different implementations --- src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo | Bin 0 -> 16384 bytes src/KOKKOS/compute_sna_grid_kokkos.cpp | 79 +++++++++++++++++--- src/KOKKOS/compute_sna_grid_kokkos.h | 23 +++++- 3 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo b/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo new file mode 100644 index 0000000000000000000000000000000000000000..1f6b26487397da6524dd5e8647ab7bba3ab61fb1 GIT binary patch literal 16384 zcmeI2U5jK#8OJLzCT@~VG*LmXp7cV}J@nh`3vSx%m|=EyBJ+~T_Ksj&wyEh;-F-6W zR2{3TXKa?l51=3#F@hJ0f)@%w!87UMMK)M!oRDAn`(Uho}HfQnMJeW zD0Sf1J#(u1sXG69o_eYdwb}f{$5-imy~%LBm$A=(d|Ej;w^jmkNW8gr{j{8b)^;%g-(^Kih6}RhkX0=ora6eH}-gEw#0!Oud_B zx3e0`g91T;+bS@$z1sPc8)r_>`cOYrdW;@@WaG9!3l)L_L4lw^P#`D}6bK3g1%d)W zf&U)`Y<4&MEFR}w#Ush$TNU4T72khx6I=zSz?%;- z_Gj>2z`=*WUr#dj7Wg5MAO+8WB~S++1owb9A7t$3;5G0a@CBg3^WYL#1T&xl%HWp| zF!l@ZRZs$Vfw%8x>?ZgL_&)d|SO=%UBjA1Dk0%)W19$`60Iz_rfGdE3o9|)l_uvh1 z1N;{J27DR344wlgzsMcolpOTmT;c?+1UuiVQs;}6-?dGWcT7vV(EI42rHc`fdT*?Yh_qrqgh-Mo2m=+6> z52B-@;gG;2Qq`g>9MydDNOk;?8<X)zFP2vC(rFANdPbzB!_l zZ^4zFM@zoE3@4xg;zYQBbNxLm+^YMd*XFK^(6s|G8M`0LZTYyCO4hcvyociCoO{$s zS^Ke+u0M$X$*0PF-aBczl1t65vu%YqomO%0gpUE@NykM|m9|CKD&0wh?Arn6>|*k; z0ujfpcq)&$TW(zS(I=5Y_F91>?6bsJV}@#X+GF8 z@k`TwT&}R=C;`##tce_07M~H_p^cGwOvbx{ka}GD&9i01El;`G>OicxH$wfz)%7c% z?5xhWmt8vAEq8g~Kv9~(93!Em!F|&rsD6r z@;sX%*P+`q>+bAz{8*Ye5>q`Hv4XXM9y-6>_51y~)uo>*Nv``GAiP9f7KL&#nlX7CPCAC&go!+bbC 
z&pUB{JRu)mI9}jQ+Vj8bxCPqe{oGym-4m(Oqp2c8Q;#ycT6bhtb4K!&1+|6nN#gS2{KeHp>ba!~@{Y?JR%NwBY>R{ne~Yjjb)G+} z`vsn>v$VEBOF|0G6Ivf`CUKWmu(p>*)DczG720x`(^ytplN-h=$*7*);ab#*uDGN& z{@<;jA`FU-H<6Gc8p=HR(Z$8u{6h0=?a8zAE9cKOn@zutqBfUN%n_+NQ<}TU2q^-+ z+Nihdw4}C$mMIpDZn8!%T-rz7Y~*`G<eNMOc-&J3^bM^CRkYrTebkfwh@wH5xlRJM~*7jq14lHQ#-xxWchFfzRkz z_zo9M)li`OqGqrx7v!e3xZ7A&2`1qLQNN0{X^f^vw9YXW2?nhlrznDTMHf4KM&*YVt1g#CZm|A+nmZrC2rAPxKfu>T*wEe`wt|Kcl< nu>V)?)m_;C{~KFS{~9gq|0mvE`1b-~{~z}M{vF2u?*9LuF+hoN literal 0 HcmV?d00001 diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp index 0eb6e1767c..bce0b37763 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -31,8 +31,8 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) { @@ -43,22 +43,83 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, ch datamask_read = EMPTY_MASK; datamask_modify = EMPTY_MASK; + host_flag = (execution_space == Host); + } /* ---------------------------------------------------------------------- */ -template -ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { if (copymode) return; } -namespace LAMMPS_NS { -template class ComputeSNAGridKokkos; -#ifdef LMP_KOKKOS_GPU -template class ComputeSNAGridKokkos; -#endif +/* ---------------------------------------------------------------------- */ + +template +void ComputeSNAGridKokkos::init() +{ + + printf("^^^ beginning of ComputeSNAGridKokkos init()\n"); + + // init non-kk compute + // this calls snaptr->init(), we probably want to init the kokkos snaptr? 
+ // let's copy pair_snap_kokkos by making a snaKK in header + ComputeSNAGrid::init(); + + // adjust neighbor list request for KOKKOS + + // taken from compute_coord_atom_kokkos + // this segfaults + /* + printf("^^^ before neigh request\n"); + auto request = neighbor->find_request(this); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + */ + + + // taken from pair_snap_kokkos init + // compile errors with: + // error: pointer to incomplete class type "LAMMPS_NS::KokkosLMP" is not allowed" + /* + if (host_flag) { + if (lmp->kokkos->nthreads > 1) + error->all(FLERR,"compute sna grid can currently only run on a single " + "CPU thread"); + + // this calls snaptr->init() + // we probably wanna call init of kokkos snaptr + ComputeSNAGrid::init(); + return; + } + + if (force->newton_pair == 0) + error->all(FLERR,"Pair style SNAP requires newton pair on"); + + // neighbor list request for KOKKOS + + neighflag = lmp->kokkos->neighflag; + + auto request = neighbor->add_request(this, NeighConst::REQ_FULL); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + if (neighflag == FULL) + error->all(FLERR,"Must use half neighbor list style with pair snap/kk"); + */ + + // Overall, I think maybe this compute does not need a neighlist request because the original + // compute_sna_grid.cpp does not have one. 
} +namespace LAMMPS_NS { +template class ComputeSNAGridKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkos; +#endif +} diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index ad365fca43..4261d207f7 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -14,8 +14,8 @@ #ifdef COMPUTE_CLASS // clang-format off ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos); -ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); -ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); +//ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); +//ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); // clang-format on #else @@ -25,27 +25,42 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); #include "compute_sna_grid.h" #include "kokkos_type.h" +#include "sna_kokkos.h" namespace LAMMPS_NS { //template //struct TagComputeCoordAtom{}; -template +// copying pair_snap_kokkos, template args are real_type and vector_length +template class ComputeSNAGridKokkos : public ComputeSNAGrid { public: typedef DeviceType device_type; typedef ArrayTypes AT; + static constexpr int vector_length = vector_length_; + using real_type = real_type_; + ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; - //void init() override; + void init() override; //void compute_peratom() override; //enum {NONE,CUTOFF,ORIENT}; //template //KOKKOS_INLINE_FUNCTION //void operator()(TagComputeCoordAtom, const int&) const; + + protected: + + // these are used by pair_snap_kokkos + // neighflag gets set in init() + // what about host_flag? + // dunno... 
commented these out for now + int host_flag, neighflag; + + SNAKokkos snaKK; private: From a0a7f14db5d284f41c4fe738a2d4d8d0a87b8dc5 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 12 Mar 2023 17:01:20 -0600 Subject: [PATCH 04/51] Remove swo --- src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo | Bin 16384 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo b/src/KOKKOS/.compute_sna_grid_kokkos.cpp.swo deleted file mode 100644 index 1f6b26487397da6524dd5e8647ab7bba3ab61fb1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI2U5jK#8OJLzCT@~VG*LmXp7cV}J@nh`3vSx%m|=EyBJ+~T_Ksj&wyEh;-F-6W zR2{3TXKa?l51=3#F@hJ0f)@%w!87UMMK)M!oRDAn`(Uho}HfQnMJeW zD0Sf1J#(u1sXG69o_eYdwb}f{$5-imy~%LBm$A=(d|Ej;w^jmkNW8gr{j{8b)^;%g-(^Kih6}RhkX0=ora6eH}-gEw#0!Oud_B zx3e0`g91T;+bS@$z1sPc8)r_>`cOYrdW;@@WaG9!3l)L_L4lw^P#`D}6bK3g1%d)W zf&U)`Y<4&MEFR}w#Ush$TNU4T72khx6I=zSz?%;- z_Gj>2z`=*WUr#dj7Wg5MAO+8WB~S++1owb9A7t$3;5G0a@CBg3^WYL#1T&xl%HWp| zF!l@ZRZs$Vfw%8x>?ZgL_&)d|SO=%UBjA1Dk0%)W19$`60Iz_rfGdE3o9|)l_uvh1 z1N;{J27DR344wlgzsMcolpOTmT;c?+1UuiVQs;}6-?dGWcT7vV(EI42rHc`fdT*?Yh_qrqgh-Mo2m=+6> z52B-@;gG;2Qq`g>9MydDNOk;?8<X)zFP2vC(rFANdPbzB!_l zZ^4zFM@zoE3@4xg;zYQBbNxLm+^YMd*XFK^(6s|G8M`0LZTYyCO4hcvyociCoO{$s zS^Ke+u0M$X$*0PF-aBczl1t65vu%YqomO%0gpUE@NykM|m9|CKD&0wh?Arn6>|*k; z0ujfpcq)&$TW(zS(I=5Y_F91>?6bsJV}@#X+GF8 z@k`TwT&}R=C;`##tce_07M~H_p^cGwOvbx{ka}GD&9i01El;`G>OicxH$wfz)%7c% z?5xhWmt8vAEq8g~Kv9~(93!Em!F|&rsD6r z@;sX%*P+`q>+bAz{8*Ye5>q`Hv4XXM9y-6>_51y~)uo>*Nv``GAiP9f7KL&#nlX7CPCAC&go!+bbC z&pUB{JRu)mI9}jQ+Vj8bxCPqe{oGym-4m(Oqp2c8Q;#ycT6bhtb4K!&1+|6nN#gS2{KeHp>ba!~@{Y?JR%NwBY>R{ne~Yjjb)G+} z`vsn>v$VEBOF|0G6Ivf`CUKWmu(p>*)DczG720x`(^ytplN-h=$*7*);ab#*uDGN& z{@<;jA`FU-H<6Gc8p=HR(Z$8u{6h0=?a8zAE9cKOn@zutqBfUN%n_+NQ<}TU2q^-+ z+Nihdw4}C$mMIpDZn8!%T-rz7Y~*`G<eNMOc-&J3^bM^CRkYrTebkfwh@wH5xlRJM~*7jq14lHQ#-xxWchFfzRkz 
z_zo9M)li`OqGqrx7v!e3xZ7A&2`1qLQNN0{X^f^vw9YXW2?nhlrznDTMHf4KM&*YVt1g#CZm|A+nmZrC2rAPxKfu>T*wEe`wt|Kcl< nu>V)?)m_;C{~KFS{~9gq|0mvE`1b-~{~z}M{vF2u?*9LuF+hoN From 584a6200f5db3032d57d7c66bb0a8168a2e3ea1e Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 12 Mar 2023 20:02:57 -0600 Subject: [PATCH 05/51] Mimic pair snap kokkos pattern --- src/KOKKOS/.compute_sna_grid_kokkos.h.swo | Bin 0 -> 12288 bytes src/KOKKOS/compute_sna_grid_kokkos.cpp | 110 +------------------ src/KOKKOS/compute_sna_grid_kokkos.h | 98 +++++++++-------- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 127 ++++++++++++++++++++++ 4 files changed, 187 insertions(+), 148 deletions(-) create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.h.swo create mode 100644 src/KOKKOS/compute_sna_grid_kokkos_impl.h diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos.h.swo new file mode 100644 index 0000000000000000000000000000000000000000..12ede098aa5071704880ee2741eba65408343158 GIT binary patch literal 12288 zcmeI2&yO6%6~_xOKWt*KumlnUsqjj$Uhhnge+4tFR~crVpY!au-)LlltU1_DP)1mCLeo*%oj>m{-f zin^uG%y!qSSFhfydauUr&C-?2EA&FKL~tD=q#WPpu3mqQuKwgWVXlZeva4J6z2xcA z`3p~-J-e?E8edKoT0RScSXxpz*7Dl&@>;`kq9E=mZp)C_ZQ;3gH|lmHS?t_QXWpKB zAosulJkV3E!iA&c?3vSMt52PHj2?er;{Y9Vk=z5h2XYVO9>_hAdm#5f?t$C`xd;AV zJfPy+$Tdvn_H;toul?K7`|SJc^!jo7{GlD~>23a%dm#5f?t$C`xd(C&Q$Kmi;9hrtI& z3Hd2_1^D1Oa2)*o2ZX!@x`2a6!Gquq@ZWn0`49L6ybpc@ehpp%E~tW2;25|c{QC$Y zH^AHA8t8*6I0X)ae;y{}Z{V-s=O6&jfC{)5e0&cf{{sI2zXR96iy#J1f**qO;KtpA zdp;itOK=qbfO7;K26uzIz}JZD z4e%BC5{zC`JaoMlNx#TT@cz?;(isaMYCGm33plMb%VlM^{D>+0j7wcRj6~3Xn(Qm% zFd^8sR<>aJyFKO)b#(1v`^0fh>v5bucP45;D>Rta-9~t!vQ2lbxZ`OBJBApO@S{mH zNknYHLN^KoXJ*>HAmm=Vv!}*!anJXpnjmX*Y(_$+&eSUP?G7{GzNztRSFF;oA!L}N zWw}vr)8{ncK_s^Ao@8x4eNr3Hk35%pp{Mk%5v?XK+B8-xi+Igu*}P@56iGGh_)JP# zudJ-BH|UJ%eL&e+jG$(z(_wQlNdiqyPg{@rv}~1@#zah@gVkL z0G&(5A5LVROnEJrw`hH3FrCG9ylt^>#Awv#0#V^RLO1e!&lo0SA2f;+rN6s4F4V4E zuGYR~Bomu&SuDqGxJfjLWzncb`Bfzt9fUn*0$EaVA$7ZobUU^xRUJ;9WS>!~wy`r? 
z%u(2yT1NKbI1(ybiqtF}F6oxVLdwLZhan&t_JU0=wBBZiWAAJU7ROu+#Q9a^7_Bd0 zUB1tV9dz5am`Ma3*2iAV7Gbh6mzRgD78`VsFSF+(yK}x=?lZrK)v5P2c50G6MkHn_ z5~go!C9@Ps8gsUN9=26yp5o2ECWX0-AUDz)9tSv8I+NjrG zT&mPBnUCfYa{7Yqk`;)(6Zc*lePpNZO1rbe^*eN{iP#RG&Er6Ztt(^Z@U-UHs|~2e zhMV$2$M3nEo`{(z1}~zXKuXUV^joUg3#uo^x|kAg>-s+;FzlFgGX5-fMm6fCGy49k zZY5KQs29RO6p`xM%KApLW@Ba9i_czO$cp5Wt~6m*YW9{wB7v{NcAz=xk6?_w0hbvxNyO*wSrDT^=;qUZJkOiq)uQVF?HFc}5% z%R~L~LQ^7C6c>En=YDq1b3I|Q^mb7v5hJs>x0IHdbCvpq^QBUW zZh5Lh8#s1*48>8PNFo(RA@(%N=|-_pq>K7|6b2kAQE+XA)S9YMylt$Z(`QPhvnMC$ zfK?lM;KT`B1X5S?Y80Z@I?7BvJ@{g|a6gL87>cu?Ow%VQ@;^#mTQ5eUt;g*s70a?^ zZEbB8hbXiBZ-&YX%2H#=@?f?J) literal 0 HcmV?d00001 diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp index bce0b37763..197234cf1d 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -13,113 +13,13 @@ ------------------------------------------------------------------------- */ #include "compute_sna_grid_kokkos.h" - -#include "atom_kokkos.h" -#include "atom_masks.h" -#include "comm.h" -#include "error.h" -#include "memory_kokkos.h" -#include "modify.h" -#include "neigh_list.h" -#include "neigh_request.h" -#include "neighbor_kokkos.h" -#include "sna_kokkos.h" -#include "update.h" - -using namespace LAMMPS_NS; - - -/* ---------------------------------------------------------------------- */ - -template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : - ComputeSNAGrid(lmp, narg, arg) -{ - - printf("^^^ inside ComputeSNAGridKokkos constructor\n"); - kokkosable = 1; - atomKK = (AtomKokkos *) atom; - execution_space = ExecutionSpaceFromDevice::space; - datamask_read = EMPTY_MASK; - datamask_modify = EMPTY_MASK; - - host_flag = (execution_space == Host); - -} - -/* ---------------------------------------------------------------------- */ - -template -ComputeSNAGridKokkos::~ComputeSNAGridKokkos() -{ - if (copymode) return; - - -} - -/* 
---------------------------------------------------------------------- */ - -template -void ComputeSNAGridKokkos::init() -{ - - printf("^^^ beginning of ComputeSNAGridKokkos init()\n"); - - // init non-kk compute - // this calls snaptr->init(), we probably want to init the kokkos snaptr? - // let's copy pair_snap_kokkos by making a snaKK in header - ComputeSNAGrid::init(); - - // adjust neighbor list request for KOKKOS - - // taken from compute_coord_atom_kokkos - // this segfaults - /* - printf("^^^ before neigh request\n"); - auto request = neighbor->find_request(this); - request->set_kokkos_host(std::is_same::value && - !std::is_same::value); - request->set_kokkos_device(std::is_same::value); - */ - - - // taken from pair_snap_kokkos init - // compile errors with: - // error: pointer to incomplete class type "LAMMPS_NS::KokkosLMP" is not allowed" - /* - if (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"compute sna grid can currently only run on a single " - "CPU thread"); - - // this calls snaptr->init() - // we probably wanna call init of kokkos snaptr - ComputeSNAGrid::init(); - return; - } - - if (force->newton_pair == 0) - error->all(FLERR,"Pair style SNAP requires newton pair on"); - - // neighbor list request for KOKKOS - - neighflag = lmp->kokkos->neighflag; - - auto request = neighbor->add_request(this, NeighConst::REQ_FULL); - request->set_kokkos_host(std::is_same::value && - !std::is_same::value); - request->set_kokkos_device(std::is_same::value); - if (neighflag == FULL) - error->all(FLERR,"Must use half neighbor list style with pair snap/kk"); - */ - - // Overall, I think maybe this compute does not need a neighlist request because the original - // compute_sna_grid.cpp does not have one. 
-} +#include "compute_sna_grid_kokkos_impl.h" namespace LAMMPS_NS { -template class ComputeSNAGridKokkos; + +template class ComputeSNAGridKokkosDevice; #ifdef LMP_KOKKOS_GPU -template class ComputeSNAGridKokkos; +template class ComputeSNAGridKokkosHost; #endif + } diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 4261d207f7..9ab23f5bd2 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -13,9 +13,13 @@ #ifdef COMPUTE_CLASS // clang-format off -ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos); -//ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); -//ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk,ComputeSNAGridKokkosDevice); +ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkosDevice); +#ifdef LMP_KOKKOS_GPU +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosHost); +#else +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice); +#endif // clang-format on #else @@ -25,71 +29,79 @@ ComputeStyle(sna/grid/kk,ComputeSNAGridKokkos); #include "compute_sna_grid.h" #include "kokkos_type.h" +//#include "neigh_list_kokkos.h" #include "sna_kokkos.h" +//#include "pair_kokkos.h" namespace LAMMPS_NS { -//template -//struct TagComputeCoordAtom{}; - -// copying pair_snap_kokkos, template args are real_type and vector_length template class ComputeSNAGridKokkos : public ComputeSNAGrid { public: + //enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD}; + //enum {COUL_FLAG=0}; typedef DeviceType device_type; typedef ArrayTypes AT; + typedef EV_FLOAT value_type; static constexpr int vector_length = vector_length_; using real_type = real_type_; + //using complex = SNAComplex; ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; - void init() override; - //void compute_peratom() override; - //enum {NONE,CUTOFF,ORIENT}; - //template - //KOKKOS_INLINE_FUNCTION - //void operator()(TagComputeCoordAtom, const int&) const; + void init() override; 
+ //void compute_array(int, int) override; + //double memory_usage() override; protected: - - // these are used by pair_snap_kokkos - // neighflag gets set in init() - // what about host_flag? - // dunno... commented these out for now - int host_flag, neighflag; - SNAKokkos snaKK; - private: + using KKDeviceType = typename KKDevice::value; - /* - int inum; - - typename AT::t_x_array_randomread x; - typename ArrayTypes::t_int_1d_randomread type; - typename ArrayTypes::t_int_1d mask; - - typename AT::t_neighbors_2d d_neighbors; - typename AT::t_int_1d_randomread d_ilist; - typename AT::t_int_1d_randomread d_numneigh; - - typename AT::t_int_1d d_typelo; - typename AT::t_int_1d d_typehi; - - DAT::tdual_float_1d k_cvec; - typename AT::t_float_1d d_cvec; - DAT::tdual_float_2d k_carray; - typename AT::t_float_2d d_carray; - - typename AT::t_float_2d d_normv; - */ }; +// These wrapper classes exist to make the compute style factory happy/avoid having +// to extend the compute style factory to support Compute classes w/an arbitrary number +// of extra template parameters + +template +class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos { + + private: + using Base = ComputeSNAGridKokkos; + + public: + + ComputeSNAGridKokkosDevice(class LAMMPS *, int, char **); + //ComputeSNAGridKokkosDevice(class LAMMPS *); + + void init() override; + //double memory_usage() override; + +}; + +#ifdef LMP_KOKKOS_GPU +template +class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos { + + private: + using Base = ComputeSNAGridKokkos; + + public: + + ComputeSNAGridKokkosHost(class LAMMPS *, int, char **); + //ComputeSNAGridKokkosHost(class LAMMPS *); + + void init(); + //double memory_usage(); + +}; +#endif + } #endif #endif - diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h new file mode 100644 index 0000000000..e958fcdb45 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -0,0 +1,127 @@ +// clang-format off +/* -*- 
c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Evan Weinberg (NVIDIA) +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "kokkos.h" +#include "memory_kokkos.h" +#include "neighbor_kokkos.h" +#include "neigh_request.h" +#include "sna.h" + +#include +#include +#include + +#define MAXLINE 1024 +#define MAXWORD 3 + +namespace LAMMPS_NS { + +/* ---------------------------------------------------------------------- */ + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg) +{ + //respa_enable = 0; + + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + //datamask_read = EMPTY_MASK; + //datamask_modify = EMPTY_MASK; + + //host_flag = (execution_space == Host); +} + +/* ---------------------------------------------------------------------- */ + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + +} + +/* ---------------------------------------------------------------------- */ + +template +void ComputeSNAGridKokkos::init() +{ + + 
printf("^^^ inside ComputeSNAGridKokkos init\n"); + // from pair_snap_kokkos_impl.h : + /* + if (host_flag) { + if (lmp->kokkos->nthreads > 1) + error->all(FLERR,"Pair style snap/kk can currently only run on a single " + "CPU thread"); + + PairSNAP::init_style(); + return; + } + + if (force->newton_pair == 0) + error->all(FLERR,"Pair style SNAP requires newton pair on"); + + // neighbor list request for KOKKOS + + neighflag = lmp->kokkos->neighflag; + + auto request = neighbor->add_request(this, NeighConst::REQ_FULL); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + if (neighflag == FULL) + error->all(FLERR,"Must use half neighbor list style with pair snap/kk"); + */ +} + +/* ---------------------------------------------------------------------- + routines used by template reference classes +------------------------------------------------------------------------- */ + +template +ComputeSNAGridKokkosDevice::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridKokkosDevice::init() +{ + Base::init(); +} + +#ifdef LMP_KOKKOS_GPU +template +ComputeSNAGridKokkosHost::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridKokkosHost::init() +{ + Base::init(); +} +#endif + +} From de4dbec66100284c04f712f6788ff61699009222 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 12 Mar 2023 20:03:35 -0600 Subject: [PATCH 06/51] Remove swo --- src/KOKKOS/.compute_sna_grid_kokkos.h.swo | Bin 12288 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos.h.swo diff --git a/src/KOKKOS/.compute_sna_grid_kokkos.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos.h.swo deleted file mode 100644 index 12ede098aa5071704880ee2741eba65408343158..0000000000000000000000000000000000000000 
GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2&yO6%6~_xOKWt*KumlnUsqjj$Uhhnge+4tFR~crVpY!au-)LlltU1_DP)1mCLeo*%oj>m{-f zin^uG%y!qSSFhfydauUr&C-?2EA&FKL~tD=q#WPpu3mqQuKwgWVXlZeva4J6z2xcA z`3p~-J-e?E8edKoT0RScSXxpz*7Dl&@>;`kq9E=mZp)C_ZQ;3gH|lmHS?t_QXWpKB zAosulJkV3E!iA&c?3vSMt52PHj2?er;{Y9Vk=z5h2XYVO9>_hAdm#5f?t$C`xd;AV zJfPy+$Tdvn_H;toul?K7`|SJc^!jo7{GlD~>23a%dm#5f?t$C`xd(C&Q$Kmi;9hrtI& z3Hd2_1^D1Oa2)*o2ZX!@x`2a6!Gquq@ZWn0`49L6ybpc@ehpp%E~tW2;25|c{QC$Y zH^AHA8t8*6I0X)ae;y{}Z{V-s=O6&jfC{)5e0&cf{{sI2zXR96iy#J1f**qO;KtpA zdp;itOK=qbfO7;K26uzIz}JZD z4e%BC5{zC`JaoMlNx#TT@cz?;(isaMYCGm33plMb%VlM^{D>+0j7wcRj6~3Xn(Qm% zFd^8sR<>aJyFKO)b#(1v`^0fh>v5bucP45;D>Rta-9~t!vQ2lbxZ`OBJBApO@S{mH zNknYHLN^KoXJ*>HAmm=Vv!}*!anJXpnjmX*Y(_$+&eSUP?G7{GzNztRSFF;oA!L}N zWw}vr)8{ncK_s^Ao@8x4eNr3Hk35%pp{Mk%5v?XK+B8-xi+Igu*}P@56iGGh_)JP# zudJ-BH|UJ%eL&e+jG$(z(_wQlNdiqyPg{@rv}~1@#zah@gVkL z0G&(5A5LVROnEJrw`hH3FrCG9ylt^>#Awv#0#V^RLO1e!&lo0SA2f;+rN6s4F4V4E zuGYR~Bomu&SuDqGxJfjLWzncb`Bfzt9fUn*0$EaVA$7ZobUU^xRUJ;9WS>!~wy`r? 
z%u(2yT1NKbI1(ybiqtF}F6oxVLdwLZhan&t_JU0=wBBZiWAAJU7ROu+#Q9a^7_Bd0 zUB1tV9dz5am`Ma3*2iAV7Gbh6mzRgD78`VsFSF+(yK}x=?lZrK)v5P2c50G6MkHn_ z5~go!C9@Ps8gsUN9=26yp5o2ECWX0-AUDz)9tSv8I+NjrG zT&mPBnUCfYa{7Yqk`;)(6Zc*lePpNZO1rbe^*eN{iP#RG&Er6Ztt(^Z@U-UHs|~2e zhMV$2$M3nEo`{(z1}~zXKuXUV^joUg3#uo^x|kAg>-s+;FzlFgGX5-fMm6fCGy49k zZY5KQs29RO6p`xM%KApLW@Ba9i_czO$cp5Wt~6m*YW9{wB7v{NcAz=xk6?_w0hbvxNyO*wSrDT^=;qUZJkOiq)uQVF?HFc}5% z%R~L~LQ^7C6c>En=YDq1b3I|Q^mb7v5hJs>x0IHdbCvpq^QBUW zZh5Lh8#s1*48>8PNFo(RA@(%N=|-_pq>K7|6b2kAQE+XA)S9YMylt$Z(`QPhvnMC$ zfK?lM;KT`B1X5S?Y80Z@I?7BvJ@{g|a6gL87>cu?Ow%VQ@;^#mTQ5eUt;g*s70a?^ zZEbB8hbXiBZ-&YX%2H#=@?f?J) From 212b86405251bbdcc593a27cb0aec4ebc19b0374 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 11:19:07 -0600 Subject: [PATCH 07/51] Add all SNAP computations and 4D view Kokkos memory allocator --- src/KOKKOS/compute_sna_grid_kokkos.cpp | 56 ++ src/KOKKOS/compute_sna_grid_kokkos.h | 259 +++++++- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 765 +++++++++++++++++++++- src/KOKKOS/memory_kokkos.h | 50 ++ src/ML-SNAP/compute_grid.cpp | 4 +- src/ML-SNAP/compute_sna_grid.cpp | 29 +- src/ML-SNAP/compute_sna_grid.h | 12 +- 7 files changed, 1118 insertions(+), 57 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp index 197234cf1d..8a05ba7901 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -23,3 +23,59 @@ template class ComputeSNAGridKokkosHost; #endif } + + + + +// The following chunk will compile but we're gonna try a wrapper approach like pair snap. 
+/* +#include "compute_sna_grid_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "sna_kokkos.h" +#include "update.h" + +using namespace LAMMPS_NS; + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeSNAGrid(lmp, narg, arg) +{ + + printf("^^^ inside ComputeSNAGridKokkos constructor\n"); + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + +} + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + + +} + +namespace LAMMPS_NS { +template class ComputeSNAGridKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkos; +#endif +} +*/ + diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 9ab23f5bd2..b461f755b8 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -29,42 +29,233 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice); #include "compute_sna_grid.h" #include "kokkos_type.h" +//#include "pair_snap.h" +//#include "kokkos_type.h" //#include "neigh_list_kokkos.h" #include "sna_kokkos.h" //#include "pair_kokkos.h" namespace LAMMPS_NS { +// Routines for both the CPU and GPU backend +//template +//struct TagPairSNAPComputeForce{}; + + +// GPU backend only +/* +struct TagPairSNAPComputeNeigh{}; +struct TagPairSNAPComputeCayleyKlein{}; +struct TagPairSNAPPreUi{}; +struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence +struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence +struct 
TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; +struct TagPairSNAPComputeBi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS +struct TagPairSNAPComputeYi{}; +struct TagPairSNAPComputeYiWithZlist{}; +template +struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence +template +struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence +*/ +//struct TagPairSNAPPreUi{}; +struct TagCSNAGridComputeNeigh{}; +struct TagCSNAGridComputeCayleyKlein{}; +struct TagCSNAGridPreUi{}; +struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence +struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence +struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagCSNAGridComputeZi{}; +struct TagCSNAGridComputeBi{}; + +struct TagComputeSNAGridLoop{}; +struct TagComputeSNAGrid3D{}; +//struct TagCSNAGridTeam{}; + +// CPU backend only +/* +struct TagPairSNAPComputeNeighCPU{}; +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPTransformUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; +struct TagPairSNAPComputeDuidrjCPU{}; +struct TagPairSNAPComputeDeidrjCPU{}; +*/ +struct TagComputeSNAGridLoopCPU{}; + +//template template class ComputeSNAGridKokkos : public ComputeSNAGrid { public: - //enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD}; - //enum {COUL_FLAG=0}; typedef DeviceType device_type; typedef ArrayTypes AT; - typedef EV_FLOAT value_type; static constexpr int vector_length = vector_length_; using real_type = real_type_; - //using complex = SNAComplex; + using complex = SNAComplex; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + 
static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int tile_size_compute_bi = 2; + static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 
4 : 2; +#endif + + // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNAP>; + + // MDRangePolicy for the 3D grid loop: + template + using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; + + // Testing out team policies + template + using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, Kokkos::IndexType, Kokkos::IndexType, TagComputeSNAP>; + //using team_member = typename team_policy::member_type; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; void init() override; - //void compute_array(int, int) override; - //double memory_usage() override; - + void setup() override; + void compute_array() override; + + // Utility functions for teams + + template + void check_team_size_for(int, int&); + + template + void check_team_size_reduce(int, int&); + + // operator function for example team policy + //KOKKOS_INLINE_FUNCTION + //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoop, const int& ) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLoopCPU, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeNeigh,const 
typename Kokkos::TeamPolicy::member_type& team) const; + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + protected: + SNAKokkos snaKK; + int chunk_size, chunk_offset; + int host_flag; + int ntotal; + int total_range; // total number of loop iterations in grid + int zlen; //= nzhi-nzlo+1; + int ylen; //= nyhi-nylo+1; + int xlen; //= nxhi-nxlo+1; - using KKDeviceType = typename KKDevice::value; + double cutsq_tmp; // temporary cutsq until we get a view + + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + //Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; 
+ t_fparams_rnd rnd_cutsq; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + DAT::tdual_float_2d k_grid; + DAT::tdual_float_2d k_gridall; + typename AT::t_float_2d d_grid; + typename AT::t_float_2d d_gridall; + + //DAT::tdual_float_4d k_gridlocal; + //typedef Kokkos::DualView t_gridlocal_4d; + //typedef Kokkos::View t_4d; + typedef Kokkos::DualView tdual_float_4d; + tdual_float_4d k_gridlocal; + tdual_float_4d d_gridlocal; + + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); }; // These wrapper classes exist to make the compute style factory happy/avoid having -// to extend the compute style factory to support Compute classes w/an arbitrary number +// to extend the compute style factory to support Compute classes w/an arbitrary number // of extra template parameters template @@ -76,10 +267,9 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); +ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_KOKKOS_H + +#include "compute_sna_grid.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +//template +//struct TagComputeCoordAtom{}; + +template +class ComputeSNAGridKokkos : public ComputeSNAGrid { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + ComputeSNAGridKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridKokkos() override; + + private: + +}; + +} + +#endif +#endif +*/ + diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index e958fcdb45..b0cf30d070 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -3,12 +3,10 @@ LAMMPS - Large-scale Atomic/Molecular Massively Parallel 
Simulator https://www.lammps.org/, Sandia National Laboratories LAMMPS development team: developers@lammps.org - Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ @@ -18,17 +16,20 @@ ------------------------------------------------------------------------- */ #include "compute_sna_grid_kokkos.h" +#include "pair_snap_kokkos.h" #include "atom_kokkos.h" #include "atom_masks.h" #include "comm.h" #include "error.h" -#include "force.h" -#include "kokkos.h" #include "memory_kokkos.h" -#include "neighbor_kokkos.h" +#include "modify.h" +#include "neigh_list.h" #include "neigh_request.h" +#include "neighbor_kokkos.h" +//#include "sna_kokkos.h" #include "sna.h" +#include "update.h" #include #include @@ -39,69 +40,757 @@ namespace LAMMPS_NS { -/* ---------------------------------------------------------------------- */ +// Constructor template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid (lmp, narg, arg) +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) { //respa_enable = 0; kokkosable = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; - //datamask_read = EMPTY_MASK; - //datamask_modify = EMPTY_MASK; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; - //host_flag = (execution_space == Host); + k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // ComputeSNAGrid constructor allocates `map` so let's do same here. 
+ // actually, let's move this down to init + //int n = atom->ntypes; + //printf("^^^ realloc d_map\n"); + //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + + printf("^^^^^ cutsq: %f\n", cutsq[1][1]); + + cutsq_tmp = cutsq[1][1]; + + //memoryKK->create_kokkos(k_gridlocal, + //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); } -/* ---------------------------------------------------------------------- */ +// Destructor template ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { if (copymode) return; + //memoryKK->destroy_kokkos(k_eatom,eatom); + //memoryKK->destroy_kokkos(k_vatom,vatom); + printf("^^^ Finish ComputeSNAGridKokkos destructor\n"); } -/* ---------------------------------------------------------------------- */ +// Init template void ComputeSNAGridKokkos::init() { + printf("^^^ Begin ComputeSNAGridKokkos init()\n"); + // The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it + // calls the original coeff function. So let's do that here: + + ComputeSNAGrid::init(); + + // Set up element lists + printf("^^^ Begin kokkos reallocs\n"); + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements); + // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
+ //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); + MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements); + int n = atom->ntypes; + //printf("^^^ realloc d_map\n"); + printf("^^^ n: %d\n", n); + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + printf("^^^ begin mirrow view creation\n"); + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + + printf("^^^ begin loop over elements, nelements = %d\n", nelements); + for (int ielem = 0; ielem < nelements; ielem++) { + printf("^^^^^ ielem %d\n", ielem); + h_radelem(ielem) = radelem[ielem]; + printf("^^^^^ 1\n"); + h_wjelem(ielem) = wjelem[ielem]; + printf("^^^^^ 2\n"); + if (switchinnerflag){ + h_sinnerelem(ielem) = sinnerelem[ielem]; + h_dinnerelem(ielem) = dinnerelem[ielem]; + } + // pair snap kokkos uses `ncoeffall` in the following. + //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { + // h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff]; + //} + } + + printf("^^^ begin loop over map\n"); + // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where + // some things like `map` get allocated regardless of chem flag. 
+ if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + printf("%d\n", map[i]); + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + + snaKK = SNAKokkos(rfac0,twojmax, + rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK.grow_rij(0,0); + snaKK.init(); - printf("^^^ inside ComputeSNAGridKokkos init\n"); - // from pair_snap_kokkos_impl.h : - /* if (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style snap/kk can currently only run on a single " - "CPU thread"); - PairSNAP::init_style(); + // The following lmp->kokkos will compile error with pointer to incomplete class type not allowed. + //if (lmp->kokkos->nthreads > 1) + // error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single " + // "CPU thread"); + + ComputeSNAGrid::init(); return; } - if (force->newton_pair == 0) - error->all(FLERR,"Pair style SNAP requires newton pair on"); + printf("^^^ Finished ComputeSNAGridKokkos init\n"); - // neighbor list request for KOKKOS - - neighflag = lmp->kokkos->neighflag; - - auto request = neighbor->add_request(this, NeighConst::REQ_FULL); - request->set_kokkos_host(std::is_same::value && - !std::is_same::value); - request->set_kokkos_device(std::is_same::value); - if (neighflag == FULL) - error->all(FLERR,"Must use half neighbor list style with pair snap/kk"); - */ } +// Setup + +template +void ComputeSNAGridKokkos::setup() +{ + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. + // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
+ //ComputeGrid::setup(); + printf("^^^^^ SETUP!\n"); + //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); + ComputeGrid::set_grid_global(); + ComputeGrid::set_grid_local(); + + // allocate arrays + + memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); + memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { + gridlocal_allocated = 1; + memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, + nyhi, nxlo, nxhi, "grid:gridlocal"); + } + array = gridall; +} + +// Compute + +template +void ComputeSNAGridKokkos::compute_array() +{ + printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n"); + + if (DeviceType::in_parallel()) { + printf("^^^ compute_array() is a host function\n"); + } else { + printf("^^^ compute_array() is not a host function\n"); + } + + if (host_flag) { + /* + atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); + PairSNAP::compute(eflag_in,vflag_in); + atomKK->modified(Host,F_MASK); + */ + return; + } + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + // This will error because trying to access host view on the device: + //printf("x(0,0): %f\n", x(0,0)); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range); + + //printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. 
+ + //const int ntotal = atomKK->nlocal + atomKK->nghost; + ntotal = atomKK->nlocal + atomKK->nghost; + //printf("^^^ ntotal: %d\n", ntotal); + + // ensure rij, inside, and typej are of size jnum + // snaKK.grow_rij(int, int) requires 2 args where one is a chunksize. + + chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user + //printf("^^^ chunk_size: %d\n", chunk_size); + snaKK.grow_rij(chunk_size, ntotal); + + // Launch 3 teams of the maximum number of threads per team + //const int team_size_max = team_policy(3, 1).team_size_max( + // TagCSNAGridTeamPolicy, Kokkos::ParallelForTag()); + //typename Kokkos::TeamPolicy team_policy_test(3,1); + + // Using custom policy: + /* + CSNAGridTeamPolicy team_policy(chunk_size,team_size_compute_neigh,vector_length); + //team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("TeamPolicy",team_policy,*this); + */ + + + chunk_size = total_range; + printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length); + // team_size_compute_neigh is defined in `pair_snap_kokkos.h` + + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + //ComputeNeigh + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * ntotal); + + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + //PreUi + { + // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` + 
Snap3DRangePolicy + policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * ntotal * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * ntotal; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // team_size_transform_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } + + //Compute bispectrum in AoSoA data layout, transform Bi + //if (quadraticflag || eflag) { + + //ComputeZi 
+ const int idxz_max = snaKK.idxz_max; + Snap3DRangePolicy + policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + + //ComputeBi + const int idxb_max = snaKK.idxb_max; + Snap3DRangePolicy + policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + + //Looks like best way to grab blist is in a parallel_for + + //Transform data layout of blist out of AoSoA + //We need this because `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA, which at best would only be beneficial on the margins + //NOTE: Do we need this in compute sna/grid/kk? + /* + Snap3DRangePolicy + policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + */ + + + + // let's try a simple parallel for loop + // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this + // function, because this is a host-device function. + /* + typename Kokkos::RangePolicy policy_loop(0,4); + Kokkos::parallel_for("Loop",policy_loop,*this); + */ + + + // Simple working loop: + /* + Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) { + printf("Greeting from iteration %i\n",i); + }); + */ + + /* + // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this + // function, because this is a host-device function. + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + Snap3DRangePolicy + policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1}); + Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + */ + + // Simple example of 3D MD range policy. + // Begin loop over grid points. 
+ /* + // NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this + // function, but we get it in the above function. + int n = 3; // bounds for mdrange policy + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n}); + Kokkos::parallel_for("3D",policy_3d,*this); + */ + + printf("^^^ End ComputeSNAGridKokkos compute_array()\n"); +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +/* + Simple team policy functor seeing how many layers deep we can go with the parallelism. + */ +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos + + SNAKokkos my_sna = snaKK; + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size()); + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. 
+ const int tile_size = ntotal; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift); + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + //printf("ii: %d\n", ii); + + // convert to grid indices + + int iz = ii/(xlen*ylen); + int i2 = ii - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + //int igrid = iz * (nx * ny) + iy * nx + ix; + + // these end up being the same...? + //printf("ii igrid: %d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + const double xtmp = xgrid[0]; + const double ytmp = xgrid[1]; + const double ztmp = xgrid[2]; + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. + if (triclinic){ + printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); + } else { + //printf("We are not triclinic\n"); + } + + // can check xgrid positions with original + //printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]); + + // Compute the number of neighbors, store rsq + int ninside = 0; + // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]? 
+ //printf("ntotal: %d\n", ntotal); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& count) { + + // From pair snap/kk : + /* + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + */ + // From compute sna/grid/kk : + /* + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + */ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + //printf("dx: %f\n", dx); + + //const double rsq = delx * delx + dely * dely + delz * delz; + int jtype = type(j); + //printf("jtype: %d\n", jtype); + //int jelem = 0; + //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) { + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + //if (rsq >= rnd_cutsq(itype,jtype)) { + if (rsq >= cutsq_tmp){ + jtype = -1; // use -1 to signal it's outside the radius + } + //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); + + if (j > 340){ + printf("j: %d\n", j); + } + + //printf("j: %d\n", j); + type_cache[j] = jtype; + + if (jtype >= 0) + count++; + + }, ninside); + + //printf("ninside: %d\n", ninside); + + d_ninside(ii) = ninside; + + // TODO: Make sure itype is appropriate instead of ielem + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& offset, bool final) { + + const int jtype = type_cache[j]; + + if (jtype >= 0) { + if (final) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const int jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + 
my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); +} + + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + //printf("^^^ ComputeCayleyKlein\n"); + + /* + if (DeviceType::in_parallel()) { + printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n"); + } else { + printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n"); + } + */ + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + const int ninside = ntotal; //d_ninside(ii); + if (jnbor >= ninside) return; + + my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend_location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1)); + const int jbend = jj_jbend / ntotal; + int jj = jj_jbend - 
jbend * ntotal; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * ntotal; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxu > my_sna.idxu_max) return; + + int elem_count = chemflag ? 
nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++){ + + const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + + auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + + if (mapper.flip_sign == 1){ + utot_im = -utot_im; + } else if (mapper.flip_sign == -1){ + utot_re = -utot_re; + } + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +/* +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + +} +*/ + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. 
It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagComputeSNAGridLoopCPU,const int& ii) const { + +} + +/* ---------------------------------------------------------------------- + utility functions +------------------------------------------------------------------------- */ + +template +template +void ComputeSNAGridKokkos::check_team_size_for(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +void ComputeSNAGridKokkos::check_team_size_reduce(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int ComputeSNAGridKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- */ + /* ---------------------------------------------------------------------- routines used by template reference classes ------------------------------------------------------------------------- */ + template ComputeSNAGridKokkosDevice::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridKokkos(lmp, narg, arg) { ; } @@ -112,6 +801,12 @@ void ComputeSNAGridKokkosDevice::init() Base::init(); } +template +void ComputeSNAGridKokkosDevice::compute_array() +{ + Base::compute_array(); +} + #ifdef LMP_KOKKOS_GPU 
template ComputeSNAGridKokkosHost::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg) @@ -122,6 +817,12 @@ void ComputeSNAGridKokkosHost::init() { Base::init(); } + +template +void ComputeSNAGridKokkosHost::compute_array() +{ + Base::compute_array(); +} #endif } diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 9d894a344a..35a7ceaeb4 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -183,6 +183,56 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, return data; } +/* ---------------------------------------------------------------------- + create a 4d array with indices 2,3,4 offset, but not first + 2nd index from n2lo to n2hi inclusive + 3rd index from n3lo to n3hi inclusive + 4th index from n4lo to n4hi inclusive + cannot grow it +------------------------------------------------------------------------- */ + +template +TYPE create4d_offset_kokkos(TYPE &data, typename TYPE::value_type ****&array, + int n1, int n2lo, int n2hi, int n3lo, int n3hi, int n4lo, int n4hi, + const char *name) +{ + //if (n1 <= 0 || n2lo > n2hi || n3lo > n3hi || n4lo > n4hi) array = nullptr; + + printf("^^^^^ memoryKK->create_4d_offset_kokkos\n"); + + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + int n4 = n4hi - n4lo + 1; + data = TYPE(std::string(name),n1,n2,n3,n4); + bigint nbytes = ((bigint) sizeof(typename TYPE::value_type ***)) * n1; + array = (typename TYPE::value_type ****) smalloc(nbytes,name); + + for (int i = 0; i < n1; i++) { + if (n2 == 0) { + array[i] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type **)) * n2; + array[i] = (typename TYPE::value_type ***) smalloc(nbytes,name); + for (int j = 0; j < n2; j++){ + if (n3 == 0){ + array[i][j] = nullptr; + } else { + nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n3; + array[i][j] = (typename TYPE::value_type **) smalloc(nbytes, name); + for (int k = 0; k < n3; k++){ + if (n4 == 0) + array[i][j][k] = 
nullptr; + else + array[i][j][k] = &data.h_view(i,j,k,0); + } + } + } + } + } + + return data; +} + template TYPE create_kokkos(TYPE &data, HTYPE &h_data, typename TYPE::value_type **&array, int n1, int n2, diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index 2179bb8ebd..ad70df30e8 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -57,6 +57,8 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) : ComputeGrid::~ComputeGrid() { + printf("^^^ begin ComputeGrid destructor\n"); + if (copymode) return; deallocate(); } @@ -111,7 +113,7 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { // allocate arrays - + printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n"); memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp index 4243202545..36780213f2 100644 --- a/src/ML-SNAP/compute_sna_grid.cpp +++ b/src/ML-SNAP/compute_sna_grid.cpp @@ -31,14 +31,14 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : // skip over arguments used by base class // so that argument positions are identical to // regular per-atom compute - + printf("^^^ inside compute sna grid constructor\n"); arg += nargbase; narg -= nargbase; // begin code common to all SNAP computes - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 6 + 2 * ntypes; @@ -56,6 +56,8 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : wselfallflag = 0; switchinnerflag = 0; nelements = 1; + chunksize = 32768; + parallel_thresh = 8192; // process required arguments @@ -112,6 +114,7 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int 
narg, char **arg) : quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp); iarg += 2; } else if (strcmp(arg[iarg], "chem") == 0) { + printf("^^^ chem flag, creating map\n"); if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style); chemflag = 1; memory->create(map, ntypes + 1, "compute_sna_grid:map"); @@ -181,11 +184,17 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid::~ComputeSNAGrid() { - memory->destroy(radelem); - memory->destroy(wjelem); - memory->destroy(cutsq); - delete snaptr; + if (copymode) return; + printf("^^^ begin ComputeSNAGrid destructor\n"); + memory->destroy(radelem); + printf("^^^^ CSG 1\n"); + memory->destroy(wjelem); + printf("^^^^ CSG 2\n"); + memory->destroy(cutsq); + printf("^^^^ CSG 3\n"); + delete snaptr; + printf("^^^^ CSG 4\n"); if (chemflag) memory->destroy(map); } @@ -196,12 +205,16 @@ void ComputeSNAGrid::init() if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0)) error->warning(FLERR, "More than one instance of compute sna/grid"); snaptr->init(); + + printf("^^^ finished ComputeSNAGrid init()\n"); } /* ---------------------------------------------------------------------- */ void ComputeSNAGrid::compute_array() { + printf("^^^ inside ComputeSNAGrid compute_array()\n"); + invoked_array = update->ntimestep; // compute sna for each gridpoint @@ -211,6 +224,8 @@ void ComputeSNAGrid::compute_array() int *const type = atom->type; const int ntotal = atom->nlocal + atom->nghost; + printf("^^^ ntotal: %d\n", ntotal); + // ensure rij, inside, and typej are of size jnum snaptr->grow_rij(ntotal); diff --git a/src/ML-SNAP/compute_sna_grid.h b/src/ML-SNAP/compute_sna_grid.h index 3a5a373826..a158c2342f 100644 --- a/src/ML-SNAP/compute_sna_grid.h +++ b/src/ML-SNAP/compute_sna_grid.h @@ -31,21 +31,27 @@ class ComputeSNAGrid : public ComputeGrid { void init() override; void compute_array() override; double memory_usage() override; + int ncoeff,nelements; // 
public for kokkos, but could go in the protected block now - private: - int ncoeff; + protected: + //int ncoeff; double **cutsq; double rcutfac; double *radelem; double *wjelem; int *map; // map types to [0,nelements) - int nelements, chemflag; + int chemflag; int switchinnerflag; double *sinnerelem; double *dinnerelem; + int parallel_thresh; class SNA *snaptr; double cutmax; int quadraticflag; + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + int chunksize; + }; } // namespace LAMMPS_NS From 75392648469488627f091138314cfc3ee59121ea Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 13:08:24 -0600 Subject: [PATCH 08/51] Sync device and host compute arrays --- src/KOKKOS/compute_sna_grid_kokkos.h | 10 +++-- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 48 ++++++++--------------- src/KOKKOS/kokkos_type.h | 16 ++++++++ 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index b461f755b8..6b85300cda 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -242,9 +242,13 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { //DAT::tdual_float_4d k_gridlocal; //typedef Kokkos::DualView t_gridlocal_4d; //typedef Kokkos::View t_4d; - typedef Kokkos::DualView tdual_float_4d; - tdual_float_4d k_gridlocal; - tdual_float_4d d_gridlocal; + // should we use LMPDeviceType below? 
+ //typedef Kokkos::DualView tdual_float_4d; + //typedef tdual_float_4d::t_dev tdev_float_4d; + //tdual_float_4d k_gridlocal; + //tdev_float_4d d_gridlocal; + DAT::tdual_float_4d k_gridlocal; + typename AT::t_float_4d d_gridlocal; // Utility routine which wraps computing per-team scratch size requirements for diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index b0cf30d070..583b5d1a46 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -198,6 +198,10 @@ void ComputeSNAGridKokkos::setup() nyhi, nxlo, nxhi, "grid:gridlocal"); } array = gridall; + + d_gridlocal = k_gridlocal.template view(); + d_grid = k_grid.template view(); + d_gridall = k_gridall.template view(); } // Compute @@ -372,41 +376,21 @@ void ComputeSNAGridKokkos::compute_array() */ + // populate the gridlocal array + // best to do parallel loop over grid points again + // ... - // let's try a simple parallel for loop - // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this - // function, because this is a host-device function. - /* - typename Kokkos::RangePolicy policy_loop(0,4); - Kokkos::parallel_for("Loop",policy_loop,*this); - */ + // d_grid(0,0) = 1.0; // attempt to access inaccessible memory space + k_gridlocal.template modify(); + k_gridlocal.template sync(); - // Simple working loop: - /* - Kokkos::parallel_for("Loop1", 4, KOKKOS_LAMBDA (const int& i) { - printf("Greeting from iteration %i\n",i); - }); - */ + k_grid.template modify(); + k_grid.template sync(); - /* - // NOTE: We get the compiler error calling host function DeviceType::in_parallel() in this - // function, because this is a host-device function. 
- const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - Snap3DRangePolicy - policy_compute_ck({0,0,0},{vector_length,ntotal,chunk_size_div},{vector_length,tile_size_compute_ck,1}); - Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); - */ - - // Simple example of 3D MD range policy. - // Begin loop over grid points. - /* - // NOTE: We don't get the compiler error calling host function DeviceType::in_parallel() in this - // function, but we get it in the above function. - int n = 3; // bounds for mdrange policy - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagComputeSNAGrid3D> policy_3d({0,0,0},{n,n,n}); - Kokkos::parallel_for("3D",policy_3d,*this); - */ + k_gridall.template modify(); + k_gridall.template sync(); + printf("^^^ End ComputeSNAGridKokkos compute_array()\n"); } @@ -437,6 +421,8 @@ void ComputeSNAGridKokkos::operator() (Tag int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; + d_gridall(ii,0) = 100.0; + // get a pointer to scratch memory // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. 
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index a496f6ff94..456a22ac56 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -717,6 +717,14 @@ typedef tdual_float_3d::t_dev_um t_float_3d_um; typedef tdual_float_3d::t_dev_const_um t_float_3d_const_um; typedef tdual_float_3d::t_dev_const_randomread t_float_3d_randomread; +//4d float array n +typedef Kokkos::DualView tdual_float_4d; +typedef tdual_float_4d::t_dev t_float_4d; +typedef tdual_float_4d::t_dev_const t_float_4d_const; +typedef tdual_float_4d::t_dev_um t_float_4d_um; +typedef tdual_float_4d::t_dev_const_um t_float_4d_const_um; +typedef tdual_float_4d::t_dev_const_randomread t_float_4d_randomread; + #ifdef LMP_KOKKOS_NO_LEGACY typedef Kokkos::DualView tdual_float_1d_4; #else @@ -1017,6 +1025,14 @@ typedef tdual_float_2d::t_host_um t_float_2d_um; typedef tdual_float_2d::t_host_const_um t_float_2d_const_um; typedef tdual_float_2d::t_host_const_randomread t_float_2d_randomread; +//4d float array n +typedef Kokkos::DualView tdual_float_4d; +typedef tdual_float_4d::t_host t_float_4d; +typedef tdual_float_4d::t_host_const t_float_4d_const; +typedef tdual_float_4d::t_host_um t_float_4d_um; +typedef tdual_float_4d::t_host_const_um t_float_4d_const_um; +typedef tdual_float_4d::t_host_const_randomread t_float_4d_randomread; + #ifdef LMP_KOKKOS_NO_LEGACY typedef Kokkos::DualView tdual_float_1d_4; #else From bd1134c083c7035d3b1efabb06bc43f54a7521aa Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 17:21:27 -0600 Subject: [PATCH 09/51] Debug inability to deep copy --- src/KOKKOS/compute_sna_grid_kokkos.h | 15 ++ src/KOKKOS/compute_sna_grid_kokkos_impl.h | 267 +++++++++++++++++++--- src/ML-SNAP/compute_sna_grid.cpp | 4 +- 3 files changed, 255 insertions(+), 31 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 6b85300cda..abd1c985b6 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ 
b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -70,6 +70,9 @@ struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist struct TagCSNAGridComputeZi{}; struct TagCSNAGridComputeBi{}; +struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS +struct TagCSNAGridLocalFill{}; // fill the gridlocal array +struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce struct TagComputeSNAGridLoop{}; struct TagComputeSNAGrid3D{}; @@ -179,6 +182,9 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + // PrintNeigh + //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + // 3D case - used by parallel_for KOKKOS_INLINE_FUNCTION void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const; @@ -204,6 +210,15 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalFill2,const int& ii) const; + protected: SNAKokkos snaKK; diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 583b5d1a46..3148bf32ce 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -35,6 +35,8 @@ #include #include +#include + #define MAXLINE 1024 #define MAXWORD 3 @@ -46,7 +48,7 @@ template ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS 
*lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) { //respa_enable = 0; - + printf("^^^ Begin ComputeSNAGridKokkos constructor\n"); kokkosable = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -64,6 +66,10 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos //int n = atom->ntypes; //printf("^^^ realloc d_map\n"); //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + + printf("^^^ wjelem[0]: %f\n", wjelem[0]); + printf("^^^ wjelem[1]: %f\n", wjelem[1]); printf("^^^^^ cutsq: %f\n", cutsq[1][1]); @@ -71,7 +77,76 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos cutsq_tmp = cutsq[1][1]; //memoryKK->create_kokkos(k_gridlocal, - //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); + //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); + + + // Set up element lists + printf("^^^ Begin kokkos reallocs with nelements = %d\n", nelements); + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements); + // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
+ //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); + MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); + int n = atom->ntypes; + //printf("^^^ realloc d_map\n"); + printf("^^^ n: %d\n", n); + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + + printf("^^^ begin mirrow view creation\n"); + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + + printf("^^^ begin loop over elements, nelements = %d\n", nelements); + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + printf("^^^^^ i %d\n", i); + h_radelem(i) = radelem[i]; + h_wjelem(i) = wjelem[i]; + printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]); + printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i)); + if (switchinnerflag){ + h_sinnerelem(i) = sinnerelem[i]; + h_dinnerelem(i) = dinnerelem[i]; + } + // pair snap kokkos uses `ncoeffall` in the following. + //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { + // h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff]; + //} + } + + printf("^^^ begin loop over map\n"); + // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where + // some things like `map` get allocated regardless of chem flag. 
+ if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + printf("%d\n", map[i]); + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + + double bytes = MemKK::memory_usage(d_wjelem); + printf("^^^ bytes: %f\n", bytes); + + snaKK = SNAKokkos(rfac0,twojmax, + rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK.grow_rij(0,0); + snaKK.init(); + } // Destructor @@ -97,14 +172,15 @@ void ComputeSNAGridKokkos::init() ComputeSNAGrid::init(); + /* // Set up element lists printf("^^^ Begin kokkos reallocs\n"); MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); - MemKK::realloc_kokkos(d_wjelem,"pair:wjelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements); // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
//MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); - MemKK::realloc_kokkos(d_sinnerelem,"pair:sinnerelem",nelements); - MemKK::realloc_kokkos(d_dinnerelem,"pair:dinnerelem",nelements); + MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); int n = atom->ntypes; //printf("^^^ realloc d_map\n"); printf("^^^ n: %d\n", n); @@ -119,15 +195,16 @@ void ComputeSNAGridKokkos::init() auto h_map = Kokkos::create_mirror_view(d_map); printf("^^^ begin loop over elements, nelements = %d\n", nelements); - for (int ielem = 0; ielem < nelements; ielem++) { - printf("^^^^^ ielem %d\n", ielem); - h_radelem(ielem) = radelem[ielem]; - printf("^^^^^ 1\n"); - h_wjelem(ielem) = wjelem[ielem]; - printf("^^^^^ 2\n"); + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + printf("^^^^^ i %d\n", i); + h_radelem(i) = radelem[i]; + h_wjelem(i) = wjelem[i]; + printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]); + printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i)); if (switchinnerflag){ - h_sinnerelem(ielem) = sinnerelem[ielem]; - h_dinnerelem(ielem) = dinnerelem[ielem]; + h_sinnerelem(i) = sinnerelem[i]; + h_dinnerelem(i) = dinnerelem[i]; } // pair snap kokkos uses `ncoeffall` in the following. 
//for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { @@ -159,6 +236,7 @@ void ComputeSNAGridKokkos::init() rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); snaKK.grow_rij(0,0); snaKK.init(); + */ if (host_flag) { @@ -167,7 +245,7 @@ void ComputeSNAGridKokkos::init() // error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single " // "CPU thread"); - ComputeSNAGrid::init(); + //ComputeSNAGrid::init(); return; } @@ -363,18 +441,34 @@ void ComputeSNAGridKokkos::compute_array() policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - //Looks like best way to grab blist is in a parallel_for - //Transform data layout of blist out of AoSoA //We need this because `blist` gets used in ComputeForce which doesn't //take advantage of AoSoA, which at best would only be beneficial on the margins //NOTE: Do we need this in compute sna/grid/kk? 
- /* - Snap3DRangePolicy + Snap3DRangePolicy policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + + //Looks like best way to grab blist is in a parallel_for + + //GridFill + /* + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * ntotal); + + SnapAoSoATeamPolicy + policy_fill(chunk_size, team_size_compute_neigh, vector_length); + policy_fill = policy_fill.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("GridLocalFill",policy_fill,*this); + } */ + //GridFill2 + { + typename Kokkos::RangePolicy policy_fill(0,chunk_size); + Kokkos::parallel_for(policy_fill, *this); + } + // populate the gridlocal array // best to do parallel loop over grid points again @@ -408,6 +502,11 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos + //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); + //artificially set values here since we can't get the deep_copy to work + //d_wjelem[1] = 1.0; + //d_radelem[1] = 0.5; + SNAKokkos my_sna = snaKK; @@ -421,7 +520,7 @@ void ComputeSNAGridKokkos::operator() (Tag int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; - d_gridall(ii,0) = 100.0; + //d_gridall(ii,0) = 100.0; // get a pointer to scratch memory // This is used to cache whether or not an atom is within the cutoff. 
@@ -456,15 +555,16 @@ void ComputeSNAGridKokkos::operator() (Tag xgrid[0] = ix * delx; xgrid[1] = iy * dely; xgrid[2] = iz * delz; - const double xtmp = xgrid[0]; - const double ytmp = xgrid[1]; - const double ztmp = xgrid[2]; + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; // currently, all grid points are type 1 // not clear what a better choice would be const int itype = 1; - const int ielem = d_map[itype]; + int ielem = 0; + if (chemflag) ielem = d_map[itype]; const double radi = d_radelem[ielem]; // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. @@ -529,7 +629,8 @@ void ComputeSNAGridKokkos::operator() (Tag //printf("ninside: %d\n", ninside); - d_ninside(ii) = ninside; + d_ninside(ii) = ninside; + //printf("%d\n", d_ninside(ii)); // TODO: Make sure itype is appropriate instead of ielem Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), @@ -542,16 +643,45 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; - const int jelem = d_map[jtype]; + int jtype = type(j); + //printf("jtype: %d\n", jtype); + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + //d_wjelem[jelem] = 1.0; + //d_radelem[jelem] = 1.0; my_sna.rij(ii,offset,0) = static_cast(dx); my_sna.rij(ii,offset,1) = static_cast(dy); my_sna.rij(ii,offset,2) = static_cast(dz); - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + my_sna.wj(ii,offset) = static_cast(d_wjelem[jtype]); + //my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jtype])*rcutfac); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jtype])*rcutfac); my_sna.inside(ii,offset) = j; + + //printf("%f\n", my_sna.wj(ii,offset)); + + //printf("jelem: %d\n", jelem); + //printf("rij: %f %f 
%f\n", dx, dy, dz); + //printf("params: %f %f %f\n", d_wjelem[jtype], d_radelem[jtype], rcutfac); + //printf("%f %f %f\n", my_sna.rij(ii,offset,0), my_sna.rij(ii,offset,1), my_sna.rij(offset,2)); + //printf("%f %f %f\n", my_sna.wj(ii,offset), my_sna.rcutij(ii,offset), my_sna.inside(ii,offset)); + // we can't use std::cout on device code, maybe make another function for this? + //std::cout << my_sna.rij(ii,offset,0) << std::endl; + //printf("%f %f %f\n", dx, dy, dz); + // apparently isnan is also a host function and not allowed here... + /* + if (isnan(dx) || isnan(dy) || isnan(dz)){ + printf("Found a nan!\n"); + } + if (isnan(d_wjelem[jelem]) || isnan(radi) || isnan(d_radelem[jelem]) || isnan(rcutfac) || isnan(j)){ + printf("Found a nan 2!\n"); + } + */ + // Our best bet is to make another non-host function for printing + if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[itype] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[itype] + d_dinnerelem[jelem]); + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) my_sna.element(ii,offset) = jelem; @@ -621,7 +751,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int ii = iatom_mod + vector_length * iatom_div; if (ii >= chunk_size) return; - const int ninside = d_ninside(ii); + const int ninside = d_ninside(ii); // use ntotal or d_ninside? 
if (jj >= ninside) return; my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); @@ -713,6 +843,83 @@ void ComputeSNAGridKokkos::operator() (Tag my_sna.compute_bi(iatom_mod,jjb,iatom_div); } +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(iatom, itriple, idxb) = blocal; + } + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy::member_type& team) const { + + // this function is following the same procedure in ComputeNeigh so that we can fill the grid + + SNAKokkos my_sna = snaKK; + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size()); + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + + //d_gridall(ii,0) = 100.0; + + const auto idxb_max = snaKK.idxb_max; + + // linear contributions + + + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb); + } + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill2, const int& ii) const { + SNAKokkos my_sna = snaKK; + + const auto idxb_max = snaKK.idxb_max; + + // linear contributions + + for (int 
icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + //printf("blist: %f\n", my_sna.blist(ii,idx_chem,idxb)); + d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb); + + if (icoeff == 0){ + //printf("%f\n", my_sna.blist(ii,idx_chem,idxb)); + } + } + +} + /* template KOKKOS_INLINE_FUNCTION diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp index 36780213f2..9125b7dcd4 100644 --- a/src/ML-SNAP/compute_sna_grid.cpp +++ b/src/ML-SNAP/compute_sna_grid.cpp @@ -69,8 +69,10 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : twojmax = utils::inumeric(FLERR, arg[5], false, lmp); for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp); - for (int i = 0; i < ntypes; i++) + for (int i = 0; i < ntypes; i++) { wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp); + printf("^^^^^ ComputeSNAGrid wj: %f\n", wjelem[i+1]); + } // construct cutsq From 02122c809c4e3df9b092b3cf7bfb9f154faa4eef Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 17:52:35 -0600 Subject: [PATCH 10/51] Change ntotal to n_ninside --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 3148bf32ce..d21d29485b 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -712,9 +712,11 @@ void ComputeSNAGridKokkos::operator() (Tag const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; - const int ninside = ntotal; //d_ninside(ii); + const int ninside = d_ninside(ii); // use d_ninside or ntotal? 
if (jnbor >= ninside) return; + printf("ninside: %d\n", ninside); + my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } From a3d8ab308861a02b77e07694c36bbebf5f8a14a7 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 19:03:25 -0600 Subject: [PATCH 11/51] Add cutoff view properly --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 22 +++++++++++++++++----- src/KOKKOS/pair_snap_kokkos_impl.h | 2 ++ src/KOKKOS/sna_kokkos_impl.h | 3 +++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index d21d29485b..6c4e11b25a 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -76,6 +76,14 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos cutsq_tmp = cutsq[1][1]; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp; + k_cutsq.template modify(); + } + } + + //memoryKK->create_kokkos(k_gridlocal, //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); @@ -502,10 +510,13 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos - //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); + if (d_wjelem[1] > 0){ + printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); + } //artificially set values here since we can't get the deep_copy to work //d_wjelem[1] = 1.0; //d_radelem[1] = 0.5; + //printf("%f\n", rnd_cutsq(1,1)); SNAKokkos my_sna = snaKK; @@ -609,11 +620,12 @@ void ComputeSNAGridKokkos::operator() (Tag //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) { const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - //if (rsq >= rnd_cutsq(itype,jtype)) { - if (rsq >= cutsq_tmp){ + //if (rsq >= cutsq_tmp){ + if 
(rsq >= rnd_cutsq(itype,jtype)) { jtype = -1; // use -1 to signal it's outside the radius + } else { + //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); } - //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); if (j > 340){ printf("j: %d\n", j); @@ -715,7 +727,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int ninside = d_ninside(ii); // use d_ninside or ntotal? if (jnbor >= ninside) return; - printf("ninside: %d\n", ninside); + //printf("ninside: %d\n", ninside); my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 45bacb4c97..02ba7f1604 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -664,6 +664,8 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1)); SNAKokkos my_sna = snaKK; // extract atom number diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index fd58f1c4f3..ba23a38af2 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -393,6 +393,9 @@ void SNAKokkos::compute_cayley_klein(const const real_type z0 = r * cs / sn; const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + //printf("%f %f %f\n", sn, cs, z0); + //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0); + const real_type wj_local = wj(iatom, jnbor); real_type sfac, dsfac; compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac); From a720328770ddae90d7a08a534e6433244d9f959e Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 21:32:46 -0600 Subject: [PATCH 12/51] Matching descriptors when no neighbors, good checkpoint for debugging --- src/KOKKOS/compute_sna_grid_kokkos.h | 1 + src/KOKKOS/compute_sna_grid_kokkos_impl.h | 30 
++++++++++++++++------- src/KOKKOS/sna_kokkos_impl.h | 4 ++- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index abd1c985b6..571e09742e 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -240,6 +240,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_test; // test view typedef Kokkos::DualView tdual_fparams; tdual_fparams k_cutsq; diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 6c4e11b25a..ee080fab3b 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -96,6 +96,9 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); + // test + MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements); + int n = atom->ntypes; //printf("^^^ realloc d_map\n"); printf("^^^ n: %d\n", n); @@ -108,13 +111,16 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); auto h_map = Kokkos::create_mirror_view(d_map); + // test + auto h_test = Kokkos::create_mirror_view(d_test); + h_test(0) = 2.0; printf("^^^ begin loop over elements, nelements = %d\n", nelements); // start from index 1 because of how compute sna/grid is for (int i = 1; i <= atom->ntypes; i++) { printf("^^^^^ i %d\n", i); - h_radelem(i) = radelem[i]; - h_wjelem(i) = wjelem[i]; + h_radelem(i-1) = radelem[i]; + h_wjelem(i-1) = wjelem[i]; 
printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]); printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i)); if (switchinnerflag){ @@ -146,6 +152,8 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos if (chemflag){ Kokkos::deep_copy(d_map,h_map); } + // test + Kokkos::deep_copy(d_test,h_test); double bytes = MemKK::memory_usage(d_wjelem); printf("^^^ bytes: %f\n", bytes); @@ -510,13 +518,14 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos - if (d_wjelem[1] > 0){ - printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); - } + //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); //artificially set values here since we can't get the deep_copy to work //d_wjelem[1] = 1.0; //d_radelem[1] = 0.5; - //printf("%f\n", rnd_cutsq(1,1)); + //printf("%f\n", rnd_cutsq(1,1)); + + //Print the test view to see that the deep copy works: + //printf("%f\n", d_test(0)); SNAKokkos my_sna = snaKK; @@ -569,6 +578,7 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; + //printf("%f %f %f\n", xtmp, ytmp, ztmp); // currently, all grid points are type 1 // not clear what a better choice would be @@ -665,9 +675,10 @@ void ComputeSNAGridKokkos::operator() (Tag my_sna.rij(ii,offset,1) = static_cast(dy); my_sna.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp - my_sna.wj(ii,offset) = static_cast(d_wjelem[jtype]); + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); //my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jtype])*rcutfac); - my_sna.rcutij(ii,offset) = static_cast((2.0 * 
d_radelem[jtype])*rcutfac); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); my_sna.inside(ii,offset) = j; //printf("%f\n", my_sna.wj(ii,offset)); @@ -741,7 +752,8 @@ void ComputeSNAGridKokkos::operator() (Tag if (ii >= chunk_size) return; int itype = type(ii); - int ielem = d_map[itype]; + //int ielem = d_map[itype]; + int ielem = 0; my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); } diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index ba23a38af2..8102a8b6b7 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -393,7 +393,8 @@ void SNAKokkos::compute_cayley_klein(const const real_type z0 = r * cs / sn; const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - //printf("%f %f %f\n", sn, cs, z0); + //printf("jnbor: %d %f %f %f %f %f\n", jnbor, x,y,z, rfac0, rcut); + //printf("%f %f %f %f %f %f %f\n", rscale0, r, rmin0, theta0, sn, cs, z0); //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0); const real_type wj_local = wj(iatom, jnbor); @@ -773,6 +774,7 @@ void SNAKokkos::compute_bi(const int& iato sumzu -= bzero[j]; } } + //printf("%f\n", sumzu); blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu; //} // end loop over j //} // end loop over j1, j2 From d75ceabfb038b7a76d74b2c541578dd008f49d24 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 8 Apr 2023 21:55:04 -0600 Subject: [PATCH 13/51] Fix neighbor criteria so atoms sharing positions with gridpoints aren't included as neighbors --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index ee080fab3b..9041509e3f 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -578,7 +578,7 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; - 
//printf("%f %f %f\n", xtmp, ytmp, ztmp); + printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp); // currently, all grid points are type 1 // not clear what a better choice would be @@ -631,10 +631,10 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; //if (rsq >= cutsq_tmp){ - if (rsq >= rnd_cutsq(itype,jtype)) { + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) { jtype = -1; // use -1 to signal it's outside the radius } else { - //printf("jtype rsq rnd_cutsq: %d %f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); + printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); } if (j > 340){ @@ -667,6 +667,9 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT dz = x(j,2) - ztmp; int jtype = type(j); //printf("jtype: %d\n", jtype); + if (dx==0 && dy==0 && dz==0){ + printf("rij: %f %f %f\n", xtmp, ytmp, ztmp); + } int jelem = 0; if (chemflag) jelem = d_map[jtype]; //d_wjelem[jelem] = 1.0; From 40db9b1701a130bafd0689bfb5c9a0b3b9ca530b Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 9 Apr 2023 15:54:10 -0600 Subject: [PATCH 14/51] Agreement between Kokkos and original compute sna/grid with switchflag = 1; note that switchflag = 0 gives wrongly zeroed values for Kokkos because of bug in compute_s_dsfac function of sna_kokkos_impl.h causing sfac to be zero --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 10 ++++------ src/KOKKOS/sna_kokkos_impl.h | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 9041509e3f..b37082ca5f 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -578,7 +578,7 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; - printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp); + //printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp); // currently, 
all grid points are type 1 // not clear what a better choice would be @@ -631,14 +631,11 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; //if (rsq >= cutsq_tmp){ + // don't include atoms that share location with grid point if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) { jtype = -1; // use -1 to signal it's outside the radius } else { - printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); - } - - if (j > 340){ - printf("j: %d\n", j); + //printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); } //printf("j: %d\n", j); @@ -830,6 +827,7 @@ void ComputeSNAGridKokkos::operator() (Tag auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + //printf("^^^ utot: %f %f\n", utot_re, utot_im); if (mapper.flip_sign == 1){ utot_im = -utot_im; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8102a8b6b7..55256f60cd 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -400,6 +400,7 @@ void SNAKokkos::compute_cayley_klein(const const real_type wj_local = wj(iatom, jnbor); real_type sfac, dsfac; compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac); + //printf("^^^ sfac wj_local: %f %f\n", sfac, wj_local); sfac *= wj_local; dsfac *= wj_local; @@ -520,6 +521,8 @@ void SNAKokkos::compute_ui_small(const typ const complex b = b_pack(iatom_mod, jnbor, iatom_div); const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + //printf("^^^ %f %f %f %f %f\n", a.re, a.im, b.re, b.im, sfac); + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); // we need to "choose" when to bend @@ -606,6 +609,7 @@ void SNAKokkos::evaluate_ui_jbend(const Wi ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); } + //printf("^^^ ulist %f %f\n", ulist_accum.re, ulist_accum.im); 
ulist_wrapper.set(ma, ulist_accum); } @@ -647,7 +651,7 @@ void SNAKokkos::evaluate_ui_jbend(const Wi } ulist_wrapper.set(ma, ulist_accum); - + //printf("^^^ ulist_accum: %f %f\n", ulist_accum.re, ulist_accum.im); mb++; } @@ -656,10 +660,15 @@ void SNAKokkos::evaluate_ui_jbend(const Wi for (int ma = 0; ma < j; ma++) { const complex ulist_prev = ulist_wrapper.get(ma); + //printf("ulist_prev %f %f\n", ulist_prev.re, ulist_prev.im); // atomic add the previous level here Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + + // see if we can see this value + //printf("^^^ %f\n", ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)); + //printf("^^^ sfac: %f\n", sfac); } } @@ -750,6 +759,7 @@ void SNAKokkos::compute_bi(const int& iato const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); + //printf("^^^ %f %f %f %f\n", utot.re, zloc.re, utot.im, zloc.im); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -875,7 +885,7 @@ typename SNAKokkos::complex SNAKokkos::complex SNAKokkos::complex SNAKokkos::compute_s_dsfac(const real constexpr real_type zero = static_cast(0.0); constexpr real_type onehalf = static_cast(0.5); + //printf("^^^ flags: %d %d\n", switch_flag, switch_inner_flag); + if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; } else if (switch_flag == 1) { if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; } From 470581d4696912f5a756391195a8257422d7bd0c Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 10 Apr 2023 17:18:00 -0600 Subject: [PATCH 15/51] Organize sna method calls and clean up --- .../.compute_sna_grid_kokkos_impl.h.swo | Bin 0 -> 49152 bytes src/KOKKOS/compute_sna_grid_kokkos.h | 7 +- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 380 ++---------------- 
src/KOKKOS/sna_kokkos_impl.h | 21 - src/ML-SNAP/compute_grid.cpp | 2 - src/ML-SNAP/compute_sna_grid.cpp | 13 - 6 files changed, 30 insertions(+), 393 deletions(-) create mode 100644 src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo diff --git a/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo b/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo new file mode 100644 index 0000000000000000000000000000000000000000..7f92a608886209c3570518c3c072aa00307d6609 GIT binary patch literal 49152 zcmeI53w-2NmFI)+0YS&{J?qRR3{Xi|9^Jf}&@}cF+U}%Vk_M$^ib~Z#$v>$_sH$|5 zZd$=7DmpqoP#F{-%&afQk;RWQpA}toc11;86(5MAtEk{x$8mMWb-(A_d;kA`RjQIo zCkQ*6`t+~!f82ZSz2`p9x#yn$X#YjK2F=Fa{tkYR@96m68&A)F{k>^(+rX1LviVAR zqBS#(^~stZ`tC1OK5oP@6us;{}xHVXjo8?(Z=Te;?uQRe0cf ztl|A5-Ti?3dt1Z%N4fir?(aD+0iVvJ-Se&P@2eZ$KgK=x63cxJ@BhfXKhyoa&=ttj zKei$MKQz2Q&b{B~{yyR1_QJ<^w;g}!-d7vmyT@yP);O@nfi(`SabS%DYaCePz#0eE zIIzZnH4dzCV2uM0Ne*Q59UXU(v*(LE%>MtV%jJ(A*U|9~@H}u5`1uJP9q$99;OXGk z$9Hsm1Wbc6I2nBBu^k;ZgE0_*XM#>}B6tk=8AA3q!Dqk+z~6uugAnWi8^DR+c<@;8 zZwT!_0C$2ffscWYg7<(ofos84;7V{gK$X$a3*L+};9Bq!5P-h`j{@IAiEuA?6}T8Y z6PyR044wo&j}qbz@L#~q;AP+v@C@)%6d3n`d%<_XZQwBI2j4*X@d|JyI1`)!o&mmr z(&TRN4loJ2!TsnBJ`8RGhrl=(1M9%=se^BVkAfS4)X5p(7bqdV2)^Kz5sSAZqrZtv zH7u2aN>B)jsj$nOYux=3uL*u2y!4z72^Nl-RCPA274;cm5f85qb_f-IIW#jsK-S272~U^bQ4xLp#5 zEC`)Ab}>7>RAho^T$f%xY8OjIj6){l$L(sV_;JW&d|N+D#kHZ5@m#$v710GFCQKR( zio16Y48YPtp`3G8Meq}rq&v4+qod)WE$35Dp8ve+OqbTJ*QBLklt{6zK`cgCDkp7P zkd~L7X;YHc7?f&NO(L$*R+){fp+%nK0OY)mcq@`|&9opH2WXe1t*{p4?8|mZxhiZy zYrws3q$Yi1IH;MKY2$cc3SqUTz3&KZ!5I6$F;kh`Os#r(Z!N&QjLv3?<$O3cziH3l zz6;9L+6di#b4n{mV)@V$wljw5N$JB4Pbbql#wy)RtvD@x)tG2-nIr~-%jCkrWs<=~ z#+E4PlH(A8qhuhf*9X>L%0F%LaA9Qc_PqmU$6T$F&DCrGMk|vq&x{rVQz)0G8MF%J zS(;**))?e9^@ykrSMVVt13oFFg1LZ~VY!s4PG@t0Irm&6e6n4vQp#I)64G5ogIoN@ zvnyh`3~ZaW2M5Dkkl?oq2B}(pCR@miO=m0FVl~yGLur;f)5+esZhgb8&1F7Y%NDcM zsf?{^64)`gZ{)d|!7anPCA-mkhKl@xw+k2ma~(^Dg(Ewe4{TYIMH(1Tb}2WQPRFyL4QU_txAoE7ci%v%x7rJZ)r5%g79sHV1=Tjk%%_l*Vfln+FD( z#ZB$mGB~(z*mM_)(`i}*<65>do;JCOY{hhUbKPYIOnuChhPB}&K!>lrO}Rq0S~YR? 
zrH$mIlr09yd`T+xdF$w)d2TD0s!iA+=VJ07#O(883o)kAHALe)#mc}O{)u!N=E3SzwaN21-w6gxXTyqE=uS6HloXno@Cxyc zp4Ix9?ml`;i{Fle*^;?92uq_uW!$8OF50zy*Oo2`Pm7}+j53Cm#%0+xQLa>J>=P9@ z9TI7zQm)lZYItZ*SK18McrsWnR|5B>qp#bnP61Y*6T?AZY7>FEaL1PIgFDPv$Oytv zFOO>F>7GJxFen(;_48px2a5S#;fVw}e`vosKR`;z7R){-hcIXMFd&zzfdYPQEqgYe z(cj;15Gp3jaJG~Wvs6xHx?IW1{K7CouJ-p1_nPy|2ZKsUmPw2#l8dTP+E$*P*W8%Y zhW`H3x|+}+SNkPl%Y&d&tjfwO_gp|hD-D7g=*6H7)Z-_GVGH#c~dsu5Q;=z_N zZ2UVqK7?HPEaXp-|NZ{$e@cq=mf%fZ#)kHBNV*O2Gm z2i^nT4aULw;BMsjonQwz0X!BQ2W~}%{|Kmp?cmqQ?f(jX1?~bR@JR3oa38Y!E#PMG zPVjbc19%Cz0PF-kU>)cNPXxa~wtuaY@w@nY0=OHV-UU7lJ_Y^b zK?Vu;zH5`%$Xa)?qMj6vvHCPf zgT5r^@5AmWW^qHsXghDk>t

L;S_^l1R0UifxwIAE;`#@3?5!wjD=Nxy|z2ta@#@ zRjJ_BUxro6vbGGW21Q`rjLsV`DY1ANLkU|#YSyO7vS1ZgldDEb*45h7SZmExN@h66 zDke8^5kk61=SZ>>#h{2v`B0ELm@Oc=WTt}(YRPQT%GVD)%Y*?#q@~P}E6|<%6c#z+0HaF(^oDLB` zo3*660c#O4+Gj{gotX#SnhdaxaKX3&i zx;$vLcl4rHLrkZ|V9F?jwRtl(Q_6|-yQ)(I%TcfJ&g+qC~IKbN7u6y;)+E@}AXL{(=us{CtWS{kO$NJ#18%>s|c0Z*GXEFB9pH1|`JezcgOkCZgFgjN2R}ph{}lKD zcs|$zL?>_-_!ICr@Iz$(`@uhg4+F6ocp~^6^8b&)kHGyv^aEc9cY!y7?O-eD1}ShV z_;c`Q;BNB#dhpla1)vJbK+5uP@H@)&W$^dl4PXxt{lSmW6Wk7N0xtxjH|PMrM^A7& z5c_~Za6GsZUBPvr2nrw#9uIzup5RXKLeK~9MF;SDunYVfvi}#rhr!L@gWw;))4{#S z^xp;F0Y||3;3OdX{(lc{0`CVCpdZ|UZsv3C?fXm38Ge0zQ4gC;^e}g=J*>@O3>55H zDGk2`zLJ=2WUOh5^nF__`LkI0?E8lQSC*3cQf`HGCA#~ys1)9|c$4e`$SUmyY|;r97H}1* zts*ILqGD!NkKpQ2vRj~HH^Z;3`f95hR03~Bvek3eA5qg-Bsc9u_sbXGi>%?0l+a8| z{ACn%d7))h_aG0_TELZ5u-Bp5OA=A2-+pFBk16?IqD>PHeC?12u$YBX4Pz7OEZD$UyW^K zv|!I?c^=peU4bY`|iT5Om3|ESkcVnyNg%nSySyy^dw&eLEHw z-q1t^VkZ^4EfV9T#PLdbHd6^FQ~l|FC-_#1VW~fDjbFtKDm{8AsFd|eWMuoO#6GHC z%~pfL7y`E5L>721dTcu@^{qKTrd*~lnyQRtbN%VsYLcj?>MOimeZc5#MNxRz}Iq=s251Zy~Jln}zcRhmox?A>o%TlKo#ScqzLzrRE~C zFOGz*B@OqiO4c`@uET~Tw1XuU%hfU_ROm}f<5d(MQ-QgRv@SE_*fpvND^^K)y54xU zk}m|9U}4%I1dX)3-e!4@`E9cv-)jj({=Wrz_8*aTMgI5a_htY8ci`W_FTpRs2f#Q8 zKps2}d`=f4xY4O|0W2`&UDfuA6+{|NjLd<)zI z#2!G-1w0F!1;ifURB#G-GI$dBJ@Wj0;ETYY3HVE{H-g84dy(tE3*;<78I-_Q@E9Ov z{%7z>a2>cDYykt{Y_Jjh2HyMxd<3Q+at7cg@P6<^lY{%r+f3BT;V-9F*P+lugO0lShc9D{IH3h+X&UCOVv-0o`8+di(w{vHj>bptEIM|;5( zY|8Ho#|NRBSGA24#b__M?E^j)y<0pYOje^$(vj}Pcr5kfr7yGiPZ|WrnP>y@zg%|; zkpcNn)9zH^&loEnOE0wtElbCC@QdNBeHEqvM8s(B$MWpQ;alXy{7`NYjy?F1csqyU%8aSa z$j(fsI3h9|pnWNwS~IiRQYjmkQcf>OVO&GzmQxZO6koYYkGx9Kt287G3pE)K)c#-W z*|#FAiu~{W^NIZb9w7Vv&jA;Mv%y(F_WR!sZUCwekKzYksv4uDI*bHR6y!L|a-fdxOq?+6D~*#J9CdzRmB6mKO8Rdl;gwGYKxbdVT@DDIybPH_yr?q6sWgqi9e8uk-~za*i;uh z)^&122_w~HIOaE_-rwitQavX_8E_=FI7@&;41qN2iZ!B?38ovY?F+tbwm%`#zLbcepcKI?x^&(DL^2`die#Q%{MI+}v&16QRDu?XsXt$eu z*4c~pQ8KlON>H6}q~j7NTLQ&S-a1?UO~SB3Ft<6jl4oUa4>8l{yA9^Ke36JQ?%F_1 zJbDA=y4E5LSja|->(bZt_yDX@)j%~79I=)aCwn=0W6u{LT9l(bzL_yjktjQ)LYh>$ 
zp1sv0^EjL-nag}ZPzWSHsMI@g8B~O&(Aj1SyFWCueXbZ0Td4`w{P+eSKUO|SbGBE` z4{OfD+?Ko~#4375?1yP1QnyxEYrc-zS7Dy%OuL>qT?y!FP1i5_Ds`B>*mVDq^x-=G z`VCOLJk&Sjg#mr#WuPZR>iQ$D8l@wjw*5lctJt^TSVLUtKAuT*VJ*Yvs2&=UIEp$@ zf=hAE)zh{t8jShf1rE5!t3Y4Rxv3TtXB;46scGP;8v?`^gzsQ7JRV6e2pF~dsG~@_ z%jRsHr*x9L3(MJr)p6XTVE4il+T{I^ICO{_uPWZ^8n0{B`hqbbdAsz2q2;`lB$J9O zr`fz^vs6DACEEM)CN4)T(eNGPhJ+29jW7*u2IqB>!)l9BmSD|BSj#Fu{b*M0u#`+n zQ;LV(m<7^4NiV?^X05uB!q1O&4`pIx;G@^|xOQZ6L0CwV@gt%0rqX3y<6!Z^`Mn@p zl0yhp^cYGg&JTG>=qpu2!N|)2gvhic4$Huz@cWQWf z-N`gj3DskaxLvSE>X=y0I&-5;0Iqn^2laY$)3|+cdJs9Fh=RBRdu9J$_K+_`wiWq* zD}UtodE|Vt{r@W&w)>a+rXQ_8^8z^e74Rys1&H1LP2m0DJwSN!Y#_4# z6TlyX$Aia#6M&rOzZJY36u=ai1f$?&@I`b0*8n-Ee-;pZz_-9%;CAp?a1HQ3ar!W= zt{+&I=q&FB5_>`jM|IVJ!GB!c%yIB>-W0O)ETgFXU|MuimVk>IQ>%My$CP#{=5U2F zUbc%gQEvHFSA9~eLo-~Ze6A9iL~&O{Q6ih+B}pt9pjov?kyfidWL(!)cvS6AZTnXe zzeQ_fG1|+q>D52B&Ay(_jOzYNdK1UEt9F#MhmqadS0@1$zca`z2k8an!h8uIF4=-; zw9VFFT%6?iBjB|Oe5#gR=a9N2r+22u{frW1FnuK&N@#0gixf}U#siR zTUp^QiQSW=6e1g*>c-3qF)ieT#M)?&Kfbrwuls~7YN2>;Pb_bY9Q4x$k=Q6ucSxgQ zbsG7qGE;OCpe(a|HvGVAb;2BjFICYF7q`X|70&+bv&?7id>#;cHiC^JXHv*npW-jqu?Um6YgS3-!+(X)A^wN%7$6D9*n^5SIL z-tFsUdVp3qg+xNHhVz z=xfyg;`ggp{V~nuR(>L8vg)^4t%R-kc$cZq2pEx*WZ6MkKy9i@(6nzJOo^nsOu9KS9JgWSCZrkJ`Y@^4OoG?JIyKPn#;Cy4cA|81Zb` zuhpQkTCAo^Mr+hsEOe@P88u=>;RMwIRues)W90(+RIhPM8#$AJT?-00sYY8D_WxEh zhpYGfp*p0C^+=-kuCESP8Oz=MXsOJIs&^*zs_h3;39sdUh$-QvYsyb_ydwK2Y`xl3 zx=SmN9|R)(dY{ZDe)%8@_Bsu_$p5z^o1Tk2EAqepzMk0q{}Xs8xEopj>)@4O16U7! zihTbQAm0FZ4akD^pdUOHd=WWcz6J0>@G7ta91p&SyniE*{r(fddy)0UpZ}}CH9-9M zi~N5sI2Ak%90z`f{QvLZKJWu@Gk6iW20lz~pApZU34F4_Q^`Hta1kVPi0I}!) 
z-{9Lo{QbWNybxRky1`G;4SWOK0ImaX0b=Ah!J?AWd#g(RF;$X2P$k-*`o!`)gong z&1pF~Nl@3;vt*NV{1!9QxkS_KcTE#@fF#dIC(gIMzB4HXm;nvpJO3(*}G-L zlw?vuu{YPF9i(X}%_5=rwgZfboWGA`b`6c-@?G4btF{~0 zu+sZjXG~b zsKJ^xo5O03^LgXy{yU@lmy9zafJt_ptu-9b?_H3PTYgxxyM?yZY&qf{2Or|q%XL9x zAjG->7v8n8R43w>s8#U(lS2)Uui}%(?7!#9hL|uut(C;Oh-+oCY#T@6^_U`=K=)#% zXK*@<21YFOWJxAU8o!29ZBG=-jF#sj(?Bl)W)Ve;I3@O6#wl<(D$YCY_Alvu4`}g1e@~6_ zhlkijm+46QrhUHtQh7dapU=n7`}O&peXdTc`qWx}!(1wq?9qtikG32NEZ5X2nZYDM z{9nrhX({1cI-F0Fu%6JI4NgTTh&`yN4^YYk!ULlFW*de!=72dl&p&OfxZAQ1eOl68 zEPYp^R7vt#n(ty`ON+uiS1#9dFZuWkq_YL--}BfGOmiNalfbICu86CtL={Pv+$?2T zbQ*1iXCXwz7P(LsE|HGhVUetNhx876WWAFN%19S(Fej^pee_gD$#6rTSlvg@HpsJ( z@h}x0>XtX%Vt#M?q)?LaHw^ny$yb(bj@`m%<+roV)!de9SY^=8>_`I%ac9%rC5vNp zhhhjLZYon1Pfe+m zpb5t2IsW@EdQL@*V_4#Ad)zf18BqHm_oMwbf z29`D^W4Z(*%JzK~vGe1_i&yBb?|w(J=@qKH~wy934IC95IcY zdD9tdhNA6&QJu!MIcd!lJ0!4k5APSUlIqp=z>qgS6Ucd0_$!qcHy~{!Qti}Y;J%=u z1G)A%Nu3v3UD9-Li5zp@GCTH@FQIJsM0&)~7K5@+-qS>20qn3r667^hBCytE6g;Lq znyoSwUOvMV9@cO>FBMfrgDD5aRj;h%CA_XFmg#q3ffGEN!$J8h0N%m9Z(9nW+a73V ztzo=4_K~Xf3m@zKR-1wQHb%66UDZiA*^0bFM7j(ovS#_>Kv)dLQT}wLoWp}BJt$lC zYYDYU5Ot{>po-5gGPADOk46iBeQpt>38!t(l&(^@Z7>4A_bsq4O6(dWiQuS|#?3)9 zJ#Tf2HjF=j^FMJzDf0iDk$c7Goyh-Zy3Od{Aoq*^zxRWeftP|J5dVMYg8zdK;J<^b z!ByZ&5CE|a*axR^qR{}Zb@4x@Ij_XH) zH^GNjgGsOv^nh<8^WOsIfB_#u#vcN5W?#+-ydB&C{up@MfZLJTKMSq_=Ycc8>EHzL z3*>e=r~g_YXY|ux9q0z%LY{vccxx=z8~xeYWZgO$mgMJEeD-d!w`hCh*BGSf!z5Mb z8`RAu;vKZCahlLtBATA9>S8R`wdj%wyHpx8y!=G;Xhnx3#~(@*-z?-vGJ&NMcXT9} zy<93nDce@KwgRE}?oCz;j+yjF9_+>9Ws@+AW!KSIEJ%~K(WH`_@L#iT6KI=$+xb~0 z-CAnd5u_8eQP9QtyE5k4?*^vA2P*FibN4`{v@*IHyS+BFvV`J6OWJdSR^qbOd|+j#F7J!GRa2KL1Wa=oRRrM3t1(e>Y`RBy6M)j=a-6&%A5EeHDrCP!%i+RoDdnHG zrM|b{d)<5Q?Vbe zS}U>9#LJGV7ere=qehrIaa3uk{#awq@)V2BfZd9}UJ=C37{j}>FiPoc? 
zpb4Q?J)k~0sIOp$%E|v%AaDK%`7^oy|664Ie*xn6@8{rW;3jY_knaHg9guGTmcS4g z1S#+s@Mv&5^89CjoDH}dTn@x0;I-g2;5lF;I17kB|GSXsZv&qK*Mc_y@$G*hI0sB4 z=l6mh@Iz$#Yrz|V`1E@;_$TD|*8wlr%Rc|tz}vtPAie;^ci)-d4DfSg`5%J&!4JUq z!HwY6;1kIASAi?RW^g~U{e9r$pa5hqe-ro>^1S#1xDL#NIq+;Ce)-meU%}I#f*%9n z@4tf2gPXwzf%xfrH`ouJ1%69Cd~{7>+1kod_W#Ghev^N62oX2J~WG2Pp%SZhVqV%KdA zJI7JX7Q4pG8cWSN^(l6xWn9;}P`0@7Kg!M_dkGl~HQDxwazoDCHiZiwziEuceq_9% zQQ>=E2T@^nDlleK-?q)1#kQ(^o|O^YYw9C)vuf)TZPjPCKA+3m zA+V>WEb|I!MSl%!ldwVOUbrd1i9Ecf|6E12p zo(g;Udb>&?s1L+c@9zNT0g?C51{=Y@BJbY=Hi6$F=l=%W3a$fh0X0wsJHR#|zWwD}e)pjZ z*a;p19u7W*%zqe}RacV)J#bdRSYKv~{ozqKXe^}co^MIFNCX5#?SI%1w+)Dl?tUO(mv7C6%OHuiz4ex)_N1q;+`}-SAZ`o1-eLugj@F?;6`R!V;ZbC3g}qalAaCeTvi6#RmySYe$BLXhDJuRHs0C zNUyx2bC6rZr6yNl+e2vuHX0@x+T`SD(^c2xTE!!S(5@T5jO-RyOIN#a_Q6_F$yn8} z?mkps>xY`x`l0&nuBWrtx_g*+wSLKxoqclh(1Tj*c9ydLM{PilW9t-?35$u&Z#bqy zE0<4_&txXDg)wHFMmb)lmR@gn;o4M!UL`nsdNOsm9E=shX>{4+d<`MFyY15Rr^}no zk~7WoeAr;XU(3irC%wSo(GfQxd*-+R6tKr#+G0A-v8{5~v%c#v-AjI+vZoLhErnK5 zC1!_2%n~yZyRn#IaY0Gjs41IvOd6B)XzGqQh~vq|rAAw%HQRO~-#Jq#TtH(P)W+l5 zP@I%DrZK{3i#i~;_FzxEXo)mJP7SmPOSYhh14oV!vSg7P7A(puS>(hoNyS@2ne*7l sARxzAsvKaj=f`w0LV{+qAO18qc*9s5#KL^EX4${wFeYq!u*R|f3;$::member_type& team) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalFill2,const int& ii) const; + void operator() (TagCSNAGridLocalFill,const int& ii) const; protected: diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index b37082ca5f..ec55b5fae4 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -47,8 +47,6 @@ namespace LAMMPS_NS { template ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGrid(lmp, narg, arg) { - //respa_enable = 0; - printf("^^^ Begin ComputeSNAGridKokkos constructor\n"); kokkosable = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -61,18 +59,7 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos 
host_flag = (execution_space == Host); - // ComputeSNAGrid constructor allocates `map` so let's do same here. - // actually, let's move this down to init - //int n = atom->ntypes; - //printf("^^^ realloc d_map\n"); - //MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); - - - printf("^^^ wjelem[0]: %f\n", wjelem[0]); - printf("^^^ wjelem[1]: %f\n", wjelem[1]); - - - printf("^^^^^ cutsq: %f\n", cutsq[1][1]); + // TODO: Extract cutsq in double loop below, no need for cutsq_tmp cutsq_tmp = cutsq[1][1]; @@ -83,31 +70,19 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos } } - - //memoryKK->create_kokkos(k_gridlocal, - //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); - - // Set up element lists - printf("^^^ Begin kokkos reallocs with nelements = %d\n", nelements); MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements); - // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
- //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); // test MemKK::realloc_kokkos(d_test, "ComputeSNAGridKokkos::test", nelements); int n = atom->ntypes; - //printf("^^^ realloc d_map\n"); - printf("^^^ n: %d\n", n); MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); - printf("^^^ begin mirrow view creation\n"); auto h_radelem = Kokkos::create_mirror_view(d_radelem); auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); - //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem); auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); auto h_map = Kokkos::create_mirror_view(d_map); @@ -115,31 +90,20 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos auto h_test = Kokkos::create_mirror_view(d_test); h_test(0) = 2.0; - printf("^^^ begin loop over elements, nelements = %d\n", nelements); // start from index 1 because of how compute sna/grid is for (int i = 1; i <= atom->ntypes; i++) { - printf("^^^^^ i %d\n", i); h_radelem(i-1) = radelem[i]; h_wjelem(i-1) = wjelem[i]; - printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]); - printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i)); if (switchinnerflag){ h_sinnerelem(i) = sinnerelem[i]; h_dinnerelem(i) = dinnerelem[i]; } - // pair snap kokkos uses `ncoeffall` in the following. - //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { - // h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff]; - //} } - printf("^^^ begin loop over map\n"); - // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where - // some things like `map` get allocated regardless of chem flag. + // In pair snap some things like `map` get allocated regardless of chem flag. 
if (chemflag){ for (int i = 1; i <= atom->ntypes; i++) { h_map(i) = map[i]; - printf("%d\n", map[i]); } } @@ -152,11 +116,9 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos if (chemflag){ Kokkos::deep_copy(d_map,h_map); } - // test Kokkos::deep_copy(d_test,h_test); double bytes = MemKK::memory_usage(d_wjelem); - printf("^^^ bytes: %f\n", bytes); snaKK = SNAKokkos(rfac0,twojmax, rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); @@ -171,10 +133,6 @@ template ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { if (copymode) return; - - //memoryKK->destroy_kokkos(k_eatom,eatom); - //memoryKK->destroy_kokkos(k_vatom,vatom); - printf("^^^ Finish ComputeSNAGridKokkos destructor\n"); } // Init @@ -182,90 +140,10 @@ ComputeSNAGridKokkos::~ComputeSNAGridKokko template void ComputeSNAGridKokkos::init() { - printf("^^^ Begin ComputeSNAGridKokkos init()\n"); - // The part of pair_snap_kokkos_impl.h that allocates snap params is coeff(), and it - // calls the original coeff function. So let's do that here: - - ComputeSNAGrid::init(); - - /* - // Set up element lists - printf("^^^ Begin kokkos reallocs\n"); - MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); - MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridKokkos:wjelem",nelements); - // pair snap kokkos uses `ncoeffall` in the following, inherits from original. 
- //MemKK::realloc_kokkos(d_coeffelem,"pair:coeffelem",nelements,ncoeff); - MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridKokkos:sinnerelem",nelements); - MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridKokkos:dinnerelem",nelements); - int n = atom->ntypes; - //printf("^^^ realloc d_map\n"); - printf("^^^ n: %d\n", n); - MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); - - printf("^^^ begin mirrow view creation\n"); - auto h_radelem = Kokkos::create_mirror_view(d_radelem); - auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); - //auto h_coeffelem = Kokkos::create_mirror_view(d_coeffelem); - auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); - auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); - auto h_map = Kokkos::create_mirror_view(d_map); - - printf("^^^ begin loop over elements, nelements = %d\n", nelements); - // start from index 1 because of how compute sna/grid is - for (int i = 1; i <= atom->ntypes; i++) { - printf("^^^^^ i %d\n", i); - h_radelem(i) = radelem[i]; - h_wjelem(i) = wjelem[i]; - printf("^^^^^ radelem wjelem %f %f\n", radelem[i], wjelem[i]); - printf("host^^^ radelem wjelem %f %f\n", h_radelem(i), h_wjelem(i)); - if (switchinnerflag){ - h_sinnerelem(i) = sinnerelem[i]; - h_dinnerelem(i) = dinnerelem[i]; - } - // pair snap kokkos uses `ncoeffall` in the following. - //for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) { - // h_coeffelem(ielem,jcoeff) = coeffelem[ielem][jcoeff]; - //} - } - - printf("^^^ begin loop over map\n"); - // NOTE: At this point it's becoming obvious that compute sna grid is not like pair snap, where - // some things like `map` get allocated regardless of chem flag. 
- if (chemflag){ - for (int i = 1; i <= atom->ntypes; i++) { - h_map(i) = map[i]; - printf("%d\n", map[i]); - } - } - - Kokkos::deep_copy(d_radelem,h_radelem); - Kokkos::deep_copy(d_wjelem,h_wjelem); - if (switchinnerflag){ - Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); - Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); - } - if (chemflag){ - Kokkos::deep_copy(d_map,h_map); - } - - snaKK = SNAKokkos(rfac0,twojmax, - rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); - snaKK.grow_rij(0,0); - snaKK.init(); - */ - if (host_flag) { - - // The following lmp->kokkos will compile error with pointer to incomplete class type not allowed. - //if (lmp->kokkos->nthreads > 1) - // error->all(FLERR,"Compute style sna/grid/kk can currently only run on a single " - // "CPU thread"); - - //ComputeSNAGrid::init(); return; } - - printf("^^^ Finished ComputeSNAGridKokkos init\n"); + ComputeSNAGrid::init(); } @@ -274,11 +152,10 @@ void ComputeSNAGridKokkos::init() template void ComputeSNAGridKokkos::setup() { + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
- //ComputeGrid::setup(); - printf("^^^^^ SETUP!\n"); - //printf("^^^^^ gridlocal: %f\n", gridlocal[0][0][0][0]); + ComputeGrid::set_grid_global(); ComputeGrid::set_grid_local(); @@ -303,20 +180,7 @@ void ComputeSNAGridKokkos::setup() template void ComputeSNAGridKokkos::compute_array() { - printf("^^^ Begin ComputeSNAGridKokkos compute_array()\n"); - - if (DeviceType::in_parallel()) { - printf("^^^ compute_array() is a host function\n"); - } else { - printf("^^^ compute_array() is not a host function\n"); - } - if (host_flag) { - /* - atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); - PairSNAP::compute(eflag_in,vflag_in); - atomKK->modified(Host,F_MASK); - */ return; } @@ -325,53 +189,26 @@ void ComputeSNAGridKokkos::compute_array() zlen = nzhi-nzlo+1; ylen = nyhi-nylo+1; xlen = nxhi-nxlo+1; - printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); x = atomKK->k_x.view(); - // This will error because trying to access host view on the device: - //printf("x(0,0): %f\n", x(0,0)); type = atomKK->k_type.view(); k_cutsq.template sync(); - - MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",total_range); - - //printf("^^^ nzlo nzhi nylo nyhi nxlo nxhi: %d %d %d %d %d %d\n", nzlo, nzhi, nylo, nyhi, nxlo, nxhi); - // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total // number of atoms. - - //const int ntotal = atomKK->nlocal + atomKK->nghost; + ntotal = atomKK->nlocal + atomKK->nghost; - //printf("^^^ ntotal: %d\n", ntotal); + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); - // ensure rij, inside, and typej are of size jnum - // snaKK.grow_rij(int, int) requires 2 args where one is a chunksize. 
- - chunk_size = MIN(chunksize, total_range); // "chunksize" variable is set by user - //printf("^^^ chunk_size: %d\n", chunk_size); + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + chunk_size = MIN(chunksize, total_range); snaKK.grow_rij(chunk_size, ntotal); - // Launch 3 teams of the maximum number of threads per team - //const int team_size_max = team_policy(3, 1).team_size_max( - // TagCSNAGridTeamPolicy, Kokkos::ParallelForTag()); - //typename Kokkos::TeamPolicy team_policy_test(3,1); - - // Using custom policy: - /* - CSNAGridTeamPolicy team_policy(chunk_size,team_size_compute_neigh,vector_length); - //team_policy = team_policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("TeamPolicy",team_policy,*this); - */ - - - chunk_size = total_range; - printf("%d %d %d\n", chunk_size, team_size_compute_neigh, vector_length); - // team_size_compute_neigh is defined in `pair_snap_kokkos.h` - - + //chunk_size = total_range; + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; @@ -443,7 +280,6 @@ void ComputeSNAGridKokkos::compute_array() } //Compute bispectrum in AoSoA data layout, transform Bi - //if (quadraticflag || eflag) { //ComputeZi const int idxz_max = snaKK.idxz_max; @@ -465,33 +301,12 @@ void ComputeSNAGridKokkos::compute_array() policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); - //Looks like best way to grab blist is in a parallel_for - - //GridFill - /* + // Fill the grid array with bispectrum values { - int scratch_size = scratch_size_helper(team_size_compute_neigh * ntotal); - - SnapAoSoATeamPolicy - policy_fill(chunk_size, team_size_compute_neigh, vector_length); - policy_fill = policy_fill.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - 
Kokkos::parallel_for("GridLocalFill",policy_fill,*this); - } - */ - - //GridFill2 - { - typename Kokkos::RangePolicy policy_fill(0,chunk_size); + typename Kokkos::RangePolicy policy_fill(0,chunk_size); Kokkos::parallel_for(policy_fill, *this); } - - // populate the gridlocal array - // best to do parallel loop over grid points again - // ... - - // d_grid(0,0) = 1.0; // attempt to access inaccessible memory space - k_gridlocal.template modify(); k_gridlocal.template sync(); @@ -500,9 +315,6 @@ void ComputeSNAGridKokkos::compute_array() k_gridall.template modify(); k_gridall.template sync(); - - - printf("^^^ End ComputeSNAGridKokkos compute_array()\n"); } /* ---------------------------------------------------------------------- @@ -517,16 +329,8 @@ template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - // this function is following the same procedure in ComputeNeigh of PairSNAPKokkos - //printf("d_wjelem[1]: %f %f %f %f\n", d_wjelem[1], d_wjelem[0], d_wjelem(1), d_wjelem(0)); - //artificially set values here since we can't get the deep_copy to work - //d_wjelem[1] = 1.0; - //d_radelem[1] = 0.5; - //printf("%f\n", rnd_cutsq(1,1)); - - //Print the test view to see that the deep copy works: - //printf("%f\n", d_test(0)); - + // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. + // Main difference is that we don't use the neighbor class or neighbor variables here. 
SNAKokkos my_sna = snaKK; @@ -534,14 +338,11 @@ void ComputeSNAGridKokkos::operator() (Tag // team_rank : rank of thread in this team // league_rank : rank of team in this league // team_size : number of threads in this team - //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size()); // extract loop index int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; - //d_gridall(ii,0) = 100.0; - // get a pointer to scratch memory // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. @@ -549,11 +350,8 @@ void ComputeSNAGridKokkos::operator() (Tag const int tile_size = ntotal; // number of elements per thread const int team_rank = team.team_rank(); const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team - //printf("ntotal scratch_shift: %d %d\n", ntotal, scratch_shift); int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; - //printf("ii: %d\n", ii); - // convert to grid indices int iz = ii/(xlen*ylen); @@ -565,10 +363,10 @@ void ComputeSNAGridKokkos::operator() (Tag ix += nxlo; double xgrid[3]; - //int igrid = iz * (nx * ny) + iy * nx + ix; - // these end up being the same...? 
- //printf("ii igrid: %d %d\n", ii, igrid); + // index ii already captures the proper grid point + // int igrid = iz * (nx * ny) + iy * nx + ix; + // printf("ii igrid: %d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before //grid2x(igrid, xgrid); @@ -578,7 +376,6 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; - //printf("rtmp: %f %f %f\n", xtmp, ytmp, ztmp); // currently, all grid points are type 1 // not clear what a better choice would be @@ -589,19 +386,14 @@ void ComputeSNAGridKokkos::operator() (Tag const double radi = d_radelem[ielem]; // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. + // The purpose here is to transform for triclinic boxes. if (triclinic){ printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); - } else { - //printf("We are not triclinic\n"); - } - - // can check xgrid positions with original - //printf("%f %f %f\n", xgrid[0], xgrid[1], xgrid[2]); + } // Compute the number of neighbors, store rsq int ninside = 0; // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]? 
- //printf("ntotal: %d\n", ntotal); Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), [&] (const int j, int& count) { @@ -621,24 +413,15 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; - //printf("dx: %f\n", dx); - //const double rsq = delx * delx + dely * dely + delz * delz; int jtype = type(j); - //printf("jtype: %d\n", jtype); - //int jelem = 0; - //if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) { const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - //if (rsq >= cutsq_tmp){ // don't include atoms that share location with grid point - if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-10) { + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius - } else { - //printf("jtype rsq rnd_cutsq: %d %.11f %f\n", jtype, rsq, rnd_cutsq(itype, jtype)); - } + } - //printf("j: %d\n", j); type_cache[j] = jtype; if (jtype >= 0) @@ -646,12 +429,9 @@ void ComputeSNAGridKokkos::operator() (Tag }, ninside); - //printf("ninside: %d\n", ninside); - d_ninside(ii) = ninside; - //printf("%d\n", d_ninside(ii)); - // TODO: Make sure itype is appropriate instead of ielem + // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. 
Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), [&] (const int j, int& offset, bool final) { @@ -663,45 +443,16 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; int jtype = type(j); - //printf("jtype: %d\n", jtype); - if (dx==0 && dy==0 && dz==0){ - printf("rij: %f %f %f\n", xtmp, ytmp, ztmp); - } int jelem = 0; if (chemflag) jelem = d_map[jtype]; - //d_wjelem[jelem] = 1.0; - //d_radelem[jelem] = 1.0; my_sna.rij(ii,offset,0) = static_cast(dx); my_sna.rij(ii,offset,1) = static_cast(dy); my_sna.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - //my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jtype])*rcutfac); my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); my_sna.inside(ii,offset) = j; - - //printf("%f\n", my_sna.wj(ii,offset)); - - //printf("jelem: %d\n", jelem); - //printf("rij: %f %f %f\n", dx, dy, dz); - //printf("params: %f %f %f\n", d_wjelem[jtype], d_radelem[jtype], rcutfac); - //printf("%f %f %f\n", my_sna.rij(ii,offset,0), my_sna.rij(ii,offset,1), my_sna.rij(offset,2)); - //printf("%f %f %f\n", my_sna.wj(ii,offset), my_sna.rcutij(ii,offset), my_sna.inside(ii,offset)); - // we can't use std::cout on device code, maybe make another function for this? - //std::cout << my_sna.rij(ii,offset,0) << std::endl; - //printf("%f %f %f\n", dx, dy, dz); - // apparently isnan is also a host function and not allowed here... 
- /* - if (isnan(dx) || isnan(dy) || isnan(dz)){ - printf("Found a nan!\n"); - } - if (isnan(d_wjelem[jelem]) || isnan(radi) || isnan(d_radelem[jelem]) || isnan(rcutfac) || isnan(j)){ - printf("Found a nan 2!\n"); - } - */ - // Our best bet is to make another non-host function for printing - if (switchinnerflag) { my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); @@ -722,24 +473,12 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { SNAKokkos my_sna = snaKK; - //printf("^^^ ComputeCayleyKlein\n"); - - /* - if (DeviceType::in_parallel()) { - printf("operator() of TagCSNAGridComputeCayleyKlein is a host function\n"); - } else { - printf("operator() of TagCSNAGridComputeCayleyKlein is not a host function\n"); - } - */ - const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; - const int ninside = d_ninside(ii); // use d_ninside or ntotal? + const int ninside = d_ninside(ii); if (jnbor >= ninside) return; - //printf("ninside: %d\n", ninside); - my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } @@ -752,7 +491,7 @@ void ComputeSNAGridKokkos::operator() (Tag if (ii >= chunk_size) return; int itype = type(ii); - //int ielem = d_map[itype]; + // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp` int ielem = 0; my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); @@ -777,7 +516,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int ii = iatom_mod + vector_length * iatom_div; if (ii >= chunk_size) return; - const int ninside = d_ninside(ii); // use ntotal or d_ninside? 
+ const int ninside = d_ninside(ii); if (jj >= ninside) return; my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); @@ -827,7 +566,6 @@ void ComputeSNAGridKokkos::operator() (Tag auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - //printf("^^^ utot: %f %f\n", utot_re, utot_im); if (mapper.flip_sign == 1){ utot_im = -utot_im; @@ -893,41 +631,7 @@ void ComputeSNAGridKokkos::operator() (Tag template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill,const typename Kokkos::TeamPolicy::member_type& team) const { - - // this function is following the same procedure in ComputeNeigh so that we can fill the grid - - SNAKokkos my_sna = snaKK; - - // basic quantities associated with this team: - // team_rank : rank of thread in this team - // league_rank : rank of team in this league - // team_size : number of threads in this team - //printf("%d %d %d\n", team.team_rank(), team.league_rank(), team.team_size()); - - // extract loop index - int ii = team.team_rank() + team.league_rank() * team.team_size(); - if (ii >= chunk_size) return; - - //d_gridall(ii,0) = 100.0; - - const auto idxb_max = snaKK.idxb_max; - - // linear contributions - - - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb); - } - -} - -template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill2, const int& ii) const { +void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill, const int& ii) const { SNAKokkos my_sna = snaKK; const auto idxb_max = snaKK.idxb_max; @@ -937,39 +641,11 @@ void ComputeSNAGridKokkos::operator() (Tag for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - 
//printf("blist: %f\n", my_sna.blist(ii,idx_chem,idxb)); d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb); - - if (icoeff == 0){ - //printf("%f\n", my_sna.blist(ii,idx_chem,idxb)); - } } } -/* -template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - - -} -*/ - -/* ---------------------------------------------------------------------- - Begin routines that are unique to the CPU codepath. These do not take - advantage of AoSoA data layouts, but that could be a good point of - future optimization and unification with the above kernels. It's unlikely - that scratch memory optimizations will ever be useful for the CPU due to - different arithmetic intensity requirements for the CPU vs GPU. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagComputeSNAGridLoopCPU,const int& ii) const { - -} - /* ---------------------------------------------------------------------- utility functions ------------------------------------------------------------------------- */ diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 55256f60cd..258fcb97a8 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -393,14 +393,9 @@ void SNAKokkos::compute_cayley_klein(const const real_type z0 = r * cs / sn; const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - //printf("jnbor: %d %f %f %f %f %f\n", jnbor, x,y,z, rfac0, rcut); - //printf("%f %f %f %f %f %f %f\n", rscale0, r, rmin0, theta0, sn, cs, z0); - //printf("%f %f %f %f %f\n", x, y, z, rcut, rmin0); - const real_type wj_local = wj(iatom, jnbor); real_type sfac, dsfac; compute_s_dsfac(r, rcut, sinner, dinner, sfac, dsfac); - //printf("^^^ sfac wj_local: %f %f\n", sfac, wj_local); sfac *= wj_local; dsfac *= wj_local; @@ -521,8 +516,6 @@ void 
SNAKokkos::compute_ui_small(const typ const complex b = b_pack(iatom_mod, jnbor, iatom_div); const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - //printf("^^^ %f %f %f %f %f\n", a.re, a.im, b.re, b.im, sfac); - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); // we need to "choose" when to bend @@ -609,7 +602,6 @@ void SNAKokkos::evaluate_ui_jbend(const Wi ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); } - //printf("^^^ ulist %f %f\n", ulist_accum.re, ulist_accum.im); ulist_wrapper.set(ma, ulist_accum); } @@ -651,7 +643,6 @@ void SNAKokkos::evaluate_ui_jbend(const Wi } ulist_wrapper.set(ma, ulist_accum); - //printf("^^^ ulist_accum: %f %f\n", ulist_accum.re, ulist_accum.im); mb++; } @@ -660,15 +651,10 @@ void SNAKokkos::evaluate_ui_jbend(const Wi for (int ma = 0; ma < j; ma++) { const complex ulist_prev = ulist_wrapper.get(ma); - //printf("ulist_prev %f %f\n", ulist_prev.re, ulist_prev.im); // atomic add the previous level here Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); - - // see if we can see this value - //printf("^^^ %f\n", ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)); - //printf("^^^ sfac: %f\n", sfac); } } @@ -759,7 +745,6 @@ void SNAKokkos::compute_bi(const int& iato const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); - //printf("^^^ %f %f %f %f\n", utot.re, zloc.re, utot.im, zloc.im); sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; } @@ -784,7 +769,6 @@ void SNAKokkos::compute_bi(const int& iato sumzu -= bzero[j]; } } - //printf("%f\n", sumzu); blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu; //} // end loop over j //} // end loop over j1, j2 @@ -885,7 +869,6 @@ typename SNAKokkos::complex SNAKokkos::complex 
SNAKokkos::complex SNAKokkos::compute_s_dsfac(const real constexpr real_type zero = static_cast(0.0); constexpr real_type onehalf = static_cast(0.5); - //printf("^^^ flags: %d %d\n", switch_flag, switch_inner_flag); - if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; } else if (switch_flag == 1) { if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; } diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index ad70df30e8..12135c705d 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -57,7 +57,6 @@ ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) : ComputeGrid::~ComputeGrid() { - printf("^^^ begin ComputeGrid destructor\n"); if (copymode) return; deallocate(); } @@ -113,7 +112,6 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { // allocate arrays - printf("^^^^^^^^^^^^^^^ ComputeGrid::allocate()\n"); memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { diff --git a/src/ML-SNAP/compute_sna_grid.cpp b/src/ML-SNAP/compute_sna_grid.cpp index 9125b7dcd4..95c3fa70a8 100644 --- a/src/ML-SNAP/compute_sna_grid.cpp +++ b/src/ML-SNAP/compute_sna_grid.cpp @@ -31,7 +31,6 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : // skip over arguments used by base class // so that argument positions are identical to // regular per-atom compute - printf("^^^ inside compute sna grid constructor\n"); arg += nargbase; narg -= nargbase; @@ -71,7 +70,6 @@ ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp); for (int i = 0; i < ntypes; i++) { wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp); - printf("^^^^^ ComputeSNAGrid wj: %f\n", wjelem[i+1]); } // construct cutsq @@ -116,7 +114,6 @@ 
ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) : quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp); iarg += 2; } else if (strcmp(arg[iarg], "chem") == 0) { - printf("^^^ chem flag, creating map\n"); if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style); chemflag = 1; memory->create(map, ntypes + 1, "compute_sna_grid:map"); @@ -188,15 +185,10 @@ ComputeSNAGrid::~ComputeSNAGrid() { if (copymode) return; - printf("^^^ begin ComputeSNAGrid destructor\n"); memory->destroy(radelem); - printf("^^^^ CSG 1\n"); memory->destroy(wjelem); - printf("^^^^ CSG 2\n"); memory->destroy(cutsq); - printf("^^^^ CSG 3\n"); delete snaptr; - printf("^^^^ CSG 4\n"); if (chemflag) memory->destroy(map); } @@ -207,15 +199,12 @@ void ComputeSNAGrid::init() if ((modify->get_compute_by_style("^sna/grid$").size() > 1) && (comm->me == 0)) error->warning(FLERR, "More than one instance of compute sna/grid"); snaptr->init(); - - printf("^^^ finished ComputeSNAGrid init()\n"); } /* ---------------------------------------------------------------------- */ void ComputeSNAGrid::compute_array() { - printf("^^^ inside ComputeSNAGrid compute_array()\n"); invoked_array = update->ntimestep; @@ -226,8 +215,6 @@ void ComputeSNAGrid::compute_array() int *const type = atom->type; const int ntotal = atom->nlocal + atom->nghost; - printf("^^^ ntotal: %d\n", ntotal); - // ensure rij, inside, and typej are of size jnum snaptr->grow_rij(ntotal); From 66def742c44871535b15fdb91878673e517b622c Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 24 Apr 2023 11:29:04 -0600 Subject: [PATCH 16/51] Organize deallocator calls --- .../.compute_sna_grid_kokkos_impl.h.swo | Bin 49152 -> 0 bytes src/KOKKOS/compute_sna_grid_kokkos_impl.h | 48 +++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) delete mode 100644 src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo diff --git a/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo 
b/src/KOKKOS/.compute_sna_grid_kokkos_impl.h.swo deleted file mode 100644 index 7f92a608886209c3570518c3c072aa00307d6609..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 49152 zcmeI53w-2NmFI)+0YS&{J?qRR3{Xi|9^Jf}&@}cF+U}%Vk_M$^ib~Z#$v>$_sH$|5 zZd$=7DmpqoP#F{-%&afQk;RWQpA}toc11;86(5MAtEk{x$8mMWb-(A_d;kA`RjQIo zCkQ*6`t+~!f82ZSz2`p9x#yn$X#YjK2F=Fa{tkYR@96m68&A)F{k>^(+rX1LviVAR zqBS#(^~stZ`tC1OK5oP@6us;{}xHVXjo8?(Z=Te;?uQRe0cf ztl|A5-Ti?3dt1Z%N4fir?(aD+0iVvJ-Se&P@2eZ$KgK=x63cxJ@BhfXKhyoa&=ttj zKei$MKQz2Q&b{B~{yyR1_QJ<^w;g}!-d7vmyT@yP);O@nfi(`SabS%DYaCePz#0eE zIIzZnH4dzCV2uM0Ne*Q59UXU(v*(LE%>MtV%jJ(A*U|9~@H}u5`1uJP9q$99;OXGk z$9Hsm1Wbc6I2nBBu^k;ZgE0_*XM#>}B6tk=8AA3q!Dqk+z~6uugAnWi8^DR+c<@;8 zZwT!_0C$2ffscWYg7<(ofos84;7V{gK$X$a3*L+};9Bq!5P-h`j{@IAiEuA?6}T8Y z6PyR044wo&j}qbz@L#~q;AP+v@C@)%6d3n`d%<_XZQwBI2j4*X@d|JyI1`)!o&mmr z(&TRN4loJ2!TsnBJ`8RGhrl=(1M9%=se^BVkAfS4)X5p(7bqdV2)^Kz5sSAZqrZtv zH7u2aN>B)jsj$nOYux=3uL*u2y!4z72^Nl-RCPA274;cm5f85qb_f-IIW#jsK-S272~U^bQ4xLp#5 zEC`)Ab}>7>RAho^T$f%xY8OjIj6){l$L(sV_;JW&d|N+D#kHZ5@m#$v710GFCQKR( zio16Y48YPtp`3G8Meq}rq&v4+qod)WE$35Dp8ve+OqbTJ*QBLklt{6zK`cgCDkp7P zkd~L7X;YHc7?f&NO(L$*R+){fp+%nK0OY)mcq@`|&9opH2WXe1t*{p4?8|mZxhiZy zYrws3q$Yi1IH;MKY2$cc3SqUTz3&KZ!5I6$F;kh`Os#r(Z!N&QjLv3?<$O3cziH3l zz6;9L+6di#b4n{mV)@V$wljw5N$JB4Pbbql#wy)RtvD@x)tG2-nIr~-%jCkrWs<=~ z#+E4PlH(A8qhuhf*9X>L%0F%LaA9Qc_PqmU$6T$F&DCrGMk|vq&x{rVQz)0G8MF%J zS(;**))?e9^@ykrSMVVt13oFFg1LZ~VY!s4PG@t0Irm&6e6n4vQp#I)64G5ogIoN@ zvnyh`3~ZaW2M5Dkkl?oq2B}(pCR@miO=m0FVl~yGLur;f)5+esZhgb8&1F7Y%NDcM zsf?{^64)`gZ{)d|!7anPCA-mkhKl@xw+k2ma~(^Dg(Ewe4{TYIMH(1Tb}2WQPRFyL4QU_txAoE7ci%v%x7rJZ)r5%g79sHV1=Tjk%%_l*Vfln+FD( z#ZB$mGB~(z*mM_)(`i}*<65>do;JCOY{hhUbKPYIOnuChhPB}&K!>lrO}Rq0S~YR? 
zrH$mIlr09yd`T+xdF$w)d2TD0s!iA+=VJ07#O(883o)kAHALe)#mc}O{)u!N=E3SzwaN21-w6gxXTyqE=uS6HloXno@Cxyc zp4Ix9?ml`;i{Fle*^;?92uq_uW!$8OF50zy*Oo2`Pm7}+j53Cm#%0+xQLa>J>=P9@ z9TI7zQm)lZYItZ*SK18McrsWnR|5B>qp#bnP61Y*6T?AZY7>FEaL1PIgFDPv$Oytv zFOO>F>7GJxFen(;_48px2a5S#;fVw}e`vosKR`;z7R){-hcIXMFd&zzfdYPQEqgYe z(cj;15Gp3jaJG~Wvs6xHx?IW1{K7CouJ-p1_nPy|2ZKsUmPw2#l8dTP+E$*P*W8%Y zhW`H3x|+}+SNkPl%Y&d&tjfwO_gp|hD-D7g=*6H7)Z-_GVGH#c~dsu5Q;=z_N zZ2UVqK7?HPEaXp-|NZ{$e@cq=mf%fZ#)kHBNV*O2Gm z2i^nT4aULw;BMsjonQwz0X!BQ2W~}%{|Kmp?cmqQ?f(jX1?~bR@JR3oa38Y!E#PMG zPVjbc19%Cz0PF-kU>)cNPXxa~wtuaY@w@nY0=OHV-UU7lJ_Y^b zK?Vu;zH5`%$Xa)?qMj6vvHCPf zgT5r^@5AmWW^qHsXghDk>t

L;S_^l1R0UifxwIAE;`#@3?5!wjD=Nxy|z2ta@#@ zRjJ_BUxro6vbGGW21Q`rjLsV`DY1ANLkU|#YSyO7vS1ZgldDEb*45h7SZmExN@h66 zDke8^5kk61=SZ>>#h{2v`B0ELm@Oc=WTt}(YRPQT%GVD)%Y*?#q@~P}E6|<%6c#z+0HaF(^oDLB` zo3*660c#O4+Gj{gotX#SnhdaxaKX3&i zx;$vLcl4rHLrkZ|V9F?jwRtl(Q_6|-yQ)(I%TcfJ&g+qC~IKbN7u6y;)+E@}AXL{(=us{CtWS{kO$NJ#18%>s|c0Z*GXEFB9pH1|`JezcgOkCZgFgjN2R}ph{}lKD zcs|$zL?>_-_!ICr@Iz$(`@uhg4+F6ocp~^6^8b&)kHGyv^aEc9cY!y7?O-eD1}ShV z_;c`Q;BNB#dhpla1)vJbK+5uP@H@)&W$^dl4PXxt{lSmW6Wk7N0xtxjH|PMrM^A7& z5c_~Za6GsZUBPvr2nrw#9uIzup5RXKLeK~9MF;SDunYVfvi}#rhr!L@gWw;))4{#S z^xp;F0Y||3;3OdX{(lc{0`CVCpdZ|UZsv3C?fXm38Ge0zQ4gC;^e}g=J*>@O3>55H zDGk2`zLJ=2WUOh5^nF__`LkI0?E8lQSC*3cQf`HGCA#~ys1)9|c$4e`$SUmyY|;r97H}1* zts*ILqGD!NkKpQ2vRj~HH^Z;3`f95hR03~Bvek3eA5qg-Bsc9u_sbXGi>%?0l+a8| z{ACn%d7))h_aG0_TELZ5u-Bp5OA=A2-+pFBk16?IqD>PHeC?12u$YBX4Pz7OEZD$UyW^K zv|!I?c^=peU4bY`|iT5Om3|ESkcVnyNg%nSySyy^dw&eLEHw z-q1t^VkZ^4EfV9T#PLdbHd6^FQ~l|FC-_#1VW~fDjbFtKDm{8AsFd|eWMuoO#6GHC z%~pfL7y`E5L>721dTcu@^{qKTrd*~lnyQRtbN%VsYLcj?>MOimeZc5#MNxRz}Iq=s251Zy~Jln}zcRhmox?A>o%TlKo#ScqzLzrRE~C zFOGz*B@OqiO4c`@uET~Tw1XuU%hfU_ROm}f<5d(MQ-QgRv@SE_*fpvND^^K)y54xU zk}m|9U}4%I1dX)3-e!4@`E9cv-)jj({=Wrz_8*aTMgI5a_htY8ci`W_FTpRs2f#Q8 zKps2}d`=f4xY4O|0W2`&UDfuA6+{|NjLd<)zI z#2!G-1w0F!1;ifURB#G-GI$dBJ@Wj0;ETYY3HVE{H-g84dy(tE3*;<78I-_Q@E9Ov z{%7z>a2>cDYykt{Y_Jjh2HyMxd<3Q+at7cg@P6<^lY{%r+f3BT;V-9F*P+lugO0lShc9D{IH3h+X&UCOVv-0o`8+di(w{vHj>bptEIM|;5( zY|8Ho#|NRBSGA24#b__M?E^j)y<0pYOje^$(vj}Pcr5kfr7yGiPZ|WrnP>y@zg%|; zkpcNn)9zH^&loEnOE0wtElbCC@QdNBeHEqvM8s(B$MWpQ;alXy{7`NYjy?F1csqyU%8aSa z$j(fsI3h9|pnWNwS~IiRQYjmkQcf>OVO&GzmQxZO6koYYkGx9Kt287G3pE)K)c#-W z*|#FAiu~{W^NIZb9w7Vv&jA;Mv%y(F_WR!sZUCwekKzYksv4uDI*bHR6y!L|a-fdxOq?+6D~*#J9CdzRmB6mKO8Rdl;gwGYKxbdVT@DDIybPH_yr?q6sWgqi9e8uk-~za*i;uh z)^&122_w~HIOaE_-rwitQavX_8E_=FI7@&;41qN2iZ!B?38ovY?F+tbwm%`#zLbcepcKI?x^&(DL^2`die#Q%{MI+}v&16QRDu?XsXt$eu z*4c~pQ8KlON>H6}q~j7NTLQ&S-a1?UO~SB3Ft<6jl4oUa4>8l{yA9^Ke36JQ?%F_1 zJbDA=y4E5LSja|->(bZt_yDX@)j%~79I=)aCwn=0W6u{LT9l(bzL_yjktjQ)LYh>$ 
zp1sv0^EjL-nag}ZPzWSHsMI@g8B~O&(Aj1SyFWCueXbZ0Td4`w{P+eSKUO|SbGBE` z4{OfD+?Ko~#4375?1yP1QnyxEYrc-zS7Dy%OuL>qT?y!FP1i5_Ds`B>*mVDq^x-=G z`VCOLJk&Sjg#mr#WuPZR>iQ$D8l@wjw*5lctJt^TSVLUtKAuT*VJ*Yvs2&=UIEp$@ zf=hAE)zh{t8jShf1rE5!t3Y4Rxv3TtXB;46scGP;8v?`^gzsQ7JRV6e2pF~dsG~@_ z%jRsHr*x9L3(MJr)p6XTVE4il+T{I^ICO{_uPWZ^8n0{B`hqbbdAsz2q2;`lB$J9O zr`fz^vs6DACEEM)CN4)T(eNGPhJ+29jW7*u2IqB>!)l9BmSD|BSj#Fu{b*M0u#`+n zQ;LV(m<7^4NiV?^X05uB!q1O&4`pIx;G@^|xOQZ6L0CwV@gt%0rqX3y<6!Z^`Mn@p zl0yhp^cYGg&JTG>=qpu2!N|)2gvhic4$Huz@cWQWf z-N`gj3DskaxLvSE>X=y0I&-5;0Iqn^2laY$)3|+cdJs9Fh=RBRdu9J$_K+_`wiWq* zD}UtodE|Vt{r@W&w)>a+rXQ_8^8z^e74Rys1&H1LP2m0DJwSN!Y#_4# z6TlyX$Aia#6M&rOzZJY36u=ai1f$?&@I`b0*8n-Ee-;pZz_-9%;CAp?a1HQ3ar!W= zt{+&I=q&FB5_>`jM|IVJ!GB!c%yIB>-W0O)ETgFXU|MuimVk>IQ>%My$CP#{=5U2F zUbc%gQEvHFSA9~eLo-~Ze6A9iL~&O{Q6ih+B}pt9pjov?kyfidWL(!)cvS6AZTnXe zzeQ_fG1|+q>D52B&Ay(_jOzYNdK1UEt9F#MhmqadS0@1$zca`z2k8an!h8uIF4=-; zw9VFFT%6?iBjB|Oe5#gR=a9N2r+22u{frW1FnuK&N@#0gixf}U#siR zTUp^QiQSW=6e1g*>c-3qF)ieT#M)?&Kfbrwuls~7YN2>;Pb_bY9Q4x$k=Q6ucSxgQ zbsG7qGE;OCpe(a|HvGVAb;2BjFICYF7q`X|70&+bv&?7id>#;cHiC^JXHv*npW-jqu?Um6YgS3-!+(X)A^wN%7$6D9*n^5SIL z-tFsUdVp3qg+xNHhVz z=xfyg;`ggp{V~nuR(>L8vg)^4t%R-kc$cZq2pEx*WZ6MkKy9i@(6nzJOo^nsOu9KS9JgWSCZrkJ`Y@^4OoG?JIyKPn#;Cy4cA|81Zb` zuhpQkTCAo^Mr+hsEOe@P88u=>;RMwIRues)W90(+RIhPM8#$AJT?-00sYY8D_WxEh zhpYGfp*p0C^+=-kuCESP8Oz=MXsOJIs&^*zs_h3;39sdUh$-QvYsyb_ydwK2Y`xl3 zx=SmN9|R)(dY{ZDe)%8@_Bsu_$p5z^o1Tk2EAqepzMk0q{}Xs8xEopj>)@4O16U7! zihTbQAm0FZ4akD^pdUOHd=WWcz6J0>@G7ta91p&SyniE*{r(fddy)0UpZ}}CH9-9M zi~N5sI2Ak%90z`f{QvLZKJWu@Gk6iW20lz~pApZU34F4_Q^`Hta1kVPi0I}!) 
z-{9Lo{QbWNybxRky1`G;4SWOK0ImaX0b=Ah!J?AWd#g(RF;$X2P$k-*`o!`)gong z&1pF~Nl@3;vt*NV{1!9QxkS_KcTE#@fF#dIC(gIMzB4HXm;nvpJO3(*}G-L zlw?vuu{YPF9i(X}%_5=rwgZfboWGA`b`6c-@?G4btF{~0 zu+sZjXG~b zsKJ^xo5O03^LgXy{yU@lmy9zafJt_ptu-9b?_H3PTYgxxyM?yZY&qf{2Or|q%XL9x zAjG->7v8n8R43w>s8#U(lS2)Uui}%(?7!#9hL|uut(C;Oh-+oCY#T@6^_U`=K=)#% zXK*@<21YFOWJxAU8o!29ZBG=-jF#sj(?Bl)W)Ve;I3@O6#wl<(D$YCY_Alvu4`}g1e@~6_ zhlkijm+46QrhUHtQh7dapU=n7`}O&peXdTc`qWx}!(1wq?9qtikG32NEZ5X2nZYDM z{9nrhX({1cI-F0Fu%6JI4NgTTh&`yN4^YYk!ULlFW*de!=72dl&p&OfxZAQ1eOl68 zEPYp^R7vt#n(ty`ON+uiS1#9dFZuWkq_YL--}BfGOmiNalfbICu86CtL={Pv+$?2T zbQ*1iXCXwz7P(LsE|HGhVUetNhx876WWAFN%19S(Fej^pee_gD$#6rTSlvg@HpsJ( z@h}x0>XtX%Vt#M?q)?LaHw^ny$yb(bj@`m%<+roV)!de9SY^=8>_`I%ac9%rC5vNp zhhhjLZYon1Pfe+m zpb5t2IsW@EdQL@*V_4#Ad)zf18BqHm_oMwbf z29`D^W4Z(*%JzK~vGe1_i&yBb?|w(J=@qKH~wy934IC95IcY zdD9tdhNA6&QJu!MIcd!lJ0!4k5APSUlIqp=z>qgS6Ucd0_$!qcHy~{!Qti}Y;J%=u z1G)A%Nu3v3UD9-Li5zp@GCTH@FQIJsM0&)~7K5@+-qS>20qn3r667^hBCytE6g;Lq znyoSwUOvMV9@cO>FBMfrgDD5aRj;h%CA_XFmg#q3ffGEN!$J8h0N%m9Z(9nW+a73V ztzo=4_K~Xf3m@zKR-1wQHb%66UDZiA*^0bFM7j(ovS#_>Kv)dLQT}wLoWp}BJt$lC zYYDYU5Ot{>po-5gGPADOk46iBeQpt>38!t(l&(^@Z7>4A_bsq4O6(dWiQuS|#?3)9 zJ#Tf2HjF=j^FMJzDf0iDk$c7Goyh-Zy3Od{Aoq*^zxRWeftP|J5dVMYg8zdK;J<^b z!ByZ&5CE|a*axR^qR{}Zb@4x@Ij_XH) zH^GNjgGsOv^nh<8^WOsIfB_#u#vcN5W?#+-ydB&C{up@MfZLJTKMSq_=Ycc8>EHzL z3*>e=r~g_YXY|ux9q0z%LY{vccxx=z8~xeYWZgO$mgMJEeD-d!w`hCh*BGSf!z5Mb z8`RAu;vKZCahlLtBATA9>S8R`wdj%wyHpx8y!=G;Xhnx3#~(@*-z?-vGJ&NMcXT9} zy<93nDce@KwgRE}?oCz;j+yjF9_+>9Ws@+AW!KSIEJ%~K(WH`_@L#iT6KI=$+xb~0 z-CAnd5u_8eQP9QtyE5k4?*^vA2P*FibN4`{v@*IHyS+BFvV`J6OWJdSR^qbOd|+j#F7J!GRa2KL1Wa=oRRrM3t1(e>Y`RBy6M)j=a-6&%A5EeHDrCP!%i+RoDdnHG zrM|b{d)<5Q?Vbe zS}U>9#LJGV7ere=qehrIaa3uk{#awq@)V2BfZd9}UJ=C37{j}>FiPoc? 
zpb4Q?J)k~0sIOp$%E|v%AaDK%`7^oy|664Ie*xn6@8{rW;3jY_knaHg9guGTmcS4g z1S#+s@Mv&5^89CjoDH}dTn@x0;I-g2;5lF;I17kB|GSXsZv&qK*Mc_y@$G*hI0sB4 z=l6mh@Iz$#Yrz|V`1E@;_$TD|*8wlr%Rc|tz}vtPAie;^ci)-d4DfSg`5%J&!4JUq z!HwY6;1kIASAi?RW^g~U{e9r$pa5hqe-ro>^1S#1xDL#NIq+;Ce)-meU%}I#f*%9n z@4tf2gPXwzf%xfrH`ouJ1%69Cd~{7>+1kod_W#Ghev^N62oX2J~WG2Pp%SZhVqV%KdA zJI7JX7Q4pG8cWSN^(l6xWn9;}P`0@7Kg!M_dkGl~HQDxwazoDCHiZiwziEuceq_9% zQQ>=E2T@^nDlleK-?q)1#kQ(^o|O^YYw9C)vuf)TZPjPCKA+3m zA+V>WEb|I!MSl%!ldwVOUbrd1i9Ecf|6E12p zo(g;Udb>&?s1L+c@9zNT0g?C51{=Y@BJbY=Hi6$F=l=%W3a$fh0X0wsJHR#|zWwD}e)pjZ z*a;p19u7W*%zqe}RacV)J#bdRSYKv~{ozqKXe^}co^MIFNCX5#?SI%1w+)Dl?tUO(mv7C6%OHuiz4ex)_N1q;+`}-SAZ`o1-eLugj@F?;6`R!V;ZbC3g}qalAaCeTvi6#RmySYe$BLXhDJuRHs0C zNUyx2bC6rZr6yNl+e2vuHX0@x+T`SD(^c2xTE!!S(5@T5jO-RyOIN#a_Q6_F$yn8} z?mkps>xY`x`l0&nuBWrtx_g*+wSLKxoqclh(1Tj*c9ydLM{PilW9t-?35$u&Z#bqy zE0<4_&txXDg)wHFMmb)lmR@gn;o4M!UL`nsdNOsm9E=shX>{4+d<`MFyY15Rr^}no zk~7WoeAr;XU(3irC%wSo(GfQxd*-+R6tKr#+G0A-v8{5~v%c#v-AjI+vZoLhErnK5 zC1!_2%n~yZyRn#IaY0Gjs41IvOd6B)XzGqQh~vq|rAAw%HQRO~-#Jq#TtH(P)W+l5 zP@I%DrZK{3i#i~;_FzxEXo)mJP7SmPOSYhh14oV!vSg7P7A(puS>(hoNyS@2ne*7l sARxzAsvKaj=f`w0LV{+qAO18qc*9s5#KL^EX4${wFeYq!u*R|f3;$ ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { if (copymode) return; + + printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n"); + memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_grid,grid); + memoryKK->destroy_kokkos(k_gridall, gridall); + //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } // Init @@ -163,11 +169,17 @@ void ComputeSNAGridKokkos::setup() memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + + // do not use or allocate gridlocal for now + + gridlocal_allocated = 0; + /* if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { gridlocal_allocated = 1; memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, nyhi, 
nxlo, nxhi, "grid:gridlocal"); } + */ array = gridall; d_gridlocal = k_gridlocal.template view(); @@ -331,6 +343,11 @@ void ComputeSNAGridKokkos::operator() (Tag // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. // Main difference is that we don't use the neighbor class or neighbor variables here. + // This is because the grid points are not atoms and therefore do not get assigned + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to + // natoms = max team size). SNAKokkos my_sna = snaKK; @@ -369,6 +386,7 @@ void ComputeSNAGridKokkos::operator() (Tag // printf("ii igrid: %d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz //grid2x(igrid, xgrid); xgrid[0] = ix * delx; xgrid[1] = iy * dely; @@ -634,6 +652,34 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill, const int& ii) const { SNAKokkos my_sna = snaKK; + // convert to grid indices + + int iz = ii/(xlen*ylen); + int i2 = ii - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + // int igrid = iz * (nx * ny) + iy * nx + ix; + // printf("ii igrid: %d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + d_gridall(ii,0) = xtmp; + d_gridall(ii,1) = ytmp; + d_gridall(ii,2) = ztmp; + const auto idxb_max = snaKK.idxb_max; // linear contributions @@ -641,7 +687,7 @@ void ComputeSNAGridKokkos::operator() (Tag for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; 
const auto idx_chem = icoeff / idxb_max; - d_gridall(ii,icoeff) = my_sna.blist(ii,idx_chem,idxb); + d_gridall(ii,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); } } From 709da60474592bfc9729e370935b465d068540fa Mon Sep 17 00:00:00 2001 From: rohskopf Date: Tue, 30 May 2023 11:08:43 -0600 Subject: [PATCH 17/51] Replace limited parallel for with normal for for now --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 86 ++++++++++++++++++----- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 5ec494f206..db6245ec34 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -136,7 +136,7 @@ ComputeSNAGridKokkos::~ComputeSNAGridKokko printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); - memoryKK->destroy_kokkos(k_grid,grid); + //memoryKK->destroy_kokkos(k_grid,grid); memoryKK->destroy_kokkos(k_gridall, gridall); //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } @@ -166,9 +166,11 @@ void ComputeSNAGridKokkos::setup() ComputeGrid::set_grid_local(); // allocate arrays - - memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); + printf(">>> Allocating gridall.\n"); + printf(">>> %d %d\n", size_array_rows, size_array_cols); + //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + printf(">>> Allocated gridall.\n"); // do not use or allocate gridlocal for now @@ -183,7 +185,7 @@ void ComputeSNAGridKokkos::setup() array = gridall; d_gridlocal = k_gridlocal.template view(); - d_grid = k_grid.template view(); + //d_grid = k_grid.template view(); d_gridall = k_gridall.template view(); } @@ -218,6 +220,7 @@ void ComputeSNAGridKokkos::compute_array() // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user 
chunk_size = MIN(chunksize, total_range); snaKK.grow_rij(chunk_size, ntotal); + //snaKK.grow_rij(chunk_size, max_neighs); //chunk_size = total_range; @@ -322,8 +325,8 @@ void ComputeSNAGridKokkos::compute_array() k_gridlocal.template modify(); k_gridlocal.template sync(); - k_grid.template modify(); - k_grid.template sync(); + //k_grid.template modify(); + //k_grid.template sync(); k_gridall.template modify(); k_gridall.template sync(); @@ -411,23 +414,32 @@ void ComputeSNAGridKokkos::operator() (Tag // Compute the number of neighbors, store rsq int ninside = 0; + // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]? - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), - [&] (const int j, int& count) { - - // From pair snap/kk : - /* - T_INT j = d_neighbors(i,jj); + for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; - */ - // From compute sna/grid/kk : - /* - const double delx = xtmp - x[j][0]; - const double dely = ytmp - x[j][1]; - const double delz = ztmp - x[j][2]; - */ + + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + // don't include atoms that share location with grid point + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { + jtype = -1; // use -1 to signal it's outside the radius + } + + type_cache[j] = jtype; + + if (jtype >= 0) + ninside++; + + } + + + /* + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& count) { const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; @@ -446,10 +458,45 @@ void ComputeSNAGridKokkos::operator() (Tag count++; }, ninside); + */ + + + //printf("ninside: %d\n", ninside); d_ninside(ii) = ninside; // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. 
+ int offset = 0; + for (int j = 0; j < ntotal; j++){ + const int jtype = type_cache[j]; + if (jtype >= 0) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + offset++; + } + } + + /* Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), [&] (const int j, int& offset, bool final) { @@ -483,6 +530,7 @@ void ComputeSNAGridKokkos::operator() (Tag offset++; } }); + */ } From 1037e4a4eb672e914df45d718d0d2973dc7b03ff Mon Sep 17 00:00:00 2001 From: rohskopf Date: Tue, 30 May 2023 22:40:12 -0600 Subject: [PATCH 18/51] Use normal loop over ntotal inside neighbor team policy --- src/KOKKOS/compute_sna_grid_kokkos.h | 2 +- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 100 ++++++++++++++++------ src/KOKKOS/memory_kokkos.h | 2 + src/KOKKOS/pair_mliap_kokkos.cpp | 2 + 4 files changed, 81 insertions(+), 25 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 830601c0fb..0f56fdcbf1 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -220,7 +220,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { SNAKokkos snaKK; - int 
chunk_size, chunk_offset; + int max_neighs, chunk_size, chunk_offset; int host_flag; int ntotal; int total_range; // total number of loop iterations in grid diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index db6245ec34..9e704954f1 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -166,11 +166,11 @@ void ComputeSNAGridKokkos::setup() ComputeGrid::set_grid_local(); // allocate arrays - printf(">>> Allocating gridall.\n"); - printf(">>> %d %d\n", size_array_rows, size_array_cols); + //printf(">>> Allocating gridall.\n"); + //printf(">>> %d %d\n", size_array_rows, size_array_cols); //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); - printf(">>> Allocated gridall.\n"); + //printf(">>> Allocated gridall.\n"); // do not use or allocate gridlocal for now @@ -209,6 +209,9 @@ void ComputeSNAGridKokkos::compute_array() x = atomKK->k_x.view(); type = atomKK->k_type.view(); k_cutsq.template sync(); + //printf(">>> max neighs\n"); + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total // number of atoms. 
@@ -216,32 +219,37 @@ void ComputeSNAGridKokkos::compute_array() ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); - + //printf(">>> chunk_size\n"); // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user chunk_size = MIN(chunksize, total_range); - snaKK.grow_rij(chunk_size, ntotal); - //snaKK.grow_rij(chunk_size, max_neighs); + //snaKK.grow_rij(chunk_size, ntotal); + snaKK.grow_rij(chunk_size, max_neighs); //chunk_size = total_range; // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + //printf(">>> Begin computeneigh block\n"); //ComputeNeigh { - int scratch_size = scratch_size_helper(team_size_compute_neigh * ntotal); + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); SnapAoSoATeamPolicy policy_neigh(chunk_size, team_size_compute_neigh, vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + //printf(">>>> blah\n"); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + //printf(">>>> foo\n"); } + //printf(">>>>> Ended compute neigh\n"); + //ComputeCayleyKlein { // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` Snap3DRangePolicy - policy_compute_ck({0,0,0}, {vector_length, ntotal, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); } @@ -265,7 +273,7 @@ void ComputeSNAGridKokkos::compute_array() // Version with parallelism over j_bend // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) - const int n_teams = chunk_size_div * ntotal * (twojmax + 1); + const int n_teams = chunk_size_div * max_neighs * 
(twojmax + 1); const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; SnapAoSoATeamPolicy @@ -276,7 +284,7 @@ void ComputeSNAGridKokkos::compute_array() // Version w/out parallelism over j_bend // total number of teams needed: (natoms / 32) * (ntotal) - const int n_teams = chunk_size_div * ntotal; + const int n_teams = chunk_size_div * max_neighs; const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; SnapAoSoATeamPolicy @@ -353,7 +361,7 @@ void ComputeSNAGridKokkos::operator() (Tag // natoms = max team size). SNAKokkos my_sna = snaKK; - + //printf(">>> Begin computeneigh\n"); // basic quantities associated with this team: // team_rank : rank of thread in this team // league_rank : rank of team in this league @@ -367,11 +375,11 @@ void ComputeSNAGridKokkos::operator() (Tag // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. // If it's not, it's assigned to -1. - const int tile_size = ntotal; // number of elements per thread + const int tile_size = ntotal; //max_neighs; // number of elements per thread const int team_rank = team.team_rank(); const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; - + //printf(">>> Convert to grid indices\n"); // convert to grid indices int iz = ii/(xlen*ylen); @@ -415,12 +423,13 @@ void ComputeSNAGridKokkos::operator() (Tag // Compute the number of neighbors, store rsq int ninside = 0; - // want to loop over ntotal... keep getting seg fault when accessing type_cache[j]? + //printf(">>> Looping over ntotal\n"); + // Looping over ntotal for now. 
for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; - + //printf(">>> jtype\n"); int jtype = type(j); const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; @@ -428,14 +437,17 @@ void ComputeSNAGridKokkos::operator() (Tag if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius } - - type_cache[j] = jtype; + //printf(">>> accessing type cache\n"); + //type_cache[j] = jtype; if (jtype >= 0) ninside++; + //printf(">>> after type cache\n"); + } - + + //printf(">>> after first loop\n"); /* Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), @@ -467,9 +479,46 @@ void ComputeSNAGridKokkos::operator() (Tag // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. int offset = 0; + for (int j = 0; j < ntotal; j++){ + //const int jtype = type_cache[j]; + //if (jtype >= 0) { + //printf(">>> offset: %d\n", offset); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + int jtype = type(j); + if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + 
offset++; + } + } + + //printf(">>> end inside\n"); + + /* + int offset = 0; for (int j = 0; j < ntotal; j++){ const int jtype = type_cache[j]; if (jtype >= 0) { + printf(">>> offset: %d\n", offset); const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; @@ -495,6 +544,9 @@ void ComputeSNAGridKokkos::operator() (Tag offset++; } } + */ + + //printf(">>> End of computeneigh\n"); /* Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), @@ -572,10 +624,10 @@ void ComputeSNAGridKokkos::operator() (Tag int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (ntotal * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (ntotal * (twojmax + 1)); - const int jbend = jj_jbend / ntotal; - int jj = jj_jbend - jbend * ntotal; // removed "const" to work around GCC 7 bug + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), [&] (const int iatom_mod) { @@ -599,8 +651,8 @@ void ComputeSNAGridKokkos::operator() (Tag int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; // extract neighbor index, iatom_div - int iatom_div = flattened_idx / ntotal; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - iatom_div * ntotal; + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), [&] (const int iatom_mod) { diff --git a/src/KOKKOS/memory_kokkos.h 
b/src/KOKKOS/memory_kokkos.h index 35a7ceaeb4..623b002dcb 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -163,6 +163,7 @@ template { data = TYPE(std::string(name),n1,n2); h_data = Kokkos::create_mirror_view(data); + printf(">>> name: %s\n", name); return data; } @@ -173,6 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, data = TYPE(std::string(name),n1,n2); bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; array = (typename TYPE::value_type **) smalloc(nbytes,name); + printf(">>> name %s nbytes %d\n", name, nbytes); for (int i = 0; i < n1; i++) { if (n2 == 0) diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp index d19d81e314..18f5368a98 100644 --- a/src/KOKKOS/pair_mliap_kokkos.cpp +++ b/src/KOKKOS/pair_mliap_kokkos.cpp @@ -232,6 +232,7 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { // map[i] = which element the Ith atom type is, -1 if not mapped // map[0] is not used + //printf(">>> ntypes: %d\n", atom->ntypes); for (int i = 1; i <= atom->ntypes; i++) { char* elemname = elemtypes[i-1]; int jelem; @@ -239,6 +240,7 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { if (strcmp(elemname,descriptor->elements[jelem]) == 0) break; + //printf(">>> nelements: %d\n", descriptor->nelements); if (jelem < descriptor->nelements) map[i] = jelem; else if (strcmp(elemname,"NULL") == 0) map[i] = -1; From 95e39ba89a93e9c0fdcdf5213b7bdfe6934a6f8a Mon Sep 17 00:00:00 2001 From: rohskopf Date: Tue, 30 May 2023 22:53:24 -0600 Subject: [PATCH 19/51] Clean up kernels --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 33 ++++------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 9e704954f1..d6984fbdb1 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -134,7 +134,6 @@ 
ComputeSNAGridKokkos::~ComputeSNAGridKokko { if (copymode) return; - printf("^^^ ComputeSNAGridKokkos destructor begin destroy\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); //memoryKK->destroy_kokkos(k_grid,grid); memoryKK->destroy_kokkos(k_gridall, gridall); @@ -209,7 +208,7 @@ void ComputeSNAGridKokkos::compute_array() x = atomKK->k_x.view(); type = atomKK->k_type.view(); k_cutsq.template sync(); - //printf(">>> max neighs\n"); + // max_neighs is defined here - think of more elaborate methods. max_neighs = 100; @@ -219,7 +218,7 @@ void ComputeSNAGridKokkos::compute_array() ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); - //printf(">>> chunk_size\n"); + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user chunk_size = MIN(chunksize, total_range); //snaKK.grow_rij(chunk_size, ntotal); @@ -230,7 +229,6 @@ void ComputeSNAGridKokkos::compute_array() // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - //printf(">>> Begin computeneigh block\n"); //ComputeNeigh { int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); @@ -238,13 +236,9 @@ void ComputeSNAGridKokkos::compute_array() SnapAoSoATeamPolicy policy_neigh(chunk_size, team_size_compute_neigh, vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - //printf(">>>> blah\n"); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - //printf(">>>> foo\n"); } - //printf(">>>>> Ended compute neigh\n"); - //ComputeCayleyKlein { // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` @@ -361,7 +355,7 @@ void ComputeSNAGridKokkos::operator() (Tag // natoms = max team size). 
SNAKokkos my_sna = snaKK; - //printf(">>> Begin computeneigh\n"); + // basic quantities associated with this team: // team_rank : rank of thread in this team // league_rank : rank of team in this league @@ -379,7 +373,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int team_rank = team.team_rank(); const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; - //printf(">>> Convert to grid indices\n"); + // convert to grid indices int iz = ii/(xlen*ylen); @@ -394,7 +388,6 @@ void ComputeSNAGridKokkos::operator() (Tag // index ii already captures the proper grid point // int igrid = iz * (nx * ny) + iy * nx + ix; - // printf("ii igrid: %d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before // multiply grid integers by grid spacing delx, dely, delz @@ -423,13 +416,11 @@ void ComputeSNAGridKokkos::operator() (Tag // Compute the number of neighbors, store rsq int ninside = 0; - //printf(">>> Looping over ntotal\n"); // Looping over ntotal for now. 
for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; - //printf(">>> jtype\n"); int jtype = type(j); const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; @@ -437,17 +428,11 @@ void ComputeSNAGridKokkos::operator() (Tag if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius } - //printf(">>> accessing type cache\n"); - //type_cache[j] = jtype; if (jtype >= 0) ninside++; - //printf(">>> after type cache\n"); - - } - - //printf(">>> after first loop\n"); + } /* Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), @@ -471,9 +456,6 @@ void ComputeSNAGridKokkos::operator() (Tag }, ninside); */ - - - //printf("ninside: %d\n", ninside); d_ninside(ii) = ninside; @@ -482,7 +464,6 @@ void ComputeSNAGridKokkos::operator() (Tag for (int j = 0; j < ntotal; j++){ //const int jtype = type_cache[j]; //if (jtype >= 0) { - //printf(">>> offset: %d\n", offset); const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; @@ -511,8 +492,6 @@ void ComputeSNAGridKokkos::operator() (Tag } } - //printf(">>> end inside\n"); - /* int offset = 0; for (int j = 0; j < ntotal; j++){ @@ -546,8 +525,6 @@ void ComputeSNAGridKokkos::operator() (Tag } */ - //printf(">>> End of computeneigh\n"); - /* Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), [&] (const int j, int& offset, bool final) { From be5476e442dd66e1854bcb011de3488f9419e8fd Mon Sep 17 00:00:00 2001 From: rohskopf Date: Fri, 2 Jun 2023 15:10:45 -0600 Subject: [PATCH 20/51] Loop over chunks on GPU to write values properly when using default chunk size --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 206 ++++++++++++---------- src/KOKKOS/memory_kokkos.h | 4 +- src/ML-SNAP/compute_grid.cpp | 3 + 3 files changed, 120 insertions(+), 93 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 
d6984fbdb1..cb0a8a646f 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -220,7 +220,10 @@ void ComputeSNAGridKokkos::compute_array() MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. + //printf(">>> total_range: %d\n", total_range); chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; //snaKK.grow_rij(chunk_size, ntotal); snaKK.grow_rij(chunk_size, max_neighs); @@ -229,100 +232,112 @@ void ComputeSNAGridKokkos::compute_array() // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - //ComputeNeigh - { - int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory - SnapAoSoATeamPolicy - policy_neigh(chunk_size, team_size_compute_neigh, vector_length); - policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - } + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; - //ComputeCayleyKlein - { - // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` - Snap3DRangePolicy - policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); - Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); - } + //printf(">>> chunk_offset: %d\n", chunk_offset); - //PreUi - { - // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` - Snap3DRangePolicy - policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } - - // ComputeUi w/ 
vector parallelism, shared memory, direct atomicAdd into ulisttot - { - // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` - // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer - const int tile_size = vector_length * (twojmax + 1); - const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); - - if (chunk_size < parallel_thresh) + //ComputeNeigh { - // Version with parallelism over j_bend + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); - // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) - const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - - SnapAoSoATeamPolicy - policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); - } else { - // Version w/out parallelism over j_bend - - // total number of teams needed: (natoms / 32) * (ntotal) - const int n_teams = chunk_size_div * max_neighs; - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - - SnapAoSoATeamPolicy - policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } - } - //TransformUi: un-"fold" ulisttot, zero ylist - { - // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); - 
Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); - } + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } - //Compute bispectrum in AoSoA data layout, transform Bi + //PreUi + { + // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } - //ComputeZi - const int idxz_max = snaKK.idxz_max; - Snap3DRangePolicy - policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); - //ComputeBi - const int idxb_max = snaKK.idxb_max; - Snap3DRangePolicy - policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); - Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend - //Transform data layout of blist out of AoSoA - //We need this because `blist` gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - //NOTE: Do we need this in compute sna/grid/kk? 
- Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - // Fill the grid array with bispectrum values - { - typename Kokkos::RangePolicy policy_fill(0,chunk_size); - Kokkos::parallel_for(policy_fill, *this); - } + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // team_size_transform_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } + + //Compute bispectrum in AoSoA data layout, transform Bi + + //ComputeZi + const int idxz_max = snaKK.idxz_max; + Snap3DRangePolicy + policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + + //ComputeBi + const int idxb_max = snaKK.idxb_max; + 
Snap3DRangePolicy + policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + + //Transform data layout of blist out of AoSoA + //We need this because `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA, which at best would only be beneficial on the margins + //NOTE: Do we need this in compute sna/grid/kk? + Snap3DRangePolicy + policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + + // Fill the grid array with bispectrum values + { + typename Kokkos::RangePolicy policy_fill(0,chunk_size); + Kokkos::parallel_for(policy_fill, *this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + + } // end while k_gridlocal.template modify(); k_gridlocal.template sync(); @@ -363,8 +378,12 @@ void ComputeSNAGridKokkos::operator() (Tag // extract loop index int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + // extract grid index + int igrid = ii + chunk_offset; + // get a pointer to scratch memory // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. 
@@ -376,8 +395,8 @@ void ComputeSNAGridKokkos::operator() (Tag // convert to grid indices - int iz = ii/(xlen*ylen); - int i2 = ii - (iz*xlen*ylen); + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); int iy = i2/xlen; int ix = i2 % xlen; iz += nzlo; @@ -387,7 +406,8 @@ void ComputeSNAGridKokkos::operator() (Tag double xgrid[3]; // index ii already captures the proper grid point - // int igrid = iz * (nx * ny) + iy * nx + ix; + //int igrid = iz * (nx * ny) + iy * nx + ix; + //printf("%d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before // multiply grid integers by grid spacing delx, dely, delz @@ -729,10 +749,14 @@ KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill, const int& ii) const { SNAKokkos my_sna = snaKK; + + // extract grid index + int igrid = ii + chunk_offset; + // convert to grid indices - int iz = ii/(xlen*ylen); - int i2 = ii - (iz*xlen*ylen); + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); int iy = i2/xlen; int ix = i2 % xlen; iz += nzlo; @@ -753,9 +777,9 @@ void ComputeSNAGridKokkos::operator() (Tag const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; - d_gridall(ii,0) = xtmp; - d_gridall(ii,1) = ytmp; - d_gridall(ii,2) = ztmp; + d_gridall(igrid,0) = xtmp; + d_gridall(igrid,1) = ytmp; + d_gridall(igrid,2) = ztmp; const auto idxb_max = snaKK.idxb_max; @@ -764,7 +788,7 @@ void ComputeSNAGridKokkos::operator() (Tag for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - d_gridall(ii,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); + d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); } } diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 623b002dcb..e40edac607 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -163,7 +163,7 @@ template { data = TYPE(std::string(name),n1,n2); h_data = 
Kokkos::create_mirror_view(data); - printf(">>> name: %s\n", name); + //printf(">>> name: %s\n", name); return data; } @@ -174,7 +174,7 @@ TYPE create_kokkos(TYPE &data, typename TYPE::value_type **&array, data = TYPE(std::string(name),n1,n2); bigint nbytes = ((bigint) sizeof(typename TYPE::value_type *)) * n1; array = (typename TYPE::value_type **) smalloc(nbytes,name); - printf(">>> name %s nbytes %d\n", name, nbytes); + //printf(">>> name %s nbytes %d\n", name, nbytes); for (int i = 0; i < n1; i++) { if (n2 == 0) diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index 12135c705d..dce2ab0283 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -88,6 +88,7 @@ void ComputeGrid::grid2x(int igrid, double *x) x[2] = iz * delz; if (triclinic) domain->lamda2x(x, x); + //printf(">>>>> ComputeGrid::grid2x\n"); } /* ---------------------------------------------------------------------- @@ -103,6 +104,7 @@ void ComputeGrid::assign_coords_all() gridall[igrid][1] = x[1]; gridall[igrid][2] = x[2]; } + //printf(">>>>> ComputeGrid::assign_coords_all\n"); } /* ---------------------------------------------------------------------- @@ -111,6 +113,7 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { + //printf(">>> ComputeGrid::allocate\n"); // allocate arrays memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); From b1ffcbcd4190ccd26b10ab2726aac083fe404740 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Fri, 2 Jun 2023 17:38:48 -0600 Subject: [PATCH 21/51] Fix cutoff factor when switchflag = 0 --- src/KOKKOS/sna_kokkos_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 258fcb97a8..3bc241825b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -2296,7 +2296,7 @@ void SNAKokkos::compute_s_dsfac(const real 
constexpr real_type zero = static_cast(0.0); constexpr real_type onehalf = static_cast(0.5); - if (switch_flag == 0) { sfac_outer = zero; dsfac_outer = zero; } + if (switch_flag == 0) { sfac_outer = one; dsfac_outer = zero; } else if (switch_flag == 1) { if (r <= rmin0) { sfac_outer = one; dsfac_outer = zero; } else if (r > rcut) { sfac = zero; dsfac = zero; return; } From b1105a231baccc1e4ac7092747c051cefd71d4eb Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 4 Jun 2023 05:03:06 -0600 Subject: [PATCH 22/51] Add triclinic cell conversion --- src/KOKKOS/compute_sna_grid_kokkos.h | 11 ++++ src/KOKKOS/compute_sna_grid_kokkos_impl.h | 63 ++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 0f56fdcbf1..fa0c7f18dd 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -269,6 +269,17 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { template int scratch_size_helper(int values_per_team); + class DomainKokkos *domainKK; + + // triclinic vars + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; + }; // These wrapper classes exist to make the compute style factory happy/avoid having diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index cb0a8a646f..6dc3be90d4 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -28,6 +28,8 @@ #include "neigh_request.h" #include "neighbor_kokkos.h" //#include "sna_kokkos.h" +#include "domain.h" +#include "domain_kokkos.h" #include "sna.h" #include "update.h" @@ -49,6 +51,7 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos { kokkosable = 
1; atomKK = (AtomKokkos *) atom; + domainKK = (DomainKokkos *) domain; execution_space = ExecutionSpaceFromDevice::space; datamask_read = EMPTY_MASK; datamask_modify = EMPTY_MASK; @@ -232,6 +235,23 @@ void ComputeSNAGridKokkos::compute_array() // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + if (triclinic){ + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = domain->boxlo[2]; + } + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory if (chunk_size > total_range - chunk_offset) @@ -415,6 +435,26 @@ void ComputeSNAGridKokkos::operator() (Tag xgrid[0] = ix * delx; xgrid[1] = iy * dely; xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. 
+ /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; @@ -429,9 +469,11 @@ void ComputeSNAGridKokkos::operator() (Tag // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. // The purpose here is to transform for triclinic boxes. + /* if (triclinic){ printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); - } + } + */ // Compute the number of neighbors, store rsq int ninside = 0; @@ -774,6 +816,25 @@ void ComputeSNAGridKokkos::operator() (Tag xgrid[0] = ix * delx; xgrid[1] = iy * dely; xgrid[2] = iz * delz; + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. 
+ /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; From 788fd3a9ac1726f892069aa620b4fcdd12fd5b60 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 6 Jun 2023 15:28:41 +0200 Subject: [PATCH 23/51] Re-Adding Gaussian grid again, originally authored by Aidan Thompson Co-authored-by: Aidan Thompson --- src/ML-SNAP/compute_gaussian_grid_local.cpp | 167 ++++++++++++++++++++ src/ML-SNAP/compute_gaussian_grid_local.h | 51 ++++++ 2 files changed, 218 insertions(+) create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.cpp create mode 100644 src/ML-SNAP/compute_gaussian_grid_local.h diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp new file mode 100644 index 0000000000..ec75563bcf --- /dev/null +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -0,0 +1,167 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "compute_gaussian_grid_local.h" + +#include "atom.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" +#include "modify.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; +using MathConst::MY_2PI; +using MathSpecial::powint; + +ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char **arg) : + ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr), + sigmaelem(nullptr), prefacelem(nullptr), argfacelem(nullptr) +{ + // skip over arguments used by base class + // so that argument positions are identical to + // regular per-atom compute + + arg += nargbase; + narg -= nargbase; + + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + + int ntypes = atom->ntypes; + int nargmin = 4 + 2 * ntypes; + + if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style); + + // process required arguments + + memory->create(radelem, ntypes + 1, "gaussian/atom:radelem"); // offset by 1 to match up with types + memory->create(sigmaelem, ntypes + 1, "gaussian/atom:sigmaelem"); + memory->create(prefacelem, ntypes + 1, "gaussian/atom:prefacelem"); + memory->create(argfacelem, ntypes + 1, "gaussian/atom:argfacelem"); + + rcutfac = utils::numeric(FLERR, arg[3], false, lmp); + + for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp); + for (int i = 0; i < ntypes; i++) + sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp); + + // construct cutsq + + double cut; + cutmax = 0.0; + memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq"); + for (int i = 1; i <= ntypes; i++) { + cut = 2.0 * radelem[i] * rcutfac; + if (cut > cutmax) cutmax = cut; + cutsq[i][i] = cut * cut; + for (int j = i + 1; j <= ntypes; j++) { + cut = (radelem[i] + radelem[j]) 
* rcutfac; + cutsq[i][j] = cutsq[j][i] = cut * cut; + } + } + + size_local_cols = size_local_cols_base + ntypes; + + // pre-compute coefficients + + for (int i = 0; i < ntypes; i++) { + prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3); + argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]); + } +} + +/* ---------------------------------------------------------------------- */ + +ComputeGaussianGridLocal::~ComputeGaussianGridLocal() +{ + memory->destroy(radelem); + memory->destroy(sigmaelem); + memory->destroy(prefacelem); + memory->destroy(argfacelem); + memory->destroy(cutsq); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::init() +{ + if ((modify->get_compute_by_style("^gaussian/grid/local$").size() > 1) && (comm->me == 0)) + error->warning(FLERR, "More than one instance of compute gaussian/grid/local"); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeGaussianGridLocal::compute_local() +{ + invoked_local = update->ntimestep; + + // compute gaussian for each gridpoint + + double **const x = atom->x; + const int *const mask = atom->mask; + int *const type = atom->type; + const int ntotal = atom->nlocal + atom->nghost; + + int igrid = 0; + for (int iz = nzlo; iz <= nzhi; iz++) + for (int iy = nylo; iy <= nyhi; iy++) + for (int ix = nxlo; ix <= nxhi; ix++) { + double xgrid[3]; + grid2x(ix, iy, iz, xgrid); + const double xtmp = xgrid[0]; + const double ytmp = xgrid[1]; + const double ztmp = xgrid[2]; + + // Zeroing out the components, which are filled as a sum. 
+ for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + alocal[igrid][icol] = 0.0; + } + + for (int j = 0; j < ntotal; j++) { + + // check that j is in compute group + + if (!(mask[j] & groupbit)) continue; + + const double delx = xtmp - x[j][0]; + const double dely = ytmp - x[j][1]; + const double delz = ztmp - x[j][2]; + const double rsq = delx * delx + dely * dely + delz * delz; + int jtype = type[j]; + if (rsq < cutsq[jtype][jtype]) { + int icol = size_local_cols_base + jtype - 1; + alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]); + } + } + igrid++; + } +} + +/* ---------------------------------------------------------------------- + memory usage +------------------------------------------------------------------------- */ + +double ComputeGaussianGridLocal::memory_usage() +{ + int n = atom->ntypes + 1; + int nbytes = (double) n * sizeof(int); // map + + return nbytes; +} diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h new file mode 100644 index 0000000000..cfab841a6e --- /dev/null +++ b/src/ML-SNAP/compute_gaussian_grid_local.h @@ -0,0 +1,51 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(gaussian/grid/local,ComputeGaussianGridLocal); +// clang-format on +#else + +#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H +#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_H + +#include "compute_grid_local.h" + +namespace LAMMPS_NS { + +class ComputeGaussianGridLocal : public ComputeGridLocal { + public: + ComputeGaussianGridLocal(class LAMMPS *, int, char **); + ~ComputeGaussianGridLocal() override; + void init() override; + void compute_local() override; + double memory_usage() override; + + private: + int ncoeff; + double **cutsq; + double rcutfac; // global cut-off scale + double *radelem; // cut-off radius of each atom type + double *sigmaelem; // Gaussian width of each atom type + double *prefacelem; // Gaussian prefactor of each atom type + double *argfacelem; // Gaussian argument factor of each atom type + int *map; // map types to [0,nelements) + int nelements; + double cutmax; +}; + +} // namespace LAMMPS_NS + +#endif +#endif From fc5e583c56c61fc122d4782f9ddfb88da4109931 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sat, 24 Jun 2023 15:56:54 -0600 Subject: [PATCH 24/51] Initial Gaussian grid implementation --- .../compute_gaussian_grid_local_kokkos.cpp | 85 +++++++++++++++++++ .../compute_gaussian_grid_local_kokkos.h | 75 ++++++++++++++++ src/KOKKOS/compute_sna_grid_kokkos.h | 8 -- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 11 --- 4 files changed, 160 insertions(+), 19 deletions(-) create mode 100644 src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp create mode 100644 src/KOKKOS/compute_gaussian_grid_local_kokkos.h diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp new file mode 100644 index 0000000000..240767e43a --- /dev/null +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -0,0 +1,85 @@ +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Drew Rohskopf (SNL) +------------------------------------------------------------------------- */ + +#include "compute_gaussian_grid_local_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "pair.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeGaussianGridLocal(lmp, narg, arg) +{ + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; +} + +/* ---------------------------------------------------------------------- */ + +template +ComputeGaussianGridLocalKokkos::~ComputeGaussianGridLocalKokkos() +{ + if (copymode) return; + + //memoryKK->destroy_kokkos(k_result,result); +} + +/* ---------------------------------------------------------------------- */ + +template +void 
ComputeGaussianGridLocalKokkos::init() +{ + ComputeGaussianGridLocal::init(); +} + +/* ---------------------------------------------------------------------- */ + +template +void ComputeGaussianGridLocalKokkos::compute_local() +{ + + printf(">>> compute_local Kokkos\n"); + +} + +namespace LAMMPS_NS { +template class ComputeGaussianGridLocalKokkos; +#ifdef LMP_KOKKOS_GPU +template class ComputeGaussianGridLocalKokkos; +#endif +} \ No newline at end of file diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h new file mode 100644 index 0000000000..7698ce9567 --- /dev/null +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -0,0 +1,75 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(gaussian/grid/local/kk,ComputeGaussianGridLocalKokkos); +ComputeStyle(gaussian/grid/local/kk/device,ComputeGaussianGridLocalKokkos); +ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos); +// clang-format on + +#else + +#ifndef LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H +#define LMP_COMPUTE_GAUSSIAN_GRID_LOCAL_KOKKOS_H + +#include "compute_gaussian_grid_local.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +// clang-format off +//struct TagComputeGaussianGridLocal {}; +// clang-format on + +template class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **); + ~ComputeGaussianGridLocalKokkos() override; + void init() override; + void compute_local() override; + + //KOKKOS_INLINE_FUNCTION + //void operator()(TagComputeGaussianGridLocal const int &) const; + + private: + //double adof, mvv2e, mv2d, boltz; + + Kokkos::View d_radelem; // element radii + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + + /* + typename AT::t_x_array x; + typename AT::t_v_array v; + typename ArrayTypes::t_float_1d rmass; + typename ArrayTypes::t_float_1d mass; + typename ArrayTypes::t_int_1d type; + typename ArrayTypes::t_int_1d mask; + */ + + //typename AT::t_neighbors_2d d_neighbors; + //typename AT::t_int_1d d_ilist; + //typename AT::t_int_1d d_numneigh; + + //DAT::tdual_float_2d k_result; + //typename AT::t_float_2d d_result; +}; + +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index fa0c7f18dd..bd47059312 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -252,14 +252,6 @@ class 
ComputeSNAGridKokkos : public ComputeSNAGrid { typename AT::t_float_2d d_grid; typename AT::t_float_2d d_gridall; - //DAT::tdual_float_4d k_gridlocal; - //typedef Kokkos::DualView t_gridlocal_4d; - //typedef Kokkos::View t_4d; - // should we use LMPDeviceType below? - //typedef Kokkos::DualView tdual_float_4d; - //typedef tdual_float_4d::t_dev tdev_float_4d; - //tdual_float_4d k_gridlocal; - //tdev_float_4d d_gridlocal; DAT::tdual_float_4d k_gridlocal; typename AT::t_float_4d d_gridlocal; diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 6dc3be90d4..bd95c6a62c 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -168,22 +168,11 @@ void ComputeSNAGridKokkos::setup() ComputeGrid::set_grid_local(); // allocate arrays - //printf(">>> Allocating gridall.\n"); - //printf(">>> %d %d\n", size_array_rows, size_array_cols); - //memoryKK->create_kokkos(k_grid,grid, size_array_rows, size_array_cols, "grid:grid"); memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); - //printf(">>> Allocated gridall.\n"); // do not use or allocate gridlocal for now gridlocal_allocated = 0; - /* - if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { - gridlocal_allocated = 1; - memoryKK->create4d_offset_kokkos(k_gridlocal, gridlocal, size_array_cols, nzlo, nzhi, nylo, - nyhi, nxlo, nxhi, "grid:gridlocal"); - } - */ array = gridall; d_gridlocal = k_gridlocal.template view(); From 5885f49b751452f3a1009517dfb808f5ce493f0a Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 26 Jun 2023 14:50:44 -0600 Subject: [PATCH 25/51] Prevent polymorphic destructor calls with copymode --- .../compute_gaussian_grid_local_kokkos.cpp | 167 +++++++++++++++++- .../compute_gaussian_grid_local_kokkos.h | 37 +++- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 2 + 3 files changed, 201 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp 
b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 240767e43a..e7da2a315a 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -47,6 +47,49 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP execution_space = ExecutionSpaceFromDevice::space; datamask_read = EMPTY_MASK; datamask_modify = EMPTY_MASK; + + k_cutsq = tdual_fparams("ComputeSNAGridKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // TODO: Extract cutsq in double loop below, no need for cutsq_tmp + + //cutsq_tmp = cutsq[1][1]; + + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp; + k_cutsq.template modify(); + } + } + //printf(">>> 1\n"); + // Set up element lists + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + int n = atom->ntypes; + MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); + //printf(">>> 2\n"); + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_map = Kokkos::create_mirror_view(d_map); + //printf(">>> 3\n"); + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + h_radelem(i-1) = radelem[i]; + } + //printf(">>> 4\n"); + // In pair snap some things like `map` get allocated regardless of chem flag. + // In this compute, however, map does not get allocated in parent classes. 
+ /* + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + } + */ + //printf(">>> 5\n"); + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_map,h_map); + //printf(">>> 6\n"); + } /* ---------------------------------------------------------------------- */ @@ -54,9 +97,40 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP template ComputeGaussianGridLocalKokkos::~ComputeGaussianGridLocalKokkos() { + printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode); if (copymode) return; - //memoryKK->destroy_kokkos(k_result,result); + memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_alocal,alocal); + //gridlocal_allocated = 0; + + printf(">>> ComputeGaussianGridLocalKokkos end\n"); +} + +/* ---------------------------------------------------------------------- */ + +template +void ComputeGaussianGridLocalKokkos::setup() +{ + + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. + // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
+ + //ComputeGrid::set_grid_global(); + //ComputeGrid::set_grid_local(); + ComputeGridLocal::setup(); + + // allocate arrays + printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols); + memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); + + //gridlocal_allocated = 1; + //array = gridall; + + d_alocal = k_alocal.template view(); + //d_grid = k_grid.template view(); + //d_gridall = k_gridall.template view(); + } /* ---------------------------------------------------------------------- */ @@ -72,9 +146,98 @@ void ComputeGaussianGridLocalKokkos::init() template void ComputeGaussianGridLocalKokkos::compute_local() { + printf(">>> compute_local Kokkos begin\n"); - printf(">>> compute_local Kokkos\n"); + if (host_flag) { + return; + } + invoked_local = update->ntimestep; + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + + ntotal = atomKK->nlocal + atomKK->nghost; + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); + + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. 
+ //printf(">>> total_range: %d\n", total_range); + chunksize = 32768; + chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; + + int vector_length_default = 1; + int team_size_default = 1; + if (!host_flag) + team_size_default = 32;//max_neighs; + + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory + + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; + + //Neigh + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + printf(">>> Check 2\n"); + Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + } // end while + + copymode = 0; + + k_alocal.template modify(); + k_alocal.template sync(); + +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy::member_type& team) const +{ + const int ii = team.league_rank(); + //printf("%d\n", ii); +} + +/* ---------------------------------------------------------------------- + check max team size +------------------------------------------------------------------------- */ + +template +template +void ComputeGaussianGridLocalKokkos::check_team_size_for(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; } namespace LAMMPS_NS { diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h 
b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h index 7698ce9567..474797584f 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -29,7 +29,7 @@ ComputeStyle(gaussian/grid/local/kk/host,ComputeGaussianGridLocalKokkos class ComputeGaussianGridLocalKokkos : public ComputeGaussianGridLocal { @@ -37,13 +37,25 @@ template class ComputeGaussianGridLocalKokkos : public Comput typedef DeviceType device_type; typedef ArrayTypes AT; + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; +#else + static constexpr int team_size_compute_neigh = 4; +#endif + ComputeGaussianGridLocalKokkos(class LAMMPS *, int, char **); ~ComputeGaussianGridLocalKokkos() override; + void setup() override; void init() override; void compute_local() override; - //KOKKOS_INLINE_FUNCTION - //void operator()(TagComputeGaussianGridLocal const int &) const; + template + void check_team_size_for(int, int&, int); + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy::member_type& team) const; private: //double adof, mvv2e, mv2d, boltz; @@ -52,6 +64,12 @@ template class ComputeGaussianGridLocalKokkos : public Comput Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_map; // mapping from atom types to elements + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View > t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + /* typename AT::t_x_array x; typename AT::t_v_array v; @@ -67,6 +85,19 @@ template class ComputeGaussianGridLocalKokkos : public Comput //DAT::tdual_float_2d k_result; //typename AT::t_float_2d d_result; + + int max_neighs, inum, chunk_size, chunk_offset; + int host_flag; + int total_range; // total number of loop iterations in grid + int xlen, ylen, zlen; + int chunksize; + int ntotal; + + typename AT::t_x_array_randomread x; + typename 
AT::t_int_1d_randomread type; + + DAT::tdual_float_2d k_alocal; + typename AT::t_float_2d d_alocal; }; } // namespace LAMMPS_NS diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index bd95c6a62c..81f3173a7d 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -135,7 +135,9 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos template ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { + //printf(">>> ComputeSNAGridKokkos destruct begin copymode %d\n", copymode); if (copymode) return; + //printf(">>> After copymode\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); //memoryKK->destroy_kokkos(k_grid,grid); From 9eb26e4cd0c995e4b159d5d765cc73af79b2e703 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 26 Jun 2023 16:43:28 -0600 Subject: [PATCH 26/51] Shallow copy Kokkos written array to returned array variable --- .../compute_gaussian_grid_local_kokkos.cpp | 135 +++++++++++++++++- .../compute_gaussian_grid_local_kokkos.h | 12 ++ src/ML-SNAP/compute_gaussian_grid_local.cpp | 11 +- src/ML-SNAP/compute_gaussian_grid_local.h | 2 +- src/ML-SNAP/compute_grid_local.cpp | 10 ++ 5 files changed, 163 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index e7da2a315a..5158cb5246 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -66,16 +66,25 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP } //printf(">>> 1\n"); // Set up element lists - MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); int n = atom->ntypes; + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1); + MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1); + 
MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1); MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); //printf(">>> 2\n"); auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem); + auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem); + auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem); auto h_map = Kokkos::create_mirror_view(d_map); //printf(">>> 3\n"); // start from index 1 because of how compute sna/grid is for (int i = 1; i <= atom->ntypes; i++) { h_radelem(i-1) = radelem[i]; + h_sigmaelem(i-1) = sigmaelem[i]; + h_prefacelem(i-1) = prefacelem[i]; + h_argfacelem(i-1) = argfacelem[i]; } //printf(">>> 4\n"); // In pair snap some things like `map` get allocated regardless of chem flag. @@ -87,6 +96,9 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP */ //printf(">>> 5\n"); Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_sigmaelem,h_sigmaelem); + Kokkos::deep_copy(d_prefacelem, h_prefacelem); + Kokkos::deep_copy(d_argfacelem, h_argfacelem); Kokkos::deep_copy(d_map,h_map); //printf(">>> 6\n"); @@ -127,6 +139,8 @@ void ComputeGaussianGridLocalKokkos::setup() //gridlocal_allocated = 1; //array = gridall; + array_local = alocal; + d_alocal = k_alocal.template view(); //d_grid = k_grid.template view(); //d_gridall = k_gridall.template view(); @@ -188,6 +202,23 @@ void ComputeGaussianGridLocalKokkos::compute_local() if (!host_flag) team_size_default = 32;//max_neighs; + if (triclinic){ + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = 
domain->boxlo[2]; + } + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory if (chunk_size > total_range - chunk_offset) @@ -198,9 +229,9 @@ void ComputeGaussianGridLocalKokkos::compute_local() int vector_length = vector_length_default; int team_size = team_size_default; check_team_size_for(chunk_size,team_size,vector_length); - printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length); + //printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length); typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); - printf(">>> Check 2\n"); + //printf(">>> Check 2\n"); Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this); } @@ -213,6 +244,8 @@ void ComputeGaussianGridLocalKokkos::compute_local() k_alocal.template modify(); k_alocal.template sync(); + printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6)); + } /* ---------------------------------------------------------------------- */ @@ -223,6 +256,102 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG { const int ii = team.league_rank(); //printf("%d\n", ii); + + if (ii >= chunk_size) return; + + // extract grid index + int igrid = ii + chunk_offset; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. 
+ const int tile_size = ntotal; //max_neighs; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + //int igrid = iz * (nx * ny) + iy * nx + ix; + //printf("%d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + + // Zeroing out the components, which are filled as a sum. 
+ for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + d_alocal(igrid, icol) = 0.0; + } + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + int ielem = 0; + ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + // Compute the number of neighbors, store rsq + int ninside = 0; + + + // Looping over ntotal for now. + + for (int j = 0; j < ntotal; j++){ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + if (rsq < rnd_cutsq(jtype, jtype) ) { + //printf("%f %f\n", d_prefacelem(jtype-1), d_argfacelem(jtype-1)); + int icol = size_local_cols_base + jtype - 1; + d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1)); + } + } + + //printf("%f\n", d_alocal(igrid, 6)); + } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h index 474797584f..db3e87a7e9 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -61,6 +61,9 @@ template class ComputeGaussianGridLocalKokkos : public Comput //double adof, mvv2e, mv2d, boltz; Kokkos::View d_radelem; // element radii + Kokkos::View d_sigmaelem; + Kokkos::View d_prefacelem; + Kokkos::View d_argfacelem; Kokkos::View d_ninside; // ninside for all atoms in list Kokkos::View d_map; // mapping from atom types to elements @@ -98,6 +101,15 @@ template class ComputeGaussianGridLocalKokkos : public Comput DAT::tdual_float_2d k_alocal; typename AT::t_float_2d d_alocal; + + // triclinic vars + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] 
+ domain->boxlo[2]; + */ + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; }; } // namespace LAMMPS_NS diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp index ec75563bcf..c660a16cee 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.cpp +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -41,8 +41,8 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char * arg += nargbase; narg -= nargbase; - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 4 + 2 * ntypes; @@ -91,11 +91,14 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char * ComputeGaussianGridLocal::~ComputeGaussianGridLocal() { + //printf(">>> ComputeGaussianGridLocal begin destruct copymode %d\n", copymode); + if (copymode) return; memory->destroy(radelem); memory->destroy(sigmaelem); memory->destroy(prefacelem); memory->destroy(argfacelem); memory->destroy(cutsq); + //printf(">>> ComputeGaussianGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ @@ -110,6 +113,8 @@ void ComputeGaussianGridLocal::init() void ComputeGaussianGridLocal::compute_local() { + printf(">>> compute_local CPU\n"); + printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols); invoked_local = update->ntimestep; // compute gaussian for each gridpoint @@ -146,7 +151,7 @@ void ComputeGaussianGridLocal::compute_local() const double rsq = delx * delx + dely * dely + delz * delz; int jtype = type[j]; if (rsq < cutsq[jtype][jtype]) { - int icol = size_local_cols_base + jtype - 1; + int icol = size_local_cols_base + jtype - 1; alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]); } } diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h 
b/src/ML-SNAP/compute_gaussian_grid_local.h index cfab841a6e..72e7326b49 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.h +++ b/src/ML-SNAP/compute_gaussian_grid_local.h @@ -32,7 +32,7 @@ class ComputeGaussianGridLocal : public ComputeGridLocal { void compute_local() override; double memory_usage() override; - private: + protected: int ncoeff; double **cutsq; double rcutfac; // global cut-off scale diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 0f275a9aae..5dd8185ae7 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -61,13 +61,16 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeGridLocal::~ComputeGridLocal() { + printf(">>> ComputeGridLocal begin destruct\n"); deallocate(); + printf(">>> ComputeGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ void ComputeGridLocal::setup() { + printf(">>> ComputeGridLocal setup\n"); deallocate(); set_grid_global(); set_grid_local(); @@ -106,6 +109,7 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x) void ComputeGridLocal::allocate() { + printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { gridlocal_allocated = 1; memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal"); @@ -119,10 +123,14 @@ void ComputeGridLocal::allocate() void ComputeGridLocal::deallocate() { + //printf(">>> ComputeGridLocal::deallocate begin gridlocal_allocated %d copymode %d\n", gridlocal_allocated, copymode); + if (copymode) return; + if (gridlocal_allocated) { gridlocal_allocated = 0; memory->destroy(alocal); } + //printf(">>> ComputeGridLocal:: deallocate end\n"); array_local = nullptr; } @@ -178,6 +186,8 @@ void ComputeGridLocal::set_grid_local() // the 2 equality if tests ensure a consistent decision // as to which proc owns it + //printf(">>> 
ComputeGridLocal set_grid_local\n"); + double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi; if (comm->layout != Comm::LAYOUT_TILED) { From c871fe8505067017ca20e3ce46f8ba01344f4802 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 26 Jun 2023 17:03:05 -0600 Subject: [PATCH 27/51] Fill grid geometry info --- src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 5158cb5246..11eda3a3e2 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -322,6 +322,14 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG d_alocal(igrid, icol) = 0.0; } + // Fill grid info columns + d_alocal(igrid, 0) = ix; + d_alocal(igrid, 1) = iy; + d_alocal(igrid, 2) = iz; + d_alocal(igrid, 3) = xtmp; + d_alocal(igrid, 4) = ytmp; + d_alocal(igrid, 5) = ztmp; + // currently, all grid points are type 1 // not clear what a better choice would be From 969cc5dc035d69c574aa860283ff6728c2553887 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 26 Jun 2023 17:41:13 -0600 Subject: [PATCH 28/51] Tweak TeamPolicy settings for speedup --- src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 11eda3a3e2..55e5b599e7 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -193,14 +193,14 @@ void ComputeGaussianGridLocalKokkos::compute_local() // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. 
//printf(">>> total_range: %d\n", total_range); - chunksize = 32768; + chunksize = 10*32768; // 100*32768 chunk_size = MIN(chunksize, total_range); chunk_offset = 0; int vector_length_default = 1; int team_size_default = 1; if (!host_flag) - team_size_default = 32;//max_neighs; + team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs; if (triclinic){ /* From be5eb198c345ba35e9020a5d2fbd0cefbbe47f90 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Tue, 27 Jun 2023 11:50:00 -0600 Subject: [PATCH 29/51] Clean up debug prints --- src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 12 ++++++------ src/ML-SNAP/compute_gaussian_grid_local.cpp | 4 ++-- src/ML-SNAP/compute_grid_local.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 55e5b599e7..a52d747922 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -109,14 +109,14 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP template ComputeGaussianGridLocalKokkos::~ComputeGaussianGridLocalKokkos() { - printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode); + //printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode); if (copymode) return; memoryKK->destroy_kokkos(k_cutsq,cutsq); memoryKK->destroy_kokkos(k_alocal,alocal); //gridlocal_allocated = 0; - printf(">>> ComputeGaussianGridLocalKokkos end\n"); + //printf(">>> ComputeGaussianGridLocalKokkos end\n"); } /* ---------------------------------------------------------------------- */ @@ -133,7 +133,7 @@ void ComputeGaussianGridLocalKokkos::setup() ComputeGridLocal::setup(); // allocate arrays - printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols); + //printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols); 
memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); //gridlocal_allocated = 1; @@ -160,7 +160,7 @@ void ComputeGaussianGridLocalKokkos::init() template void ComputeGaussianGridLocalKokkos::compute_local() { - printf(">>> compute_local Kokkos begin\n"); + //printf(">>> compute_local Kokkos begin\n"); if (host_flag) { return; @@ -193,7 +193,7 @@ void ComputeGaussianGridLocalKokkos::compute_local() // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. //printf(">>> total_range: %d\n", total_range); - chunksize = 10*32768; // 100*32768 + chunksize = 32768; // 100*32768 chunk_size = MIN(chunksize, total_range); chunk_offset = 0; @@ -244,7 +244,7 @@ void ComputeGaussianGridLocalKokkos::compute_local() k_alocal.template modify(); k_alocal.template sync(); - printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6)); + //printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6)); } diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp index c660a16cee..c18aa31d05 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.cpp +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -113,8 +113,8 @@ void ComputeGaussianGridLocal::init() void ComputeGaussianGridLocal::compute_local() { - printf(">>> compute_local CPU\n"); - printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols); + //printf(">>> compute_local CPU\n"); + //printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols); invoked_local = update->ntimestep; // compute gaussian for each gridpoint diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 5dd8185ae7..6065f38171 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -61,16 +61,16 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, 
int narg, char **arg) : ComputeGridLocal::~ComputeGridLocal() { - printf(">>> ComputeGridLocal begin destruct\n"); + //printf(">>> ComputeGridLocal begin destruct\n"); deallocate(); - printf(">>> ComputeGridLocal end destruct\n"); + //printf(">>> ComputeGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ void ComputeGridLocal::setup() { - printf(">>> ComputeGridLocal setup\n"); + //printf(">>> ComputeGridLocal setup\n"); deallocate(); set_grid_global(); set_grid_local(); @@ -109,7 +109,7 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x) void ComputeGridLocal::allocate() { - printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols); + //printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { gridlocal_allocated = 1; memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal"); From 3f9cc8f0fdd4bf272da65ee5cc1eb66df1bef6c9 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Fri, 7 Jul 2023 17:08:41 -0600 Subject: [PATCH 30/51] Initial sna/grid/local/kk implement --- src/KOKKOS/compute_sna_grid_local_kokkos.cpp | 81 ++ src/KOKKOS/compute_sna_grid_local_kokkos.h | 310 ++++++ .../compute_sna_grid_local_kokkos_impl.h | 924 ++++++++++++++++++ src/ML-SNAP/compute_sna_grid_local.cpp | 6 +- src/ML-SNAP/compute_sna_grid_local.h | 6 +- 5 files changed, 1324 insertions(+), 3 deletions(-) create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos.cpp create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos.h create mode 100644 src/KOKKOS/compute_sna_grid_local_kokkos_impl.h diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp new file mode 100644 index 0000000000..087dbc5fd5 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp @@ -0,0 +1,81 @@ +// clang-format off +/* 
---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_local_kokkos.h" +#include "compute_sna_grid_local_kokkos_impl.h" + +namespace LAMMPS_NS { + +template class ComputeSNAGridLocalKokkosDevice; +#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridLocalKokkosHost; +#endif + +} + + + + +// The following chunk will compile but we're gonna try a wrapper approach like pair snap. +/* +#include "compute_sna_grid_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +#include "sna_kokkos.h" +#include "update.h" + +using namespace LAMMPS_NS; + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : + ComputeSNAGrid(lmp, narg, arg) +{ + + printf("^^^ inside ComputeSNAGridKokkos constructor\n"); + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + +} + +// ---------------------------------------------------------------------- + +template +ComputeSNAGridKokkos::~ComputeSNAGridKokkos() +{ + if (copymode) return; + + +} + +namespace LAMMPS_NS { +template class ComputeSNAGridKokkos; 
+#ifdef LMP_KOKKOS_GPU +template class ComputeSNAGridKokkos; +#endif +} +*/ + diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h new file mode 100644 index 0000000000..9fccb39aa2 --- /dev/null +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -0,0 +1,310 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(sna/grid/local/kk,ComputeSNAGridLocalKokkosDevice); +ComputeStyle(sna/grid/local/kk/device,ComputeSNAGridLocalKokkosDevice); +#ifdef LMP_KOKKOS_GPU +ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosHost); +#else +ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice); +#endif +// clang-format on +#else + +// clang-format off +#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H +#define LMP_COMPUTE_SNA_GRID_LOCAL_KOKKOS_H + +#include "compute_sna_grid_local.h" +#include "kokkos_type.h" +//#include "pair_snap.h" +//#include "kokkos_type.h" +//#include "neigh_list_kokkos.h" +#include "sna_kokkos.h" +//#include "pair_kokkos.h" + +namespace LAMMPS_NS { + +// Routines for both the CPU and GPU backend +//template +//struct TagPairSNAPComputeForce{}; + + +// GPU backend only +/* +struct TagPairSNAPComputeNeigh{}; +struct TagPairSNAPComputeCayleyKlein{}; +struct TagPairSNAPPreUi{}; +struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence +struct 
TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagPairSNAPComputeZi{}; +struct TagPairSNAPBeta{}; +struct TagPairSNAPComputeBi{}; +struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS +struct TagPairSNAPComputeYi{}; +struct TagPairSNAPComputeYiWithZlist{}; +template +struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence +template +struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence +*/ +//struct TagPairSNAPPreUi{}; +struct TagCSNAGridLocalComputeNeigh{}; +struct TagCSNAGridLocalComputeCayleyKlein{}; +struct TagCSNAGridLocalPreUi{}; +struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence +struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence +struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +struct TagCSNAGridLocalComputeZi{}; +struct TagCSNAGridLocalComputeBi{}; +struct TagCSNAGridLocalTransformBi{}; // re-order blist from AoSoA to AoS +struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array +//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce + +struct TagComputeSNAGridLocalLoop{}; +struct TagComputeSNAGridLocal3D{}; + +// CPU backend only +/* +struct TagPairSNAPComputeNeighCPU{}; +struct TagPairSNAPPreUiCPU{}; +struct TagPairSNAPComputeUiCPU{}; +struct TagPairSNAPTransformUiCPU{}; +struct TagPairSNAPComputeZiCPU{}; +struct TagPairSNAPBetaCPU{}; +struct TagPairSNAPComputeBiCPU{}; +struct TagPairSNAPZeroYiCPU{}; +struct TagPairSNAPComputeYiCPU{}; +struct TagPairSNAPComputeDuidrjCPU{}; +struct TagPairSNAPComputeDeidrjCPU{}; +*/ +struct TagComputeSNAGridLocalLoopCPU{}; + +//template +template +class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + static constexpr int 
vector_length = vector_length_; + using real_type = real_type_; + using complex = SNAComplex; + + // Static team/tile sizes for device offload + +#ifdef KOKKOS_ENABLE_HIP + static constexpr int team_size_compute_neigh = 2; + static constexpr int tile_size_compute_ck = 2; + static constexpr int tile_size_pre_ui = 2; + static constexpr int team_size_compute_ui = 2; + static constexpr int tile_size_transform_ui = 2; + static constexpr int tile_size_compute_zi = 2; + static constexpr int tile_size_compute_bi = 2; + static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_compute_yi = 2; + static constexpr int team_size_compute_fused_deidrj = 2; +#else + static constexpr int team_size_compute_neigh = 4; + static constexpr int tile_size_compute_ck = 4; + static constexpr int tile_size_pre_ui = 4; + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; + static constexpr int tile_size_transform_ui = 4; + static constexpr int tile_size_compute_zi = 8; + static constexpr int tile_size_compute_bi = 4; + static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_yi = 8; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 
4 : 2; +#endif + + // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches + // This hides the Kokkos::IndexType and Kokkos::Rank<3...> + // and reduces the verbosity of the LaunchBound by hiding the explicit + // multiplication by vector_length + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNAP>; + + // MDRangePolicy for the 3D grid loop: + template + using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; + + // Testing out team policies + template + using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + + // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches + // This hides the LaunchBounds abstraction by hiding the explicit + // multiplication by vector length + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + + ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **); + ~ComputeSNAGridLocalKokkos() override; + + void init() override; + void setup() override; + void compute_local() override; + + // Utility functions for teams + + template + void check_team_size_for(int, int&); + + template + void check_team_size_reduce(int, int&); + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLocalLoop, const int& ) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagComputeSNAGridLocalLoopCPU, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + // PrintNeigh + //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const; + + KOKKOS_INLINE_FUNCTION + void operator() 
(TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocal2Fill,const int& ii) const; + + protected: + + SNAKokkos snaKK; + + int max_neighs, chunk_size, chunk_offset; + int host_flag; + int ntotal; + int total_range; // total number of loop iterations in grid + int zlen; //= nzhi-nzlo+1; + int ylen; //= nyhi-nylo+1; + int xlen; //= nxhi-nxlo+1; + + double cutsq_tmp; // temporary cutsq until we get a view + + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + //Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_sinnerelem; // element inner cutoff midpoint + Kokkos::View d_dinnerelem; // element inner cutoff half-width + Kokkos::View d_ninside; // ninside for all atoms in list + Kokkos::View d_map; // mapping from atom types to elements + Kokkos::View d_test; // test view + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq; + typedef Kokkos::View 
> t_fparams_rnd; + t_fparams_rnd rnd_cutsq; + + typename AT::t_x_array_randomread x; + typename AT::t_int_1d_randomread type; + DAT::tdual_float_2d k_grid; + DAT::tdual_float_2d k_gridall; + typename AT::t_float_2d d_grid; + typename AT::t_float_2d d_gridall; + + DAT::tdual_float_4d k_gridlocal; + typename AT::t_float_4d d_gridlocal; + + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + + class DomainKokkos *domainKK; + + // triclinic vars + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + double h0, h1, h2, h3, h4, h5; + double lo0, lo1, lo2; + +}; + +// These wrapper classes exist to make the compute style factory happy/avoid having +// to extend the compute style factory to support Compute classes w/an arbitrary number +// of extra template parameters + +template +class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos { + + private: + using Base = ComputeSNAGridLocalKokkos; + + public: + + ComputeSNAGridLocalKokkosDevice(class LAMMPS *, int, char **); + + void init() override; + void compute_local() override; + //void setup() override; + +}; + +#ifdef LMP_KOKKOS_GPU +template +class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos { + + private: + using Base = ComputeSNAGridLocalKokkos; + + public: + + ComputeSNAGridLocalKokkosHost(class LAMMPS *, int, char **); + + void init() override; + void compute_local() override; + +}; +#endif + +} + +#endif +#endif \ No newline at end of file diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h new file mode 100644 index 0000000000..67ea878143 --- /dev/null +++ 
b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -0,0 +1,924 @@ +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Evan Weinberg (NVIDIA) +------------------------------------------------------------------------- */ + +#include "compute_sna_grid_local_kokkos.h" +#include "pair_snap_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "memory_kokkos.h" +#include "modify.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor_kokkos.h" +//#include "sna_kokkos.h" +#include "domain.h" +#include "domain_kokkos.h" +#include "sna.h" +#include "update.h" + +#include +#include +#include + +#include + +#define MAXLINE 1024 +#define MAXWORD 3 + +namespace LAMMPS_NS { + +// Constructor + +template +ComputeSNAGridLocalKokkos::ComputeSNAGridLocalKokkos(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal(lmp, narg, arg) +{ + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + domainKK = (DomainKokkos *) domain; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + k_cutsq = tdual_fparams("ComputeSNAGridLocalKokkos::cutsq",atom->ntypes+1,atom->ntypes+1); + auto d_cutsq = k_cutsq.template view(); + 
rnd_cutsq = d_cutsq; + + host_flag = (execution_space == Host); + + // TODO: Extract cutsq in double loop below, no need for cutsq_tmp + + cutsq_tmp = cutsq[1][1]; + + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = 1; j <= atom->ntypes; j++){ + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq_tmp; + k_cutsq.template modify(); + } + } + + // Set up element lists + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridLocalKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_wjelem,"ComputeSNAGridLocalKokkos:wjelem",nelements); + MemKK::realloc_kokkos(d_sinnerelem,"ComputeSNAGridLocalKokkos:sinnerelem",nelements); + MemKK::realloc_kokkos(d_dinnerelem,"ComputeSNAGridLocalKokkos:dinnerelem",nelements); + // test + MemKK::realloc_kokkos(d_test, "ComputeSNAGridLocalKokkos::test", nelements); + + int n = atom->ntypes; + MemKK::realloc_kokkos(d_map,"ComputeSNAGridLocalKokkos::map",n+1); + + auto h_radelem = Kokkos::create_mirror_view(d_radelem); + auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); + auto h_sinnerelem = Kokkos::create_mirror_view(d_sinnerelem); + auto h_dinnerelem = Kokkos::create_mirror_view(d_dinnerelem); + auto h_map = Kokkos::create_mirror_view(d_map); + // test + auto h_test = Kokkos::create_mirror_view(d_test); + h_test(0) = 2.0; + + // start from index 1 because of how compute sna/grid is + for (int i = 1; i <= atom->ntypes; i++) { + h_radelem(i-1) = radelem[i]; + h_wjelem(i-1) = wjelem[i]; + if (switchinnerflag){ + h_sinnerelem(i) = sinnerelem[i]; + h_dinnerelem(i) = dinnerelem[i]; + } + } + + // In pair snap some things like `map` get allocated regardless of chem flag. 
+ if (chemflag){ + for (int i = 1; i <= atom->ntypes; i++) { + h_map(i) = map[i]; + } + } + + Kokkos::deep_copy(d_radelem,h_radelem); + Kokkos::deep_copy(d_wjelem,h_wjelem); + if (switchinnerflag){ + Kokkos::deep_copy(d_sinnerelem,h_sinnerelem); + Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); + } + if (chemflag){ + Kokkos::deep_copy(d_map,h_map); + } + Kokkos::deep_copy(d_test,h_test); + + double bytes = MemKK::memory_usage(d_wjelem); + + snaKK = SNAKokkos(rfac0,twojmax, + rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK.grow_rij(0,0); + snaKK.init(); + +} + +// Destructor + +template +ComputeSNAGridLocalKokkos::~ComputeSNAGridLocalKokkos() +{ + //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); + if (copymode) return; + //printf(">>> After copymode\n"); + + memoryKK->destroy_kokkos(k_cutsq,cutsq); + //memoryKK->destroy_kokkos(k_grid,grid); + //memoryKK->destroy_kokkos(k_gridall, gridall); + //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); +} + +// Init + +template +void ComputeSNAGridLocalKokkos::init() +{ + if (host_flag) { + return; + } + ComputeSNAGridLocal::init(); + +} + +// Setup + +template +void ComputeSNAGridLocalKokkos::setup() +{ + + // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. + // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
+ + //ComputeGrid::set_grid_global(); + //ComputeGrid::set_grid_local(); + //ComputeSNAGridLocal::setup(); + + // allocate arrays + //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + + // do not use or allocate gridlocal for now + + gridlocal_allocated = 0; + //array = gridall; + + d_gridlocal = k_gridlocal.template view(); + //d_grid = k_grid.template view(); + d_gridall = k_gridall.template view(); +} + +// Compute + +template +void ComputeSNAGridLocalKokkos::compute_local() +{ + if (host_flag) { + return; + } + + copymode = 1; + + zlen = nzhi-nzlo+1; + ylen = nyhi-nylo+1; + xlen = nxhi-nxlo+1; + total_range = (nzhi-nzlo+1)*(nyhi-nylo+1)*(nxhi-nxlo+1); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + type = atomKK->k_type.view(); + k_cutsq.template sync(); + + // max_neighs is defined here - think of more elaborate methods. + max_neighs = 100; + + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // number of atoms. + + ntotal = atomKK->nlocal + atomKK->nghost; + // Allocate view for number of neighbors per grid point + MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); + + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // `total_range` is the number of grid points which may be larger than chunk size. 
+ //printf(">>> total_range: %d\n", total_range); + chunk_size = MIN(chunksize, total_range); + chunk_offset = 0; + //snaKK.grow_rij(chunk_size, ntotal); + snaKK.grow_rij(chunk_size, max_neighs); + + //chunk_size = total_range; + + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + if (triclinic){ + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + h0 = domain->h[0]; + h1 = domain->h[1]; + h2 = domain->h[2]; + h3 = domain->h[3]; + h4 = domain->h[4]; + h5 = domain->h[5]; + lo0 = domain->boxlo[0]; + lo1 = domain->boxlo[1]; + lo2 = domain->boxlo[2]; + } + + while (chunk_offset < total_range) { // chunk up loop to prevent running out of memory + + if (chunk_size > total_range - chunk_offset) + chunk_size = total_range - chunk_offset; + + //printf(">>> chunk_offset: %d\n", chunk_offset); + + //ComputeNeigh + { + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); + + SnapAoSoATeamPolicy + policy_neigh(chunk_size, team_size_compute_neigh, vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + // tile_size_compute_ck is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0}, {vector_length, max_neighs, chunk_size_div}, {vector_length, tile_size_compute_ck, 1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + //PreUi + { + // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` + Snap3DRangePolicy + policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); + 
Kokkos::parallel_for("PreUi",policy_preui,*this); + } + + // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot + { + // team_size_compute_ui is defined in `compute_sna_grid_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) + { + // Version with parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); + } else { + // Version w/out parallelism over j_bend + + // total number of teams needed: (natoms / 32) * (ntotal) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge", policy_ui, *this); + } + } + + //TransformUi: un-"fold" ulisttot, zero ylist + { + // team_size_transform_ui is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); + Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + } + + //Compute bispectrum in AoSoA data layout, transform Bi + + //ComputeZi + const int idxz_max = snaKK.idxz_max; + Snap3DRangePolicy + 
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); + Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + + //ComputeBi + const int idxb_max = snaKK.idxb_max; + Snap3DRangePolicy + policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); + Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + + //Transform data layout of blist out of AoSoA + //We need this because `blist` gets used in ComputeForce which doesn't + //take advantage of AoSoA, which at best would only be beneficial on the margins + //NOTE: Do we need this in compute sna/grid/kk? + Snap3DRangePolicy + policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); + Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + + // Fill the grid array with bispectrum values + { + typename Kokkos::RangePolicy policy_fill(0,chunk_size); + Kokkos::parallel_for(policy_fill, *this); + } + + // Proceed to the next chunk. + chunk_offset += chunk_size; + + } // end while + + k_gridlocal.template modify(); + k_gridlocal.template sync(); + + //k_grid.template modify(); + //k_grid.template sync(); + + k_gridall.template modify(); + k_gridall.template sync(); +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials +------------------------------------------------------------------------- */ + +/* + Simple team policy functor seeing how many layers deep we can go with the parallelism. + */ +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. 
+ // Main difference is that we don't use the neighbor class or neighbor variables here. + // This is because the grid points are not atoms and therefore do not get assigned + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to + // natoms = max team size). + + SNAKokkos my_sna = snaKK; + + // basic quantities associated with this team: + // team_rank : rank of thread in this team + // league_rank : rank of team in this league + // team_size : number of threads in this team + + // extract loop index + int ii = team.team_rank() + team.league_rank() * team.team_size(); + + if (ii >= chunk_size) return; + + // extract grid index + int igrid = ii + chunk_offset; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + const int tile_size = ntotal; //max_neighs; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + //int igrid = iz * (nx * ny) + iy * nx + ix; + //printf("%d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + // multiply grid integers by grid spacing delx, dely, delz + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. 
+ + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. + /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + + // currently, all grid points are type 1 + // not clear what a better choice would be + + const int itype = 1; + int ielem = 0; + if (chemflag) ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. + // The purpose here is to transform for triclinic boxes. + /* + if (triclinic){ + printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); + } + */ + + // Compute the number of neighbors, store rsq + int ninside = 0; + + // Looping over ntotal for now. 
+ for (int j = 0; j < ntotal; j++){ + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + // don't include atoms that share location with grid point + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { + jtype = -1; // use -1 to signal it's outside the radius + } + + if (jtype >= 0) + ninside++; + + } + + /* + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& count) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + // don't include atoms that share location with grid point + if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { + jtype = -1; // use -1 to signal it's outside the radius + } + + type_cache[j] = jtype; + + if (jtype >= 0) + count++; + + }, ninside); + */ + + d_ninside(ii) = ninside; + + // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. 
+ int offset = 0; + for (int j = 0; j < ntotal; j++){ + //const int jtype = type_cache[j]; + //if (jtype >= 0) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + int jtype = type(j); + if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + offset++; + } + } + + /* + int offset = 0; + for (int j = 0; j < ntotal; j++){ + const int jtype = type_cache[j]; + if (jtype >= 0) { + printf(">>> offset: %d\n", offset); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 
0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + offset++; + } + } + */ + + /* + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), + [&] (const int j, int& offset, bool final) { + + const int jtype = type_cache[j]; + + if (jtype >= 0) { + if (final) { + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + int jtype = type(j); + int jelem = 0; + if (chemflag) jelem = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp + // actually since the views here have values starting at 0, let's use jelem + my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); + my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + my_sna.inside(ii,offset) = j; + if (switchinnerflag) { + my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + } + if (chemflag) + my_sna.element(ii,offset) = jelem; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); + */ +} + + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jnbor >= ninside) return; + + my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const { 
+ SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + int itype = type(ii); + // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp` + int ielem = 0; + + my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend_location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = 
iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxu > my_sna.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++){ + + const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + + auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + + if (mapper.flip_sign == 1){ + utot_im = -utot_im; + } else if (mapper.flip_sign == -1){ + utot_re = -utot_re; + } + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjb >= 
my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(iatom, itriple, idxb) = blocal; + } + +} + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocal2Fill, const int& ii) const { + SNAKokkos my_sna = snaKK; + + + // extract grid index + int igrid = ii + chunk_offset; + + // convert to grid indices + + int iz = igrid/(xlen*ylen); + int i2 = igrid - (iz*xlen*ylen); + int iy = i2/xlen; + int ix = i2 % xlen; + iz += nzlo; + iy += nylo; + ix += nxlo; + + double xgrid[3]; + + // index ii already captures the proper grid point + // int igrid = iz * (nx * ny) + iy * nx + ix; + // printf("ii igrid: %d %d\n", ii, igrid); + + // grid2x converts igrid to ix,iy,iz like we've done before + //grid2x(igrid, xgrid); + xgrid[0] = ix * delx; + xgrid[1] = iy * dely; + xgrid[2] = iz * delz; + if (triclinic) { + + // Do a conversion on `xgrid` here like we do in the CPU version. + + // Can't do this: + // domainKK->lamda2x(xgrid, xgrid); + // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed + + // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. 
+ /* + xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; + xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; + xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; + */ + xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; + xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; + xgrid[2] = h2*xgrid[2] + lo2; + } + + const F_FLOAT xtmp = xgrid[0]; + const F_FLOAT ytmp = xgrid[1]; + const F_FLOAT ztmp = xgrid[2]; + d_gridall(igrid,0) = xtmp; + d_gridall(igrid,1) = ytmp; + d_gridall(igrid,2) = ztmp; + + const auto idxb_max = snaKK.idxb_max; + + // linear contributions + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); + } + +} + +/* ---------------------------------------------------------------------- + utility functions +------------------------------------------------------------------------- */ + +template +template +void ComputeSNAGridLocalKokkos::check_team_size_for(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +void ComputeSNAGridLocalKokkos::check_team_size_reduce(int inum, int &team_size) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int ComputeSNAGridLocalKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- */ + +/* 
---------------------------------------------------------------------- + routines used by template reference classes +------------------------------------------------------------------------- */ + + +template +ComputeSNAGridLocalKokkosDevice::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridLocalKokkosDevice::init() +{ + Base::init(); +} + +template +void ComputeSNAGridLocalKokkosDevice::compute_local() +{ + Base::compute_local(); +} + +#ifdef LMP_KOKKOS_GPU +template +ComputeSNAGridLocalKokkosHost::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg) + : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } + +template +void ComputeSNAGridLocalKokkosHost::init() +{ + Base::init(); +} + +template +void ComputeSNAGridLocalKokkosHost::compute_local() +{ + Base::compute_local(); +} +#endif + +} diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp index 1d42a42c05..3981970506 100644 --- a/src/ML-SNAP/compute_sna_grid_local.cpp +++ b/src/ML-SNAP/compute_sna_grid_local.cpp @@ -37,8 +37,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : // begin code common to all SNAP computes - double rfac0, rmin0; - int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + //double rfac0, rmin0; + //int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; int ntypes = atom->ntypes; int nargmin = 6 + 2 * ntypes; @@ -56,6 +56,8 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : wselfallflag = 0; switchinnerflag = 0; nelements = 1; + chunksize = 32768; + parallel_thresh = 8192; // process required arguments diff --git a/src/ML-SNAP/compute_sna_grid_local.h b/src/ML-SNAP/compute_sna_grid_local.h index 0475212e13..85662ad509 100644 --- a/src/ML-SNAP/compute_sna_grid_local.h +++ b/src/ML-SNAP/compute_sna_grid_local.h @@ -32,7 +32,7 @@ class ComputeSNAGridLocal : 
public ComputeGridLocal { void compute_local() override; double memory_usage() override; - private: + protected: int ncoeff; double **cutsq; double rcutfac; @@ -46,6 +46,10 @@ class ComputeSNAGridLocal : public ComputeGridLocal { class SNA *snaptr; double cutmax; int quadraticflag; + double rfac0, rmin0; + int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag; + int chunksize; + int parallel_thresh; }; } // namespace LAMMPS_NS From b5dc7d58a8645ad19e5152dc050505a796118efc Mon Sep 17 00:00:00 2001 From: rohskopf Date: Sun, 9 Jul 2023 10:20:45 -0600 Subject: [PATCH 31/51] Destruct sna/grid/local/kk properly and use local arrays --- src/KOKKOS/compute_sna_grid_local_kokkos.h | 6 ++++ .../compute_sna_grid_local_kokkos_impl.h | 35 +++++++++++-------- src/ML-SNAP/compute_grid_local.cpp | 6 ++-- src/ML-SNAP/compute_sna_grid_local.cpp | 1 + 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index 9fccb39aa2..d11d2e1623 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -240,6 +240,11 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { typename AT::t_x_array_randomread x; typename AT::t_int_1d_randomread type; + + DAT::tdual_float_2d k_alocal; + typename AT::t_float_2d d_alocal; + + /* DAT::tdual_float_2d k_grid; DAT::tdual_float_2d k_gridall; typename AT::t_float_2d d_grid; @@ -247,6 +252,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { DAT::tdual_float_4d k_gridlocal; typename AT::t_float_4d d_gridlocal; + */ // Utility routine which wraps computing per-team scratch size requirements for diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index 67ea878143..e8555a2101 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -135,11 +135,12 @@ 
ComputeSNAGridLocalKokkos::ComputeSNAGridL template ComputeSNAGridLocalKokkos::~ComputeSNAGridLocalKokkos() { - //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); + printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); if (copymode) return; //printf(">>> After copymode\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); + memoryKK->destroy_kokkos(k_alocal,alocal); //memoryKK->destroy_kokkos(k_grid,grid); //memoryKK->destroy_kokkos(k_gridall, gridall); //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); @@ -169,18 +170,23 @@ void ComputeSNAGridLocalKokkos::setup() //ComputeGrid::set_grid_global(); //ComputeGrid::set_grid_local(); //ComputeSNAGridLocal::setup(); + ComputeGridLocal::setup(); // allocate arrays //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); + memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); // do not use or allocate gridlocal for now - gridlocal_allocated = 0; + //gridlocal_allocated = 0; //array = gridall; - d_gridlocal = k_gridlocal.template view(); + array_local = alocal; + + //d_gridlocal = k_gridlocal.template view(); //d_grid = k_grid.template view(); - d_gridall = k_gridall.template view(); + //d_gridall = k_gridall.template view(); + d_alocal = k_alocal.template view(); } // Compute @@ -192,6 +198,8 @@ void ComputeSNAGridLocalKokkos::compute_lo return; } + printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n"); + copymode = 1; zlen = nzhi-nzlo+1; @@ -212,6 +220,7 @@ void ComputeSNAGridLocalKokkos::compute_lo ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point + printf(">>> total_range: %d\n", total_range); MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user @@ -351,14 +360,10 @@ void ComputeSNAGridLocalKokkos::compute_lo } // end while - 
k_gridlocal.template modify(); - k_gridlocal.template sync(); + copymode = 0; - //k_grid.template modify(); - //k_grid.template sync(); - - k_gridall.template modify(); - k_gridall.template sync(); + k_alocal.template modify(); + k_alocal.template sync(); } /* ---------------------------------------------------------------------- @@ -830,9 +835,9 @@ void ComputeSNAGridLocalKokkos::operator() const F_FLOAT xtmp = xgrid[0]; const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; - d_gridall(igrid,0) = xtmp; - d_gridall(igrid,1) = ytmp; - d_gridall(igrid,2) = ztmp; + //d_gridall(igrid,0) = xtmp; + //d_gridall(igrid,1) = ytmp; + //d_gridall(igrid,2) = ztmp; const auto idxb_max = snaKK.idxb_max; @@ -841,7 +846,7 @@ void ComputeSNAGridLocalKokkos::operator() for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); + //d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); } } diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 6065f38171..48a0e777e0 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -61,21 +61,21 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeGridLocal::~ComputeGridLocal() { - //printf(">>> ComputeGridLocal begin destruct\n"); + printf(">>> ComputeGridLocal begin destruct\n"); deallocate(); - //printf(">>> ComputeGridLocal end destruct\n"); + printf(">>> ComputeGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ void ComputeGridLocal::setup() { - //printf(">>> ComputeGridLocal setup\n"); deallocate(); set_grid_global(); set_grid_local(); allocate(); assign_coords(); + printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi); } /* ---------------------------------------------------------------------- 
diff --git a/src/ML-SNAP/compute_sna_grid_local.cpp b/src/ML-SNAP/compute_sna_grid_local.cpp index 3981970506..db49063920 100644 --- a/src/ML-SNAP/compute_sna_grid_local.cpp +++ b/src/ML-SNAP/compute_sna_grid_local.cpp @@ -182,6 +182,7 @@ ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocal::~ComputeSNAGridLocal() { + if (copymode) return; memory->destroy(radelem); memory->destroy(wjelem); memory->destroy(cutsq); From cb915cdce7a2f7e1776310493389137fb10c2027 Mon Sep 17 00:00:00 2001 From: rohskopf Date: Mon, 10 Jul 2023 10:22:52 -0600 Subject: [PATCH 32/51] Fill local sna/grid array --- .../compute_sna_grid_local_kokkos_impl.h | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index e8555a2101..ee7cd464cd 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -456,6 +456,19 @@ void ComputeSNAGridLocalKokkos::operator() const F_FLOAT ytmp = xgrid[1]; const F_FLOAT ztmp = xgrid[2]; + // Zeroing out the components, which are filled as a sum. 
+ for (int icol = size_local_cols_base; icol < size_local_cols; icol++){ + d_alocal(igrid, icol) = 0.0; + } + + // Fill grid info columns + d_alocal(igrid, 0) = ix; + d_alocal(igrid, 1) = iy; + d_alocal(igrid, 2) = iz; + d_alocal(igrid, 3) = xtmp; + d_alocal(igrid, 4) = ytmp; + d_alocal(igrid, 5) = ztmp; + // currently, all grid points are type 1 // not clear what a better choice would be @@ -832,9 +845,9 @@ void ComputeSNAGridLocalKokkos::operator() xgrid[2] = h2*xgrid[2] + lo2; } - const F_FLOAT xtmp = xgrid[0]; - const F_FLOAT ytmp = xgrid[1]; - const F_FLOAT ztmp = xgrid[2]; + //const F_FLOAT xtmp = xgrid[0]; + //const F_FLOAT ytmp = xgrid[1]; + //const F_FLOAT ztmp = xgrid[2]; //d_gridall(igrid,0) = xtmp; //d_gridall(igrid,1) = ytmp; //d_gridall(igrid,2) = ztmp; @@ -846,7 +859,7 @@ void ComputeSNAGridLocalKokkos::operator() for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - //d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); + d_alocal(igrid,icoeff+6) = my_sna.blist(ii,idx_chem,idxb); } } From 79e05595dbe6d6d25e7b4cee8c5adee4722561cf Mon Sep 17 00:00:00 2001 From: rohskopf Date: Tue, 11 Jul 2023 13:11:50 -0600 Subject: [PATCH 33/51] Remove destructor prints --- src/KOKKOS/compute_sna_grid_local_kokkos_impl.h | 6 +++--- src/ML-SNAP/compute_grid_local.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index ee7cd464cd..73a9df39ac 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -135,7 +135,7 @@ ComputeSNAGridLocalKokkos::ComputeSNAGridL template ComputeSNAGridLocalKokkos::~ComputeSNAGridLocalKokkos() { - printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); + //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); if (copymode) 
return; //printf(">>> After copymode\n"); @@ -198,7 +198,7 @@ void ComputeSNAGridLocalKokkos::compute_lo return; } - printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n"); + //printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n"); copymode = 1; @@ -220,7 +220,7 @@ void ComputeSNAGridLocalKokkos::compute_lo ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point - printf(">>> total_range: %d\n", total_range); + //printf(">>> total_range: %d\n", total_range); MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 48a0e777e0..92bb556c50 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -61,9 +61,9 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeGridLocal::~ComputeGridLocal() { - printf(">>> ComputeGridLocal begin destruct\n"); + //printf(">>> ComputeGridLocal begin destruct\n"); deallocate(); - printf(">>> ComputeGridLocal end destruct\n"); + //printf(">>> ComputeGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ @@ -75,7 +75,7 @@ void ComputeGridLocal::setup() set_grid_local(); allocate(); assign_coords(); - printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi); + //printf(">>> ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi); } /* ---------------------------------------------------------------------- From a5b262aefad8680886a55667b95c23440a71bfc6 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Fri, 8 Sep 2023 00:20:45 +0200 Subject: [PATCH 34/51] Hotfixing a small bug in the Kokkos Gaussian Compute Co-authored-by: Drew Rohskopf < --- src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index a52d747922..6913fd284b 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -67,7 +67,7 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP //printf(">>> 1\n"); // Set up element lists int n = atom->ntypes; - MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",nelements); + MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n); MemKK::realloc_kokkos(d_sigmaelem,"ComputeSNAGridKokkos::sigmaelem",n+1); MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1); MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1); From 2185ffa4280072a3325b9795c92fe89632501f38 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 5 Dec 2024 16:43:18 +0100 Subject: [PATCH 35/51] Renamed files to be more consistent with other examples --- examples/snap/{in.grid.gaussian => in.gaussian.grid} | 0 examples/snap/{in.grid.snap => in.snap.grid} | 0 examples/snap/{in.grid.tri => in.snap.grid.triclinic} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename examples/snap/{in.grid.gaussian => in.gaussian.grid} (100%) rename examples/snap/{in.grid.snap => in.snap.grid} (100%) rename examples/snap/{in.grid.tri => in.snap.grid.triclinic} (100%) diff --git a/examples/snap/in.grid.gaussian b/examples/snap/in.gaussian.grid similarity index 100% rename from examples/snap/in.grid.gaussian rename to examples/snap/in.gaussian.grid diff --git a/examples/snap/in.grid.snap b/examples/snap/in.snap.grid similarity index 100% rename from examples/snap/in.grid.snap rename to examples/snap/in.snap.grid diff --git a/examples/snap/in.grid.tri b/examples/snap/in.snap.grid.triclinic similarity index 100% rename from examples/snap/in.grid.tri rename to examples/snap/in.snap.grid.triclinic From 
30d39c8fb311565613c976583449d12674b19b11 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Fri, 6 Dec 2024 09:37:09 +0100 Subject: [PATCH 36/51] Fixed formatting issues --- .../compute_gaussian_grid_local_kokkos.cpp | 17 ++++------ .../compute_gaussian_grid_local_kokkos.h | 4 +-- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 34 +++++++++---------- src/KOKKOS/compute_sna_grid_local_kokkos.h | 4 +-- .../compute_sna_grid_local_kokkos_impl.h | 34 +++++++++---------- src/ML-SNAP/compute_gaussian_grid_local.cpp | 8 ++--- src/ML-SNAP/compute_gaussian_grid_local.h | 2 +- 7 files changed, 48 insertions(+), 55 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 6913fd284b..99380e0d63 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -183,16 +183,15 @@ void ComputeGaussianGridLocalKokkos::compute_local() // max_neighs is defined here - think of more elaborate methods. max_neighs = 100; - // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total // number of atoms. - ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); - // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. 
- //printf(">>> total_range: %d\n", total_range); + // printf(">>> total_range: %d\n", total_range); chunksize = 32768; // 100*32768 chunk_size = MIN(chunksize, total_range); chunk_offset = 0; @@ -212,8 +211,8 @@ void ComputeGaussianGridLocalKokkos::compute_local() h1 = domain->h[1]; h2 = domain->h[2]; h3 = domain->h[3]; - h4 = domain->h[4]; - h5 = domain->h[5]; + h4 = domain->h[4]; + h5 = domain->h[5]; lo0 = domain->boxlo[0]; lo1 = domain->boxlo[1]; lo2 = domain->boxlo[2]; @@ -332,7 +331,6 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG // currently, all grid points are type 1 // not clear what a better choice would be - const int itype = 1; int ielem = 0; ielem = d_map[itype]; @@ -340,10 +338,8 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG // Compute the number of neighbors, store rsq int ninside = 0; - // Looping over ntotal for now. - for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; const F_FLOAT dy = x(j,1) - ytmp; @@ -359,7 +355,6 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG } //printf("%f\n", d_alocal(igrid, 6)); - } /* ---------------------------------------------------------------------- @@ -382,4 +377,4 @@ template class ComputeGaussianGridLocalKokkos; #ifdef LMP_KOKKOS_GPU template class ComputeGaussianGridLocalKokkos; #endif -} \ No newline at end of file +} diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h index db3e87a7e9..deb5eaa8cb 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -93,8 +93,8 @@ template class ComputeGaussianGridLocalKokkos : public Comput int host_flag; int total_range; // total number of loop iterations in grid int xlen, ylen, zlen; - int chunksize; - int ntotal; + int chunksize; + int ntotal; typename AT::t_x_array_randomread x; typename AT::t_int_1d_randomread type; diff --git 
a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 81f3173a7d..2101d5968b 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -104,7 +104,7 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos } // In pair snap some things like `map` get allocated regardless of chem flag. - if (chemflag){ + if (chemflag){ for (int i = 1; i <= atom->ntypes; i++) { h_map(i) = map[i]; } @@ -168,7 +168,7 @@ void ComputeSNAGridKokkos::setup() ComputeGrid::set_grid_global(); ComputeGrid::set_grid_local(); - + // allocate arrays memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); @@ -206,14 +206,14 @@ void ComputeSNAGridKokkos::compute_array() // max_neighs is defined here - think of more elaborate methods. max_neighs = 100; - // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total // number of atoms. ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridKokkos:ninside",total_range); - // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. 
//printf(">>> total_range: %d\n", total_range); chunk_size = MIN(chunksize, total_range); @@ -222,7 +222,7 @@ void ComputeSNAGridKokkos::compute_array() snaKK.grow_rij(chunk_size, max_neighs); //chunk_size = total_range; - + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; @@ -236,8 +236,8 @@ void ComputeSNAGridKokkos::compute_array() h1 = domain->h[1]; h2 = domain->h[2]; h3 = domain->h[3]; - h4 = domain->h[4]; - h5 = domain->h[5]; + h4 = domain->h[4]; + h5 = domain->h[5]; lo0 = domain->boxlo[0]; lo1 = domain->boxlo[1]; lo2 = domain->boxlo[2]; @@ -250,11 +250,11 @@ void ComputeSNAGridKokkos::compute_array() //printf(">>> chunk_offset: %d\n", chunk_offset); - //ComputeNeigh + //ComputeNeigh { int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); - SnapAoSoATeamPolicy + SnapAoSoATeamPolicy policy_neigh(chunk_size, team_size_compute_neigh, vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); @@ -375,9 +375,9 @@ void ComputeSNAGridKokkos::operator() (Tag // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. // Main difference is that we don't use the neighbor class or neighbor variables here. // This is because the grid points are not atoms and therefore do not get assigned - // neighbors in LAMMPS. - // TODO: If we did make a neighborlist for each grid point, we could use current - // routines and avoid having to loop over all atoms (which limits us to + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to // natoms = max team size). 
SNAKokkos my_sna = snaKK; @@ -468,7 +468,7 @@ void ComputeSNAGridKokkos::operator() (Tag // Compute the number of neighbors, store rsq int ninside = 0; - + // Looping over ntotal for now. for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; @@ -480,12 +480,12 @@ void ComputeSNAGridKokkos::operator() (Tag // don't include atoms that share location with grid point if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius - } + } if (jtype >= 0) ninside++; - } + } /* Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), @@ -500,7 +500,7 @@ void ComputeSNAGridKokkos::operator() (Tag // don't include atoms that share location with grid point if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius - } + } type_cache[j] = jtype; @@ -510,7 +510,7 @@ void ComputeSNAGridKokkos::operator() (Tag }, ninside); */ - d_ninside(ii) = ninside; + d_ninside(ii) = ninside; // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. int offset = 0; diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index d11d2e1623..9073b921c1 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -240,7 +240,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { typename AT::t_x_array_randomread x; typename AT::t_int_1d_randomread type; - + DAT::tdual_float_2d k_alocal; typename AT::t_float_2d d_alocal; @@ -313,4 +313,4 @@ class ComputeSNAGridLocalKokkosHost : public ComputeSNAGridLocalKokkos::ComputeSNAGridL } // In pair snap some things like `map` get allocated regardless of chem flag. 
- if (chemflag){ + if (chemflag){ for (int i = 1; i <= atom->ntypes; i++) { h_map(i) = map[i]; } @@ -171,7 +171,7 @@ void ComputeSNAGridLocalKokkos::setup() //ComputeGrid::set_grid_local(); //ComputeSNAGridLocal::setup(); ComputeGridLocal::setup(); - + // allocate arrays //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); @@ -215,7 +215,7 @@ void ComputeSNAGridLocalKokkos::compute_lo // max_neighs is defined here - think of more elaborate methods. max_neighs = 100; - // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total + // Pair snap/kk uses grow_ij with some max number of neighs but compute sna/grid uses total // number of atoms. ntotal = atomKK->nlocal + atomKK->nghost; @@ -223,7 +223,7 @@ void ComputeSNAGridLocalKokkos::compute_lo //printf(">>> total_range: %d\n", total_range); MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); - // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user + // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. 
//printf(">>> total_range: %d\n", total_range); chunk_size = MIN(chunksize, total_range); @@ -232,7 +232,7 @@ void ComputeSNAGridLocalKokkos::compute_lo snaKK.grow_rij(chunk_size, max_neighs); //chunk_size = total_range; - + // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; @@ -246,8 +246,8 @@ void ComputeSNAGridLocalKokkos::compute_lo h1 = domain->h[1]; h2 = domain->h[2]; h3 = domain->h[3]; - h4 = domain->h[4]; - h5 = domain->h[5]; + h4 = domain->h[4]; + h5 = domain->h[5]; lo0 = domain->boxlo[0]; lo1 = domain->boxlo[1]; lo2 = domain->boxlo[2]; @@ -260,11 +260,11 @@ void ComputeSNAGridLocalKokkos::compute_lo //printf(">>> chunk_offset: %d\n", chunk_offset); - //ComputeNeigh + //ComputeNeigh { int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); //ntotal); - SnapAoSoATeamPolicy + SnapAoSoATeamPolicy policy_neigh(chunk_size, team_size_compute_neigh, vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); @@ -381,9 +381,9 @@ void ComputeSNAGridLocalKokkos::operator() // This function follows similar procedure as ComputeNeigh of PairSNAPKokkos. // Main difference is that we don't use the neighbor class or neighbor variables here. // This is because the grid points are not atoms and therefore do not get assigned - // neighbors in LAMMPS. - // TODO: If we did make a neighborlist for each grid point, we could use current - // routines and avoid having to loop over all atoms (which limits us to + // neighbors in LAMMPS. + // TODO: If we did make a neighborlist for each grid point, we could use current + // routines and avoid having to loop over all atoms (which limits us to // natoms = max team size). 
SNAKokkos my_sna = snaKK; @@ -487,7 +487,7 @@ void ComputeSNAGridLocalKokkos::operator() // Compute the number of neighbors, store rsq int ninside = 0; - + // Looping over ntotal for now. for (int j = 0; j < ntotal; j++){ const F_FLOAT dx = x(j,0) - xtmp; @@ -499,12 +499,12 @@ void ComputeSNAGridLocalKokkos::operator() // don't include atoms that share location with grid point if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius - } + } if (jtype >= 0) ninside++; - } + } /* Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), @@ -519,7 +519,7 @@ void ComputeSNAGridLocalKokkos::operator() // don't include atoms that share location with grid point if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { jtype = -1; // use -1 to signal it's outside the radius - } + } type_cache[j] = jtype; @@ -529,7 +529,7 @@ void ComputeSNAGridLocalKokkos::operator() }, ninside); */ - d_ninside(ii) = ninside; + d_ninside(ii) = ninside; // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. int offset = 0; diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp index c18aa31d05..81286f9d81 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.cpp +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -1,7 +1,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -61,9 +61,8 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char * for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[4 + i], false, lmp); for (int i = 0; i < ntypes; i++) sigmaelem[i + 1] = utils::numeric(FLERR, arg[ntypes + 4 + i], false, lmp); - - // construct cutsq + // construct cutsq double cut; cutmax = 0.0; memory->create(cutsq, ntypes + 1, ntypes + 1, "gaussian/atom:cutsq"); @@ -80,7 +79,6 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char * size_local_cols = size_local_cols_base + ntypes; // pre-compute coefficients - for (int i = 0; i < ntypes; i++) { prefacelem[i + 1] = 1.0/powint(sigmaelem[i + 1] * sqrt(MY_2PI), 3); argfacelem[i + 1] = 1.0/(2.0 * sigmaelem[i + 1] * sigmaelem[i + 1]); @@ -155,7 +153,7 @@ void ComputeGaussianGridLocal::compute_local() alocal[igrid][icol] += prefacelem[jtype] * exp(-rsq * argfacelem[jtype]); } } - igrid++; + igrid++; } } diff --git a/src/ML-SNAP/compute_gaussian_grid_local.h b/src/ML-SNAP/compute_gaussian_grid_local.h index 72e7326b49..77f88a7a8e 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.h +++ b/src/ML-SNAP/compute_gaussian_grid_local.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains From 80acfeebe7e7c21c113f24dbbb4d6ba9e34971b5 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 9 Dec 2024 14:04:47 +0100 Subject: [PATCH 37/51] Added documentation --- doc/src/compute_gaussian_grid_local.rst | 99 +++++++++++++++++++++++++ doc/src/compute_sna_atom.rst | 9 ++- 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 doc/src/compute_gaussian_grid_local.rst diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst new file mode 100644 index 0000000000..24f41586d1 --- /dev/null +++ b/doc/src/compute_gaussian_grid_local.rst @@ -0,0 +1,99 @@ +.. index:: compute sna/atom +.. index:: compute snad/atom +.. index:: compute snav/atom +.. index:: compute snap +.. index:: compute sna/grid +.. index:: compute sna/grid/local + + +compute gaussian/grid/local command +=================================== + +compute gaussian/grid/local/kk command +====================================== + +Syntax +"""""" + +.. code-block:: LAMMPS + + compute ID group-ID gaussian/grid nx ny nz rcutfac R_1 R_2 ... R_1 R_2 ... sigma_1 sigma_2 + compute ID group-ID gaussian/grid/local nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 + +* ID, group-ID are documented in :doc:`compute ` command +* sna/atom = style name of this compute command +* rcutfac = scale factor applied to all cutoff radii (positive real) +* sigma_1, sigma_2,... = Gaussian broadening, one for each type (positive real) +* R_1, R_2,... = list of cutoff radii, one for each type (distance units) +* nx, ny, nz = number of grid points in x, y, and z directions (positive integer) + +Examples +"""""""" + +.. code-block:: LAMMPS + + compute ggrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4 + +Description +""""""""""" + +Define a computation that calculates a Gaussian representation of the ionic +structure. 
This representation is used for the efficient evaluation +of quantities related to the structure factor in a grid-based workflow, +such as the ML-DFT workflow MALA :ref:`(Ellis)) `, for which it was originally +implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) `. + +For each atomic species, a separate sum of Gaussians is calculated, using +a separate Gaussian broadening per species. The computation +is always performed on the numerical grid, no atom-based version of this +compute exists. The Gaussian representation can only be executed in a local +fashion, thus the output array only contains rows for grid points +that are local to the processor subdomain. The layout of the grid is the same +as for the see :doc:`sna/grid/local ` command. + +Namely, the array contains one row for each of the +local grid points, looping over the global index *ix* fastest, +then *iy*, and *iz* slowest. Each row of the array contains +the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*, +and *z* coordinates of the grid point, followed by the values of the Gaussians +(one floating point number per species per grid point). + +Computation of these Gaussians can be accelerated via Kokkos through the +*gaussian/grid/local/kk* command. + +---------- + +Output info +""""""""""" + +Compute *gaussian/grid/local* evaluates a local array. +The array contains one row for each of the +local grid points, looping over the global index *ix* fastest, +then *iy*, and *iz* slowest. Each row of the array contains +the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*, +and *z* coordinates of the grid point, followed by the values of the Gaussians +(one floating point number per species per grid point). + +Restrictions +"""""""""""" + +These computes are part of the ML-SNAP package. They are only enabled +if LAMMPS was built with that package. See the :doc:`Build package +` page for more info. 
+ +Related commands +"""""""""""""""" + +:doc:`compute sna/grid/local ` + +---------- + +.. _Ellis2021: + +**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, Phys Rev B, 104, 035120, (2021) + +.. _Fiedler2023: + +**(Fiedler)** Fiedler, Modine, Schmerler, Vogel, Popoola, Thompson, Rajamanickam, and Cangi, +`npj Comp. Mater., 9, 115 (2023) `_ + diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst index 179c362dc6..776ccca5a6 100644 --- a/doc/src/compute_sna_atom.rst +++ b/doc/src/compute_sna_atom.rst @@ -20,9 +20,15 @@ compute snap command compute sna/grid command ======================== +compute sna/grid/kk command +=========================== + compute sna/grid/local command ============================== +compute sna/grid/local/kk command +================================= + Syntax """""" @@ -252,7 +258,8 @@ for finite-temperature Kohn-Sham density functional theory (:ref:`Ellis et al. `) Neighbor atoms not in the group do not contribute to the bispectrum components of the grid points. The distance cutoff :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom -*i'*. +*i'*. Both computes can be hardware accelerated with Kokkos by using the +*sna/grid/kk* and *sna/grid/local/kk* coammnds, respectively. Compute *sna/grid* calculates a global array containing bispectrum components for a regular grid of points. From f93dd3273d0f6f96b4e537cbf02a5b6dcba8f757 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Mon, 9 Dec 2024 14:08:22 +0100 Subject: [PATCH 38/51] Added link to PRB paper --- doc/src/compute_gaussian_grid_local.rst | 2 +- doc/src/compute_sna_atom.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst index 24f41586d1..226402bc22 100644 --- a/doc/src/compute_gaussian_grid_local.rst +++ b/doc/src/compute_gaussian_grid_local.rst @@ -90,7 +90,7 @@ Related commands .. 
_Ellis2021: -**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, Phys Rev B, 104, 035120, (2021) +**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) `_ .. _Fiedler2023: diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst index 776ccca5a6..28611ae3a7 100644 --- a/doc/src/compute_sna_atom.rst +++ b/doc/src/compute_sna_atom.rst @@ -661,7 +661,7 @@ of Angular Momentum, World Scientific, Singapore (1987). .. _Ellis2021: -**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, Phys Rev B, 104, 035120, (2021) +**(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev. B, 104, 035120, (2021) `_ .. _Lafourcade2023_2: From f59f084c37d05461fcc85c2c73f9d2c0b128e7c4 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 10 Dec 2024 10:39:57 +0100 Subject: [PATCH 39/51] Added logs for examples --- examples/snap/log.10Dec24.gaussian.grid.g++.1 | 57 +++++++++++++++++++ examples/snap/log.10Dec24.gaussian.grid.g++.4 | 57 +++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 examples/snap/log.10Dec24.gaussian.grid.g++.1 create mode 100644 examples/snap/log.10Dec24.gaussian.grid.g++.4 diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1 new file mode 100644 index 0000000000..230008ec97 --- /dev/null +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1 @@ -0,0 +1,57 @@ +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. 
(src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +Lattice spacing in x,y,z = 3.316 3.316 3.316 +Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) + 1 by 1 by 1 MPI processor grid +Created 2 atoms + using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) + create_atoms CPU = 0.002 seconds +1 atoms in group snapgroup +WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.67637 + ghost atom cutoff = 6.67637 + binsize = 3.338185, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair zero, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Setting up Verlet run ... + Unit style : metal + Current step : 0 + Time step : 0.001 +Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0 0 0 0 0 +Loop time of 6.83e-07 on 1 procs for 0 steps with 2 atoms + +146.4% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 6.83e-07 | | |100.00 + +Nlocal: 2 ave 2 max 2 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 339 ave 339 max 339 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 64 ave 64 max 64 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 64 +Ave neighs/atom = 32 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 
b/examples/snap/log.10Dec24.gaussian.grid.g++.4 new file mode 100644 index 0000000000..f46db86fc7 --- /dev/null +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4 @@ -0,0 +1,57 @@ +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +Lattice spacing in x,y,z = 3.316 3.316 3.316 +Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) + 1 by 1 by 1 MPI processor grid +Created 2 atoms + using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) + create_atoms CPU = 0.004 seconds +1 atoms in group snapgroup +WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) +Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 6.67637 + ghost atom cutoff = 6.67637 + binsize = 3.338185, bins = 1 1 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair zero, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Setting up Verlet run ... 
+ Unit style : metal + Current step : 0 + Time step : 0.001 +Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0 0 0 0 0 +Loop time of 6.18e-07 on 1 procs for 0 steps with 2 atoms + +161.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0 | 0 | 0 | 0.0 | 0.00 +Output | 0 | 0 | 0 | 0.0 | 0.00 +Modify | 0 | 0 | 0 | 0.0 | 0.00 +Other | | 6.18e-07 | | |100.00 + +Nlocal: 2 ave 2 max 2 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 339 ave 339 max 339 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 64 ave 64 max 64 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 64 +Ave neighs/atom = 32 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:00 From 16e0a7788acdac870038a527f0607dc7d7e8e112 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Fri, 13 Dec 2024 09:51:07 +0100 Subject: [PATCH 40/51] Now actually added the correct log --- examples/snap/log.10Dec24.gaussian.grid.g++.4 | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4 index f46db86fc7..fab0236dd6 100644 --- a/examples/snap/log.10Dec24.gaussian.grid.g++.4 +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.4 @@ -3,10 +3,10 @@ OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. 
(src/comm.cpp:99 using 1 OpenMP thread(s) per MPI task Lattice spacing in x,y,z = 3.316 3.316 3.316 Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) - 1 by 1 by 1 MPI processor grid + 1 by 2 by 2 MPI processor grid Created 2 atoms using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) - create_atoms CPU = 0.004 seconds + create_atoms CPU = 0.003 seconds 1 atoms in group snapgroup WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule @@ -22,16 +22,17 @@ Neighbor list info ... pair build: half/bin/atomonly/newton stencil: half/bin/3d bin: standard +WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202) Setting up Verlet run ... Unit style : metal Current step : 0 Time step : 0.001 -Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes - Step Temp E_pair E_mol TotEng Press - 0 0 0 0 0 0 -Loop time of 6.18e-07 on 1 procs for 0 steps with 2 atoms +Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 0 0 0 0 0 +Loop time of 6.4355e-06 on 4 procs for 0 steps with 2 atoms -161.8% CPU use with 1 MPI tasks x 1 OpenMP threads +15.5% CPU use with 4 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total @@ -41,14 +42,14 @@ Neigh | 0 | 0 | 0 | 0.0 | 0.00 Comm | 0 | 0 | 0 | 0.0 | 0.00 Output | 0 | 0 | 0 | 0.0 | 0.00 Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 6.18e-07 | | |100.00 +Other | | 6.435e-06 | | |100.00 -Nlocal: 2 ave 2 max 2 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Nghost: 339 ave 339 max 339 min -Histogram: 1 0 0 0 0 0 0 0 0 0 -Neighs: 64 ave 64 max 64 min -Histogram: 1 0 0 0 0 0 0 0 0 0 +Nlocal: 0.5 ave 1 max 0 min +Histogram: 2 0 0 0 0 0 0 0 0 2 +Nghost: 274.5 ave 275 max 274 min +Histogram: 2 0 0 0 0 0 0 0 0 2 +Neighs: 16 ave 40 max 0 min +Histogram: 2 0 0 0 0 0 1 0 0 1 Total # 
of neighbors = 64 Ave neighs/atom = 32 From bff2e64bbc60833b3a1af7b91763683f6c12151b Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 19 Dec 2024 10:06:53 +0100 Subject: [PATCH 41/51] Docs and example updates by Aidan Thompson I did NOT author this commit, I am only pushing it because for some reason, Github does not permit Aidan to do so Co-authored-by: Aidan Thompson --- doc/src/Commands_compute.rst | 5 +- doc/src/compute.rst | 1 + doc/src/compute_gaussian_grid_local.rst | 56 ++++++----- doc/src/compute_sna_atom.rst | 31 +++--- doc/utils/sphinx-config/false_positives.txt | 2 + examples/snap/README.md | 6 ++ examples/snap/in.gaussian.grid | 10 +- examples/snap/in.snap.grid.triclinic | 1 - examples/snap/log.10Dec24.gaussian.grid.g++.1 | 96 ++++++++++++++++--- examples/snap/log.10Dec24.gaussian.grid.g++.4 | 96 ++++++++++++++++--- 10 files changed, 232 insertions(+), 72 deletions(-) diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst index fd68ce3e39..7c73583a4f 100644 --- a/doc/src/Commands_compute.rst +++ b/doc/src/Commands_compute.rst @@ -58,6 +58,7 @@ KOKKOS, o = OPENMP, t = OPT. * :doc:`fep/ta ` * :doc:`force/tally ` * :doc:`fragment/atom ` + * :doc:`gaussian/grid/local (k) ` * :doc:`global/atom ` * :doc:`group/group ` * :doc:`gyration ` @@ -140,8 +141,8 @@ KOKKOS, o = OPENMP, t = OPT. 
* :doc:`smd/vol ` * :doc:`snap ` * :doc:`sna/atom ` - * :doc:`sna/grid ` - * :doc:`sna/grid/local ` + * :doc:`sna/grid (k) ` + * :doc:`sna/grid/local (k) ` * :doc:`snad/atom ` * :doc:`snav/atom ` * :doc:`sph/e/atom ` diff --git a/doc/src/compute.rst b/doc/src/compute.rst index 082f93a6c4..9a8a1734fb 100644 --- a/doc/src/compute.rst +++ b/doc/src/compute.rst @@ -236,6 +236,7 @@ The individual style names on the :doc:`Commands compute ` pag * :doc:`fep/ta ` - compute free energies for a test area perturbation * :doc:`force/tally ` - force between two groups of atoms via the tally callback mechanism * :doc:`fragment/atom ` - fragment ID for each atom +* :doc:`gaussian/grid/local ` - local array of Gaussian atomic contributions on a regular grid * :doc:`global/atom ` - assign global values to each atom from arrays of global values * :doc:`group/group ` - energy/force between two groups of atoms * :doc:`gyration ` - radius of gyration of group of atoms diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst index 226402bc22..45ef6642c9 100644 --- a/doc/src/compute_gaussian_grid_local.rst +++ b/doc/src/compute_gaussian_grid_local.rst @@ -1,38 +1,31 @@ -.. index:: compute sna/atom -.. index:: compute snad/atom -.. index:: compute snav/atom -.. index:: compute snap -.. index:: compute sna/grid -.. index:: compute sna/grid/local - +.. index:: compute gaussian/grid/local +.. index:: compute gaussian/grid/local/kk compute gaussian/grid/local command =================================== -compute gaussian/grid/local/kk command -====================================== +Accelerator Variants: *gaussian/grid/local/kk* Syntax """""" .. code-block:: LAMMPS - compute ID group-ID gaussian/grid nx ny nz rcutfac R_1 R_2 ... R_1 R_2 ... sigma_1 sigma_2 - compute ID group-ID gaussian/grid/local nx ny nz rcutfac R_1 R_2 ... sigma_1 sigma_2 + compute ID group-ID gaussian/grid/local grid nx ny nz rcutfac R_1 R_2 ... 
sigma_1 sigma_2 * ID, group-ID are documented in :doc:`compute ` command -* sna/atom = style name of this compute command -* rcutfac = scale factor applied to all cutoff radii (positive real) -* sigma_1, sigma_2,... = Gaussian broadening, one for each type (positive real) -* R_1, R_2,... = list of cutoff radii, one for each type (distance units) -* nx, ny, nz = number of grid points in x, y, and z directions (positive integer) +* gaussian/grid/local = style name of this compute command +* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer) +* *rcutfac* = scale factor applied to all cutoff radii (positive real) +* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units) +* *sigma_1, sigma_2,...* = Gaussian widths, one for each type (distance units) Examples """""""" .. code-block:: LAMMPS - compute ggrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4 + compute mygrid all gaussian/grid/local grid 40 40 40 4.0 0.5 0.5 0.4 0.4 Description """"""""""" @@ -40,14 +33,14 @@ Description Define a computation that calculates a Gaussian representation of the ionic structure. This representation is used for the efficient evaluation of quantities related to the structure factor in a grid-based workflow, -such as the ML-DFT workflow MALA :ref:`(Ellis)) `, for which it was originally +such as the ML-DFT workflow MALA :ref:`(Ellis) `, for which it was originally implemented. Usage of the workflow is described in a separate publication :ref:`(Fiedler) `. -For each atomic species, a separate sum of Gaussians is calculated, using -a separate Gaussian broadening per species. The computation +For each LAMMPS type, a separate sum of Gaussians is calculated, using +a separate Gaussian broadening per type. The computation is always performed on the numerical grid, no atom-based version of this compute exists. 
The Gaussian representation can only be executed in a local -fashion, thus the output array only contains rows for grid points +fashion, thus the output array only contains rows for grid points that are local to the processor subdomain. The layout of the grid is the same as for the see :doc:`sna/grid/local ` command. @@ -56,10 +49,14 @@ local grid points, looping over the global index *ix* fastest, then *iy*, and *iz* slowest. Each row of the array contains the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*, and *z* coordinates of the grid point, followed by the values of the Gaussians -(one floating point number per species per grid point). +(one floating point number per type per grid point). -Computation of these Gaussians can be accelerated via Kokkos through the -*gaussian/grid/local/kk* command. +---------- + + +.. include:: accel_styles.rst + + ---------- @@ -69,10 +66,11 @@ Output info Compute *gaussian/grid/local* evaluates a local array. The array contains one row for each of the local grid points, looping over the global index *ix* fastest, -then *iy*, and *iz* slowest. Each row of the array contains -the global indexes *ix*, *iy*, and *iz* first, followed by the *x*, *y*, -and *z* coordinates of the grid point, followed by the values of the Gaussians -(one floating point number per species per grid point). +then *iy*, and *iz* slowest. The array contains :math:`ntypes+6` columns, +where *ntypes* is the number of LAMMPS types. The first three columns are +the global indexes *ix*, *iy*, and *iz*, followed by the *x*, *y*, +and *z* coordinates of the grid point, followed by the *ntypes* columns +containing the values of the Gaussians for each type. Restrictions """""""""""" @@ -88,7 +86,7 @@ Related commands ---------- -.. _Ellis2021: +.. _Ellis2021b: **(Ellis)** Ellis, Fiedler, Popoola, Modine, Stephens, Thompson, Cangi, Rajamanickam, `Phys. Rev.
B, 104, 035120, (2021) `_ diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst index 28611ae3a7..2079234ddf 100644 --- a/doc/src/compute_sna_atom.rst +++ b/doc/src/compute_sna_atom.rst @@ -3,7 +3,9 @@ .. index:: compute snav/atom .. index:: compute snap .. index:: compute sna/grid +.. index:: compute sna/grid/kk .. index:: compute sna/grid/local +.. index:: compute sna/grid/local/kk compute sna/atom command ======================== @@ -26,8 +28,7 @@ compute sna/grid/kk command compute sna/grid/local command ============================== -compute sna/grid/local/kk command -================================= +Accelerator Variants: *sna/grid/local/kk* Syntax """""" @@ -39,17 +40,17 @@ Syntax compute ID group-ID snav/atom rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... compute ID group-ID snap rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... - compute ID group-ID sna/grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... - compute ID group-ID sna/grid/local nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... + compute ID group-ID sna/grid grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... + compute ID group-ID sna/grid/local grid nx ny nz rcutfac rfac0 twojmax R_1 R_2 ... w_1 w_2 ... keyword values ... * ID, group-ID are documented in :doc:`compute ` command * sna/atom = style name of this compute command -* rcutfac = scale factor applied to all cutoff radii (positive real) -* rfac0 = parameter in distance to angle conversion (0 < rcutfac < 1) -* twojmax = band limit for bispectrum components (non-negative integer) -* R_1, R_2,... = list of cutoff radii, one for each type (distance units) -* w_1, w_2,... 
= list of neighbor weights, one for each type -* nx, ny, nz = number of grid points in x, y, and z directions (positive integer) +* *rcutfac* = scale factor applied to all cutoff radii (positive real) +* *rfac0* = parameter in distance to angle conversion (0 < rfac0 < 1) +* *twojmax* = band limit for bispectrum components (non-negative integer) +* *R_1, R_2,...* = list of cutoff radii, one for each type (distance units) +* *w_1, w_2,...* = list of neighbor weights, one for each type +* *grid* values = nx, ny, nz, number of grid points in x, y, and z directions (positive integer) * zero or more keyword/value pairs may be appended * keyword = *rmin0* or *switchflag* or *bzeroflag* or *quadraticflag* or *chem* or *bnormflag* or *wselfallflag* or *bikflag* or *switchinnerflag* or *sinner* or *dinner* or *dgradflag* or *nnn* or *wmode* or *delta* @@ -109,7 +110,7 @@ Examples compute snap all snap 1.4 0.95 6 2.0 1.0 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 chem 2 0 1 compute snap all snap 1.0 0.99363 6 3.81 3.83 1.0 0.93 switchinnerflag 1 sinner 1.35 1.6 dinner 0.25 0.3 - compute bgrid all sna/grid/local 200 200 200 1.4 0.95 6 2.0 1.0 + compute bgrid all sna/grid/local grid 200 200 200 1.4 0.95 6 2.0 1.0 compute bnnn all sna/atom 9.0 0.99363 8 0.5 1.0 rmin0 0.0 nnn 24 wmode 1 delta 0.2 Description @@ -259,7 +260,7 @@ et al. `) Neighbor atoms not in the group do not contribute to the bispectrum components of the grid points. The distance cutoff :math:`R_{ii'}` assumes that *i* has the same type as the neighbor atom *i'*. Both computes can be hardware accelerated with Kokkos by using the -*sna/grid/kk* and *sna/grid/local/kk* coammnds, respectively. +*sna/grid/kk* and *sna/grid/local/kk* commands, respectively. Compute *sna/grid* calculates a global array containing bispectrum components for a regular grid of points. @@ -470,6 +471,12 @@ fluctuations in the resulting local atomic environment fingerprint.
The detailed formalism is given in the paper by Lafourcade et al. :ref:`(Lafourcade) `. +---------- + + +.. include:: accel_styles.rst + + ---------- Output info diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 8e601d6c16..34e56539fc 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -3360,6 +3360,7 @@ Schilfgarde Schimansky Schiotz Schlitter +Schmerler Schmid Schnieders Schoen @@ -4021,6 +4022,7 @@ VMDARCH VMDHOME vn Voigt +Vogel volfactor Volkov Volpe diff --git a/examples/snap/README.md b/examples/snap/README.md index 305f920ae8..1df24acf1f 100644 --- a/examples/snap/README.md +++ b/examples/snap/README.md @@ -9,5 +9,11 @@ in.snap.Mo_Chen # SNAP linear Mo potential in.snap.compute # SNAP compute for training a linear model in.snap.compute.quadratic # SNAP compute for training a quadratic model in.snap.scale.Ni_Zuo_JCPA2020 # SNAP linear Ni potential with thermodynamic integration (fix adapt scale) +in.C_SNAP # SNAP carbon potential compute_snap_dgrad.py # SNAP compute with dgradflag (dBi/dRj) for training a non-linear model + +in.snap.grid # SNAP descriptors on a grid +in.snap.grid.triclinic # SNAP descriptors on a grid, triclinic +in.gaussian.grid # Gaussian descriptors on a grid + diff --git a/examples/snap/in.gaussian.grid b/examples/snap/in.gaussian.grid index 9caa61e455..48aeec1632 100644 --- a/examples/snap/in.gaussian.grid +++ b/examples/snap/in.gaussian.grid @@ -4,6 +4,7 @@ # sitting on an atom of type 1 or 2: # val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 # val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file # variable nrep index 1 @@ -36,7 +37,6 @@ mass * 180.88 # define atom compute and grid compute -group snapgroup type 1 variable rcutfac equal 4.67637 variable radelem1 equal 0.5 variable radelem2 equal 0.5 @@ -57,10 +57,12 @@ compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} 
${ngrid} & # define output -dump 1 all local 1000 dump.glocal c_mygridlocal[*] -dump 2 all custom 1000 dump.gatom id x y z +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 # run run 0 - diff --git a/examples/snap/in.snap.grid.triclinic b/examples/snap/in.snap.grid.triclinic index 95a14f3bb4..59063f576e 100644 --- a/examples/snap/in.snap.grid.triclinic +++ b/examples/snap/in.snap.grid.triclinic @@ -47,7 +47,6 @@ lattice custom $a & basis 0.0 0.0 0.5 & spacing 1 1 1 -box tilt large region box prism 0 ${nx} 0 ${ny} 0 ${nz} ${ny} ${nz} ${nz} create_box 1 box create_atoms 1 box diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.1 b/examples/snap/log.10Dec24.gaussian.grid.g++.1 index 230008ec97..b158ac07d0 100644 --- a/examples/snap/log.10Dec24.gaussian.grid.g++.1 +++ b/examples/snap/log.10Dec24.gaussian.grid.g++.1 @@ -1,13 +1,89 @@ -LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d) +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a) OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) using 1 OpenMP thread(s) per MPI task +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. 
+# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable nx equal 1 +variable ny equal ${nrep} +variable ny equal 1 +variable nz equal ${nrep} +variable nz equal 1 + +boundary p p p + +lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 Lattice spacing in x,y,z = 3.316 3.316 3.316 +region box block 0 ${nx} 0 ${ny} 0 ${nz} +region box block 0 1 0 ${ny} 0 ${nz} +region box block 0 1 0 1 0 ${nz} +region box block 0 1 0 1 0 1 +create_box 2 box Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) 1 by 1 by 1 MPI processor grid +create_atoms 1 box basis 1 1 basis 2 2 Created 2 atoms using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) - create_atoms CPU = 0.002 seconds -1 atoms in group snapgroup + create_atoms CPU = 0.001 seconds + +mass * 180.88 + +# define atom compute and grid compute + +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2}" +4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 0.1355 ${sigmaelem2} +4.67637 0.5 0.5 0.1355 0.2 + +# build zero potential to force ghost atom creation + +pair_style zero ${rcutfac} +pair_style zero 4.67637 +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local 
grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2 + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 + +# run + +run 0 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule Neighbor list info ... @@ -22,16 +98,12 @@ Neighbor list info ... pair build: half/bin/atomonly/newton stencil: half/bin/3d bin: standard -Setting up Verlet run ... - Unit style : metal - Current step : 0 - Time step : 0.001 Per MPI rank memory allocation (min/avg/max) = 3.492 | 3.492 | 3.492 Mbytes - Step Temp E_pair E_mol TotEng Press - 0 0 0 0 0 0 -Loop time of 6.83e-07 on 1 procs for 0 steps with 2 atoms + Step c_val1 c_val2 + 0 25.521859 7.9367045 +Loop time of 1.088e-06 on 1 procs for 0 steps with 2 atoms -146.4% CPU use with 1 MPI tasks x 1 OpenMP threads +183.8% CPU use with 1 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total @@ -41,7 +113,7 @@ Neigh | 0 | 0 | 0 | 0.0 | 0.00 Comm | 0 | 0 | 0 | 0.0 | 0.00 Output | 0 | 0 | 0 | 0.0 | 0.00 Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 6.83e-07 | | |100.00 +Other | | 1.088e-06 | | |100.00 Nlocal: 2 ave 2 max 2 min Histogram: 1 0 0 0 0 0 0 0 0 0 diff --git a/examples/snap/log.10Dec24.gaussian.grid.g++.4 b/examples/snap/log.10Dec24.gaussian.grid.g++.4 index fab0236dd6..54cc842bc7 100644 --- a/examples/snap/log.10Dec24.gaussian.grid.g++.4 +++ 
b/examples/snap/log.10Dec24.gaussian.grid.g++.4 @@ -1,13 +1,89 @@ -LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-57-gf93dd3273d) +LAMMPS (19 Nov 2024 - Development - patch_19Nov2024-59-g16e0a7788a) OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) using 1 OpenMP thread(s) per MPI task +# Demonstrate calculation of Gaussian descriptors on a grid +# for a cell with two atoms of type 1 and type 2. +# The output in dump.glocal shows that for grid points +# sitting on an atom of type 1 or 2: +# val1 = 1.0/(0.1355*sqrt(2.0*pi))**3 = 25.5219 +# val2 = 1.0/(0.2 *sqrt(2.0*pi))**3 = 7.93670 +# These values are extracted to the log file +# + +variable nrep index 1 +variable a index 3.316 +variable ngrid index 2 + +units metal +atom_modify map hash + +# generate the box and atom positions using a BCC lattice + +variable nx equal ${nrep} +variable nx equal 1 +variable ny equal ${nrep} +variable ny equal 1 +variable nz equal ${nrep} +variable nz equal 1 + +boundary p p p + +lattice custom $a a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 +lattice custom 3.316 a1 1 0 0 a2 0 1 0 a3 0 0 1 basis 0 0 0 basis 0.5 0.5 0.5 Lattice spacing in x,y,z = 3.316 3.316 3.316 +region box block 0 ${nx} 0 ${ny} 0 ${nz} +region box block 0 1 0 ${ny} 0 ${nz} +region box block 0 1 0 1 0 ${nz} +region box block 0 1 0 1 0 1 +create_box 2 box Created orthogonal box = (0 0 0) to (3.316 3.316 3.316) 1 by 2 by 2 MPI processor grid +create_atoms 1 box basis 1 1 basis 2 2 Created 2 atoms using lattice units in orthogonal box = (0 0 0) to (3.316 3.316 3.316) - create_atoms CPU = 0.003 seconds -1 atoms in group snapgroup + create_atoms CPU = 0.001 seconds + +mass * 180.88 + +# define atom compute and grid compute + +variable rcutfac equal 4.67637 +variable radelem1 equal 0.5 +variable radelem2 equal 0.5 +variable sigmaelem1 equal 0.1355 +variable sigmaelem2 equal 0.2 +variable gaussian_options string "${rcutfac} ${radelem1} ${radelem2} ${sigmaelem1} 
${sigmaelem2}" +4.67637 ${radelem1} ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 ${radelem2} ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 ${sigmaelem1} ${sigmaelem2} +4.67637 0.5 0.5 0.1355 ${sigmaelem2} +4.67637 0.5 0.5 0.1355 0.2 + +# build zero potential to force ghost atom creation + +pair_style zero ${rcutfac} +pair_style zero 4.67637 +pair_coeff * * + +# define atom and grid computes + +compute mygridlocal all gaussian/grid/local grid ${ngrid} ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 ${ngrid} ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 ${ngrid} ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 ${gaussian_options} +compute mygridlocal all gaussian/grid/local grid 2 2 2 4.67637 0.5 0.5 0.1355 0.2 + +# define output + +dump 1 all local 1000 dump.glocal c_mygridlocal[*] +dump 2 all custom 1000 dump.gatom id x y z +compute val1 all reduce max c_mygridlocal[7] inputs local +compute val2 all reduce max c_mygridlocal[8] inputs local +thermo_style custom step c_val1 c_val2 + +# run + +run 0 WARNING: No fixes with time integration, atoms won't move (src/verlet.cpp:60) Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule Neighbor list info ... @@ -23,16 +99,12 @@ Neighbor list info ... stencil: half/bin/3d bin: standard WARNING: Proc sub-domain size < neighbor skin, could lead to lost atoms (src/domain.cpp:1202) -Setting up Verlet run ... 
- Unit style : metal - Current step : 0 - Time step : 0.001 Per MPI rank memory allocation (min/avg/max) = 3.522 | 3.523 | 3.524 Mbytes - Step Temp E_pair E_mol TotEng Press - 0 0 0 0 0 0 -Loop time of 6.4355e-06 on 4 procs for 0 steps with 2 atoms + Step c_val1 c_val2 + 0 25.521859 7.9367045 +Loop time of 2.238e-06 on 4 procs for 0 steps with 2 atoms -15.5% CPU use with 4 MPI tasks x 1 OpenMP threads +89.4% CPU use with 4 MPI tasks x 1 OpenMP threads MPI task timing breakdown: Section | min time | avg time | max time |%varavg| %total @@ -42,7 +114,7 @@ Neigh | 0 | 0 | 0 | 0.0 | 0.00 Comm | 0 | 0 | 0 | 0.0 | 0.00 Output | 0 | 0 | 0 | 0.0 | 0.00 Modify | 0 | 0 | 0 | 0.0 | 0.00 -Other | | 6.435e-06 | | |100.00 +Other | | 2.238e-06 | | |100.00 Nlocal: 0.5 ave 1 max 0 min Histogram: 2 0 0 0 0 0 0 0 0 2 From 824dcda382cbed99a4be27a72246412d7add53c9 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 19 Dec 2024 17:22:36 +0100 Subject: [PATCH 42/51] Fixed two style issues in the docs, got rid of printf that's also deleted on develop --- doc/src/compute_gaussian_grid_local.rst | 2 +- doc/src/compute_sna_atom.rst | 2 +- src/KOKKOS/pair_snap_kokkos_impl.h | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/src/compute_gaussian_grid_local.rst b/doc/src/compute_gaussian_grid_local.rst index 45ef6642c9..4ae99e7b55 100644 --- a/doc/src/compute_gaussian_grid_local.rst +++ b/doc/src/compute_gaussian_grid_local.rst @@ -56,7 +56,7 @@ and *z* coordinates of the grid point, followed by the values of the Gaussians .. include:: accel_styles.rst - + ---------- diff --git a/doc/src/compute_sna_atom.rst b/doc/src/compute_sna_atom.rst index 2079234ddf..2572093499 100644 --- a/doc/src/compute_sna_atom.rst +++ b/doc/src/compute_sna_atom.rst @@ -476,7 +476,7 @@ al. :ref:`(Lafourcade) `. .. 
include:: accel_styles.rst - + ---------- Output info diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 247289042e..6c3cea43ce 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -551,9 +551,8 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - printf("d_wjelem: %f %f %f %f\n", d_wjelem[0], d_wjelem[1], d_wjelem(0), d_wjelem(1)); SNAKokkos my_sna = snaKK; - + // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; From 1f61c9ba828a952e76d53b3a4228ba48ab4d2832 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 19 Dec 2024 17:39:02 +0100 Subject: [PATCH 43/51] I forgot to include a change in merging develop that seems to be very important --- src/KOKKOS/pair_snap_kokkos_impl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 6c3cea43ce..2b9b862645 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -551,8 +551,6 @@ template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; From 399f81cf462a166b86904e04e8aa883ea1217c0d Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Thu, 19 Dec 2024 19:20:41 +0100 Subject: [PATCH 44/51] Code by Aidan Thompson, I am only committing it; fixing the cyclical include that broke the build process after merging develop Co-authored-by: Aidan Thompson --- src/KOKKOS/sna_kokkos.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index a438ccd25e..5ba5c159ac 100644 --- 
a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -29,7 +29,9 @@ #endif namespace LAMMPS_NS { - +// copied from pair_snap_kokkos.h +// pre-declare so sna_kokkos.h can refer to it +template class PairSNAPKokkos; template struct WignerWrapper { using real_type = real_type_; From 3101bb326341d33aedd261bb47713384be801a24 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 14 Jan 2025 14:34:59 -0700 Subject: [PATCH 45/51] Add new files to GNU Make build system --- src/KOKKOS/Install.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 64ba0c6b03..daa64d64e1 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -106,6 +106,12 @@ action compute_composition_atom_kokkos.cpp compute_composition_atom.cpp action compute_composition_atom_kokkos.h compute_composition_atom.h action compute_orientorder_atom_kokkos.cpp action compute_orientorder_atom_kokkos.h +action compute_sna_grid_kokkos.cpp compute_sna_grid.cpp +action compute_sna_grid_kokkos.h compute_sna_grid.h +action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp +action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp +action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h +action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp action compute_temp_deform_kokkos.cpp action compute_temp_deform_kokkos.h action compute_temp_kokkos.cpp From 0ee4bf621fc8e344afceba69f012b0f0aefd4496 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 14 Jan 2025 14:35:42 -0700 Subject: [PATCH 46/51] Fix some compile issues and remove unused variables --- src/KOKKOS/compute_sna_grid_kokkos.h | 4 +- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 130 ++++++++--------- src/KOKKOS/compute_sna_grid_local_kokkos.h | 4 +- .../compute_sna_grid_local_kokkos_impl.h | 131 ++++++++---------- src/KOKKOS/pair_snap_kokkos.h | 1 - src/KOKKOS/pair_snap_kokkos_impl.h | 3 +- src/KOKKOS/sna_kokkos.h | 4 +- src/KOKKOS/sna_kokkos_impl.h | 13 +- 8 
files changed, 130 insertions(+), 160 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index bd47059312..a65ff44546 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -232,7 +232,7 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { Kokkos::View d_radelem; // element radii Kokkos::View d_wjelem; // elements weights - //Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_coeffelem; // element bispectrum coefficients Kokkos::View d_sinnerelem; // element inner cutoff midpoint Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_ninside; // ninside for all atoms in list @@ -272,6 +272,8 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { double h0, h1, h2, h3, h4, h5; double lo0, lo1, lo2; + // Make SNAKokkos a friend + friend class SNAKokkos; }; // These wrapper classes exist to make the compute style factory happy/avoid having diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 2101d5968b..8275e810a3 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -121,13 +121,9 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos } Kokkos::deep_copy(d_test,h_test); - double bytes = MemKK::memory_usage(d_wjelem); - - snaKK = SNAKokkos(rfac0,twojmax, - rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); snaKK.grow_rij(0,0); snaKK.init(); - } // Destructor @@ -380,8 +376,6 @@ void ComputeSNAGridKokkos::operator() (Tag // routines and avoid having to loop over all atoms (which limits us to // natoms = max team size). 
- SNAKokkos my_sna = snaKK; - // basic quantities associated with this team: // team_rank : rank of thread in this team // league_rank : rank of team in this league @@ -399,10 +393,10 @@ void ComputeSNAGridKokkos::operator() (Tag // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. // If it's not, it's assigned to -1. - const int tile_size = ntotal; //max_neighs; // number of elements per thread - const int team_rank = team.team_rank(); - const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team - int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + //const int tile_size = ntotal; //max_neighs; // number of elements per thread + //const int team_rank = team.team_rank(); + //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; // convert to grid indices @@ -456,7 +450,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int itype = 1; int ielem = 0; if (chemflag) ielem = d_map[itype]; - const double radi = d_radelem[ielem]; + //const double radi = d_radelem[ielem]; // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. // The purpose here is to transform for triclinic boxes. 
@@ -525,22 +519,22 @@ void ComputeSNAGridKokkos::operator() (Tag if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; offset++; } } @@ -557,22 +551,22 @@ void ComputeSNAGridKokkos::operator() (Tag int jtype = type(j); int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = 
static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; offset++; } } @@ -592,22 +586,22 @@ void ComputeSNAGridKokkos::operator() (Tag int jtype = type(j); int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 
0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } @@ -619,7 +613,6 @@ void ComputeSNAGridKokkos::operator() (Tag template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; @@ -627,28 +620,26 @@ void ComputeSNAGridKokkos::operator() (Tag const int ninside = d_ninside(ii); if (jnbor >= ninside) return; - my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; - int itype = type(ii); + //int itype = type(ii); // force ielem to be zero (i.e. 
type 1) per `compute_sna_grid.cpp` int ielem = 0; - my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); + snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend_location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -667,7 +658,7 @@ void ComputeSNAGridKokkos::operator() (Tag const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); }); } @@ -675,7 +666,6 @@ void ComputeSNAGridKokkos::operator() (Tag template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -692,28 +682,27 @@ void ComputeSNAGridKokkos::operator() (Tag const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); }); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxu > my_sna.idxu_max) return; + if (idxu > snaKK.idxu_max) return; int elem_count = chemflag ? 
nelements : 1; for (int ielem = 0; ielem < elem_count; ielem++){ - const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); if (mapper.flip_sign == 1){ utot_im = -utot_im; @@ -721,11 +710,11 @@ void ComputeSNAGridKokkos::operator() (Tag utot_re = -utot_re; } - my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { - my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; } } } @@ -733,46 +722,43 @@ void ComputeSNAGridKokkos::operator() (Tag template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= my_sna.idxz_max) return; + if (jjz >= snaKK.idxz_max) return; - my_sna.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjb >= my_sna.idxb_max) return; + 
if (jjb >= snaKK.idxb_max) return; - my_sna.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.compute_bi(iatom_mod,jjb,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxb >= my_sna.idxb_max) return; + if (idxb >= snaKK.idxb_max) return; - const int ntriples = my_sna.ntriples; + const int ntriples = snaKK.ntriples; for (int itriple = 0; itriple < ntriples; itriple++) { - const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); - my_sna.blist(iatom, itriple, idxb) = blocal; + snaKK.blist(iatom, itriple, idxb) = blocal; } } @@ -780,8 +766,6 @@ void ComputeSNAGridKokkos::operator() (Tag template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridLocalFill, const int& ii) const { - SNAKokkos my_sna = snaKK; - // extract grid index int igrid = ii + chunk_offset; @@ -840,7 +824,7 @@ void ComputeSNAGridKokkos::operator() (Tag for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - d_gridall(igrid,icoeff+3) = my_sna.blist(ii,idx_chem,idxb); + d_gridall(igrid,icoeff+3) = snaKK.blist(ii,idx_chem,idxb); } } diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index 9073b921c1..2f2ae59426 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -225,7 +225,7 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { Kokkos::View d_radelem; // element radii Kokkos::View d_wjelem; // elements weights - //Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_coeffelem; // element bispectrum coefficients 
Kokkos::View d_sinnerelem; // element inner cutoff midpoint Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_ninside; // ninside for all atoms in list @@ -271,6 +271,8 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { double h0, h1, h2, h3, h4, h5; double lo0, lo1, lo2; + // Make SNAKokkos a friend + friend class SNAKokkos; }; // These wrapper classes exist to make the compute style factory happy/avoid having diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index 8f6958904b..1a40af4e8c 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -121,13 +121,9 @@ ComputeSNAGridLocalKokkos::ComputeSNAGridL } Kokkos::deep_copy(d_test,h_test); - double bytes = MemKK::memory_usage(d_wjelem); - - snaKK = SNAKokkos(rfac0,twojmax, - rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); snaKK.grow_rij(0,0); snaKK.init(); - } // Destructor @@ -386,8 +382,6 @@ void ComputeSNAGridLocalKokkos::operator() // routines and avoid having to loop over all atoms (which limits us to // natoms = max team size). - SNAKokkos my_sna = snaKK; - // basic quantities associated with this team: // team_rank : rank of thread in this team // league_rank : rank of team in this league @@ -405,10 +399,10 @@ void ComputeSNAGridLocalKokkos::operator() // This is used to cache whether or not an atom is within the cutoff. // If it is, type_cache is assigned to the atom type. // If it's not, it's assigned to -1. 
- const int tile_size = ntotal; //max_neighs; // number of elements per thread - const int team_rank = team.team_rank(); - const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team - int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + //const int tile_size = ntotal; //max_neighs; // number of elements per thread + //const int team_rank = team.team_rank(); + //const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + //int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; // convert to grid indices @@ -475,7 +469,7 @@ void ComputeSNAGridLocalKokkos::operator() const int itype = 1; int ielem = 0; if (chemflag) ielem = d_map[itype]; - const double radi = d_radelem[ielem]; + //const double radi = d_radelem[ielem]; // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. // The purpose here is to transform for triclinic boxes. 
@@ -503,7 +497,6 @@ void ComputeSNAGridLocalKokkos::operator() if (jtype >= 0) ninside++; - } /* @@ -544,22 +537,22 @@ void ComputeSNAGridLocalKokkos::operator() if (rsq < rnd_cutsq(itype,jtype) && rsq > 1e-20) { int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; offset++; } } @@ -576,22 +569,22 @@ void ComputeSNAGridLocalKokkos::operator() int jtype = type(j); int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // 
actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; offset++; } } @@ -611,22 +604,22 @@ void ComputeSNAGridLocalKokkos::operator() int jtype = type(j); int jelem = 0; if (chemflag) jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp // actually since the views here have values starting at 0, let's use jelem - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 
0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } @@ -638,7 +631,6 @@ void ComputeSNAGridLocalKokkos::operator() template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; @@ -646,28 +638,26 @@ void ComputeSNAGridLocalKokkos::operator() const int ninside = d_ninside(ii); if (jnbor >= ninside) return; - my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int ii = iatom_mod + iatom_div * vector_length; if (ii >= chunk_size) return; - int itype = type(ii); + //int itype = type(ii); // force ielem to be zero (i.e. 
type 1) per `compute_sna_grid.cpp` int ielem = 0; - my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); + snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend_location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -686,7 +676,7 @@ void ComputeSNAGridLocalKokkos::operator() const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); }); } @@ -694,7 +684,6 @@ void ComputeSNAGridLocalKokkos::operator() template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; @@ -711,28 +700,27 @@ void ComputeSNAGridLocalKokkos::operator() const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); }); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxu > my_sna.idxu_max) return; + if (idxu > snaKK.idxu_max) return; int elem_count = chemflag ? 
nelements : 1; for (int ielem = 0; ielem < elem_count; ielem++){ - const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); if (mapper.flip_sign == 1){ utot_im = -utot_im; @@ -740,11 +728,11 @@ void ComputeSNAGridLocalKokkos::operator() utot_re = -utot_re; } - my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; if (mapper.flip_sign == 0) { - my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; } } } @@ -752,46 +740,43 @@ void ComputeSNAGridLocalKokkos::operator() template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= my_sna.idxz_max) return; + if (jjz >= snaKK.idxz_max) return; - my_sna.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.compute_zi(iatom_mod,jjz,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjb >= 
my_sna.idxb_max) return; + if (jjb >= snaKK.idxb_max) return; - my_sna.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.compute_bi(iatom_mod,jjb,iatom_div); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (idxb >= my_sna.idxb_max) return; + if (idxb >= snaKK.idxb_max) return; - const int ntriples = my_sna.ntriples; + const int ntriples = snaKK.ntriples; for (int itriple = 0; itriple < ntriples; itriple++) { - const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); - my_sna.blist(iatom, itriple, idxb) = blocal; + snaKK.blist(iatom, itriple, idxb) = blocal; } } @@ -799,8 +784,6 @@ void ComputeSNAGridLocalKokkos::operator() template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocal2Fill, const int& ii) const { - SNAKokkos my_sna = snaKK; - // extract grid index int igrid = ii + chunk_offset; @@ -859,7 +842,7 @@ void ComputeSNAGridLocalKokkos::operator() for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - d_alocal(igrid,icoeff+6) = my_sna.blist(ii,idx_chem,idxb); + d_alocal(igrid,icoeff+6) = snaKK.blist(ii,idx_chem,idxb); } } diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index 660503eed8..4dc4029d12 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -375,7 +375,6 @@ class PairSNAPKokkos : public PairSNAP { // Make SNAKokkos a friend friend class SNAKokkos; - }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 2b9b862645..783043e6d9 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ 
b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -536,8 +536,7 @@ void PairSNAPKokkos::coeff(int narg, char Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); Kokkos::deep_copy(d_map,h_map); - snaKK = SNAKokkos(*this); //rfac0,twojmax, - //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); snaKK.grow_rij(0,0); snaKK.init(); } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 5ba5c159ac..61aebaf97d 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -172,9 +172,9 @@ class SNAKokkos { KOKKOS_INLINE_FUNCTION SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); + template inline - //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); - SNAKokkos(const PairSNAPKokkos&); + SNAKokkos(const CopyClass&); KOKKOS_INLINE_FUNCTION ~SNAKokkos(); diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 1ea971d146..622ef0b8ae 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -29,17 +29,18 @@ static const double MY_PI = 3.14159265358979323846; // pi static const double MY_PI2 = 1.57079632679489661923; // pi/2 template +template inline -SNAKokkos::SNAKokkos(const PairSNAPKokkos& psk) - : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag), - bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag), - wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag), - quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem) +SNAKokkos::SNAKokkos(const CopyClass& copy) + : twojmax(copy.twojmax), d_coeffelem(copy.d_coeffelem), rmin0(copy.rmin0), + rfac0(copy.rfac0), switch_flag(copy.switchflag), switch_inner_flag(copy.switchinnerflag), + chem_flag(copy.chemflag), bnorm_flag(copy.bnormflag), wselfall_flag(copy.wselfallflag), + quadratic_flag(copy.quadraticflag), bzero_flag(copy.bzeroflag) { wself = static_cast(1.0); if 
(chem_flag) - nelements = psk.nelements; + nelements = copy.nelements; else nelements = 1; From 008bf146938fa1992084556e85141222eaa81983 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 14 Jan 2025 15:49:40 -0700 Subject: [PATCH 47/51] Fix compile issues from #4391 --- src/KOKKOS/compute_sna_grid_kokkos.h | 71 ++++-- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 217 ++++++++++-------- src/KOKKOS/compute_sna_grid_local_kokkos.h | 69 ++++-- .../compute_sna_grid_local_kokkos_impl.h | 214 +++++++++-------- src/KOKKOS/pair_snap_kokkos_impl.h | 22 +- 5 files changed, 355 insertions(+), 238 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index a65ff44546..ac378b07df 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -53,7 +53,6 @@ struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl struct TagPairSNAPComputeZi{}; struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -68,9 +67,8 @@ struct TagCSNAGridPreUi{}; struct TagCSNAGridComputeUiSmall{}; // more parallelism, more divergence struct TagCSNAGridComputeUiLarge{}; // less parallelism, no divergence struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagCSNAGridComputeZi{}; -struct TagCSNAGridComputeBi{}; -struct TagCSNAGridTransformBi{}; // re-order blist from AoSoA to AoS +template struct TagCSNAGridComputeZi{}; +template struct TagCSNAGridComputeBi{}; struct TagCSNAGridLocalFill{}; // fill the gridlocal array //struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce @@ -114,9 +112,10 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { static constexpr int team_size_compute_ui = 2; static constexpr int tile_size_transform_ui = 2; static constexpr int 
tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_transform_bi = 2; static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 2; #else static constexpr int team_size_compute_neigh = 4; @@ -126,33 +125,44 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_transform_bi = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; #endif // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> // and reduces the verbosity of the LaunchBound by hiding the explicit // multiplication by vector_length - template - using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNAP>; + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNA>; // MDRangePolicy for the 3D grid loop: - template + template using CSNAGrid3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; // Testing out team policies - template - using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; - //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, Kokkos::IndexType, 
Kokkos::IndexType, TagComputeSNAP>; + template + using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + //using CSNAGridTeamPolicy = typename Kokkos::TeamPolicy, Kokkos::IndexType, Kokkos::IndexType, TagComputeSNA>; //using team_member = typename team_policy::member_type; // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches // This hides the LaunchBounds abstraction by hiding the explicit // multiplication by vector length - template - using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, chunk_size_div}, + {vector_length, num_tiles, 1}); + } ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; @@ -193,7 +203,13 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { void operator() (TagCSNAGridComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridPreUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridPreUi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -202,16 +218,31 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { void operator() (TagCSNAGridComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridTransformUi,const int 
iatom_mod, const int j, const int iatom_div) const; + void operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagCSNAGridTransformUi, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeZi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom, const int& idxb) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridLocalFill,const int& ii) const; diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 8275e810a3..ec69b8bbdc 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -222,7 +222,7 @@ void ComputeSNAGridKokkos::compute_array() // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - if (triclinic){ + if (triclinic) { /* 
xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; @@ -266,10 +266,8 @@ void ComputeSNAGridKokkos::compute_array() //PreUi { - // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` - Snap3DRangePolicy - policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); - Kokkos::parallel_for("PreUi",policy_preui,*this); + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); } // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot @@ -292,7 +290,7 @@ void ComputeSNAGridKokkos::compute_array() policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); } else { - // Version w/out parallelism over j_bend + // Version w/out parallelism over j_bend // total number of teams needed: (natoms / 32) * (ntotal) const int n_teams = chunk_size_div * max_neighs; @@ -307,33 +305,29 @@ void ComputeSNAGridKokkos::compute_array() //TransformUi: un-"fold" ulisttot, zero ylist { - // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); - Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); } - //Compute bispectrum in AoSoA data layout, transform Bi + //Compute bispectrum + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` - //ComputeZi - const int idxz_max = snaKK.idxz_max; - Snap3DRangePolicy - 
policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); - //ComputeBi - const int idxb_max = snaKK.idxb_max; - Snap3DRangePolicy - policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); - Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); - //Transform data layout of blist out of AoSoA - //We need this because `blist` gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - //NOTE: Do we need this in compute sna/grid/kk? 
- Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); + } // Fill the grid array with bispectrum values { @@ -346,6 +340,8 @@ void ComputeSNAGridKokkos::compute_array() } // end while + copymode = 0; + k_gridlocal.template modify(); k_gridlocal.template sync(); @@ -478,7 +474,6 @@ void ComputeSNAGridKokkos::operator() (Tag if (jtype >= 0) ninside++; - } /* @@ -609,39 +604,68 @@ void ComputeSNAGridKokkos::operator() (Tag */ } +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - const int ninside = d_ninside(ii); + const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; - snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) 
return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); } template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int iatom_mod, const int j, const int iatom_div) const { +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; + int itype = type(iatom); + int ielem = d_map[itype]; - //int itype = type(ii); - // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp` - int ielem = 0; + snaKK.pre_ui(iatom, j, ielem); +} - snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridPreUi, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int itype = type(iatom); + const int ielem = d_map[itype]; + + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - // extract flattened atom_div / neighbor number / bend_location + // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; // extract neighbor index, iatom_div @@ -686,81 +710,90 @@ void ComputeSNAGridKokkos::operator() (Tag }); } +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. 
+------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - - if (idxu > snaKK.idxu_max) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++){ - - const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - - auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - - if (mapper.flip_sign == 1){ - utot_im = -utot_im; - } else if (mapper.flip_sign == -1){ - utot_re = -utot_re; - } - - snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - } - } + if (idxu >= snaKK.idxu_max) return; + snaKK.transform_ui(iatom, idxu); } template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom, const int& idxu) const { + if (iatom >= chunk_size) return; + snaKK.transform_ui(iatom, idxu); +} +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + +/* ---------------------------------------------------------------------- + Compute all elements 
of the Z tensor and store them into the `zlist` + view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.template compute_zi(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_zi(iatom, jjz); +} +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_zi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjb >= snaKK.idxb_max) return; - - snaKK.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.template compute_bi(iatom, jjb); } template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridKokkos::operator() (TagCSNAGridTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; 
+template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom, const int& jjb) const { if (iatom >= chunk_size) return; + snaKK.template compute_bi(iatom, jjb); +} - if (idxb >= snaKK.idxb_max) return; - - const int ntriples = snaKK.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); - - snaKK.blist(iatom, itriple, idxb) = blocal; - } - +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridKokkos::operator() (TagCSNAGridComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.template compute_bi(iatom, jjb); } template diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index 2f2ae59426..735e1b03d0 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -53,7 +53,6 @@ struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl struct TagPairSNAPComputeZi{}; struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; -struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYiWithZlist{}; template @@ -68,9 +67,8 @@ struct TagCSNAGridLocalPreUi{}; struct TagCSNAGridLocalComputeUiSmall{}; // more parallelism, more divergence struct TagCSNAGridLocalComputeUiLarge{}; // less parallelism, no divergence struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagCSNAGridLocalComputeZi{}; -struct TagCSNAGridLocalComputeBi{}; -struct TagCSNAGridLocalTransformBi{}; // re-order blist from AoSoA to AoS +template struct TagCSNAGridLocalComputeZi{}; +template struct TagCSNAGridLocalComputeBi{}; struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array //struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same 
kinda loop as ComputeForce @@ -113,9 +111,10 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { static constexpr int team_size_compute_ui = 2; static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_transform_bi = 2; static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 2; #else static constexpr int team_size_compute_neigh = 4; @@ -125,31 +124,42 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_transform_bi = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 
4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; #endif // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> // and reduces the verbosity of the LaunchBound by hiding the explicit // multiplication by vector_length - template - using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNAP>; + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagComputeSNA>; // MDRangePolicy for the 3D grid loop: - template + template using CSNAGridLocal3DPolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>>; // Testing out team policies - template - using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + template + using CSNAGridLocalTeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches // This hides the LaunchBounds abstraction by hiding the explicit // multiplication by vector length - template - using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNAP>; + template + using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagComputeSNA>; + + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, chunk_size_div}, + {vector_length, num_tiles, 1}); + } ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridLocalKokkos() override; @@ -186,7 +196,13 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { void 
operator() (TagCSNAGridLocalComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalPreUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalPreUi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -195,16 +211,31 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { void operator() (TagCSNAGridLocalComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const; KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagCSNAGridLocalTransformUi, const int& iatom) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeZi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + 
void operator() (TagCSNAGridLocalComputeZi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom, const int& idxb) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagCSNAGridLocalComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridLocal2Fill,const int& ii) const; diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index 1a40af4e8c..1c3fed3a0c 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -232,7 +232,7 @@ void ComputeSNAGridLocalKokkos::compute_lo // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - if (triclinic){ + if (triclinic) { /* xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; @@ -276,10 +276,8 @@ void ComputeSNAGridLocalKokkos::compute_lo //PreUi { - // tile_size_pre_ui is defined in `compute_sna_grid_kokkos.h` - Snap3DRangePolicy - policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); - Kokkos::parallel_for("PreUi",policy_preui,*this); + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); } // ComputeUi w/ vector parallelism, shared memory, direct atomicAdd into ulisttot @@ -302,7 +300,7 @@ void ComputeSNAGridLocalKokkos::compute_lo policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiSmall", policy_ui, *this); } else { - // Version w/out parallelism over j_bend + // Version 
w/out parallelism over j_bend // total number of teams needed: (natoms / 32) * (ntotal) const int n_teams = chunk_size_div * max_neighs; @@ -317,33 +315,29 @@ void ComputeSNAGridLocalKokkos::compute_lo //TransformUi: un-"fold" ulisttot, zero ylist { - // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); - Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); } - //Compute bispectrum in AoSoA data layout, transform Bi + //Compute bispectrum + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` - //ComputeZi - const int idxz_max = snaKK.idxz_max; - Snap3DRangePolicy - policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); - //ComputeBi - const int idxb_max = snaKK.idxb_max; - Snap3DRangePolicy - policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); - Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); - //Transform data layout of blist out of AoSoA - //We need this because `blist` 
gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - //NOTE: Do we need this in compute sna/grid/kk? - Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); + } // Fill the grid array with bispectrum values { @@ -627,39 +621,68 @@ void ComputeSNAGridLocalKokkos::operator() */ } +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; - const int ninside = d_ninside(ii); + const int ninside = d_ninside(iatom); if (jnbor >= ninside) return; - snaKK.compute_cayley_klein(iatom_mod,jnbor,iatom_div); + snaKK.compute_cayley_klein(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; 
+ + snaKK.pre_ui(iatom, j, ielem); } template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int iatom_mod, const int j, const int iatom_div) const { +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; + int itype = type(iatom); + int ielem = d_map[itype]; - //int itype = type(ii); - // force ielem to be zero (i.e. type 1) per `compute_sna_grid.cpp` - int ielem = 0; + snaKK.pre_ui(iatom, j, ielem); +} - snaKK.pre_ui(iatom_mod, j, ielem, iatom_div); +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalPreUi, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int itype = type(iatom); + const int ielem = d_map[itype]; + + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); } template KOKKOS_INLINE_FUNCTION void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - // extract flattened atom_div / neighbor number / bend_location + // extract flattened atom_div / neighbor number / bend location int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; // extract neighbor index, iatom_div @@ -704,81 +727,90 @@ void ComputeSNAGridLocalKokkos::operator() }); } +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. 
+------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - - if (idxu > snaKK.idxu_max) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++){ - - const FullHalfMapper mapper = snaKK.idxu_full_half[idxu]; - - auto utot_re = snaKK.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = snaKK.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - - if (mapper.flip_sign == 1){ - utot_im = -utot_im; - } else if (mapper.flip_sign == -1){ - utot_re = -utot_re; - } - - snaKK.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - snaKK.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - snaKK.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - } - } + if (idxu >= snaKK.idxu_max) return; + snaKK.transform_ui(iatom, idxu); } template KOKKOS_INLINE_FUNCTION -void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom, const int& idxu) const { + if (iatom >= chunk_size) return; + snaKK.transform_ui(iatom, idxu); +} +template +KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + +/* 
---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjz >= snaKK.idxz_max) return; - - snaKK.compute_zi(iatom_mod,jjz,iatom_div); + snaKK.template compute_zi(iatom, jjz); } template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_zi(iatom, jjz); +} +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_zi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" view +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (jjb >= snaKK.idxb_max) return; - - snaKK.compute_bi(iatom_mod,jjb,iatom_div); + snaKK.template compute_bi(iatom, jjb); } template -KOKKOS_INLINE_FUNCTION -void ComputeSNAGridLocalKokkos::operator() 
(TagCSNAGridLocalTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - - const int iatom = iatom_mod + iatom_div * vector_length; +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom, const int& jjb) const { if (iatom >= chunk_size) return; + snaKK.template compute_bi(iatom, jjb); +} - if (idxb >= snaKK.idxb_max) return; - - const int ntriples = snaKK.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const real_type blocal = snaKK.blist_pack(iatom_mod, idxb, itriple, iatom_div); - - snaKK.blist(iatom, itriple, idxb) = blocal; - } - +template +template KOKKOS_INLINE_FUNCTION +void ComputeSNAGridLocalKokkos::operator() (TagCSNAGridLocalComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.template compute_bi(iatom, jjb); } template diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 783043e6d9..17ce8e1c9d 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -3,12 +3,10 @@ LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories LAMMPS development team: developers@lammps.org - Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. - See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ @@ -39,17 +37,6 @@ namespace LAMMPS_NS { -// Outstanding issues with quadratic term -// 1. 
there seems to a problem with compute_optimized energy calc -// it does not match compute_regular, even when quadratic coeffs = 0 - -//static double t1 = 0.0; -//static double t2 = 0.0; -//static double t3 = 0.0; -//static double t4 = 0.0; -//static double t5 = 0.0; -//static double t6 = 0.0; -//static double t7 = 0.0; /* ---------------------------------------------------------------------- */ template @@ -219,7 +206,8 @@ void PairSNAPKokkos::compute(int eflag_in, // team_size_compute_neigh is defined in `pair_snap_kokkos.h` int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + SnapAoSoATeamPolicy + policy_neigh(chunk_size,team_size_compute_neigh,vector_length); policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); } @@ -259,7 +247,8 @@ void PairSNAPKokkos::compute(int eflag_in, const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); } else { @@ -269,7 +258,8 @@ void PairSNAPKokkos::compute(int eflag_in, const int n_teams = chunk_size_div * max_neighs; const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + SnapAoSoATeamPolicy + policy_ui(n_teams_div, team_size_compute_ui, vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); } From eb5977dc66881f63d0c6a200c8321845e261094a Mon Sep 17 
00:00:00 2001 From: Stan Moore Date: Tue, 14 Jan 2025 15:57:15 -0700 Subject: [PATCH 48/51] Fix issues with host_flag --- src/KOKKOS/compute_sna_grid_kokkos.h | 3 --- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 26 +------------------ src/KOKKOS/compute_sna_grid_local_kokkos.h | 3 --- .../compute_sna_grid_local_kokkos_impl.h | 25 +----------------- 4 files changed, 2 insertions(+), 55 deletions(-) diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index ac378b07df..5a81309a4e 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -167,7 +167,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { ComputeSNAGridKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridKokkos() override; - void init() override; void setup() override; void compute_array() override; @@ -321,7 +320,6 @@ class ComputeSNAGridKokkosDevice : public ComputeSNAGridKokkos::~ComputeSNAGridKokko //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } -// Init - -template -void ComputeSNAGridKokkos::init() -{ - if (host_flag) { - return; - } - ComputeSNAGrid::init(); - -} - // Setup template void ComputeSNAGridKokkos::setup() { - // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
@@ -184,6 +171,7 @@ template void ComputeSNAGridKokkos::compute_array() { if (host_flag) { + ComputeSNAGrid::compute_array(); return; } @@ -907,12 +895,6 @@ template ComputeSNAGridKokkosDevice::ComputeSNAGridKokkosDevice(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridKokkos(lmp, narg, arg) { ; } -template -void ComputeSNAGridKokkosDevice::init() -{ - Base::init(); -} - template void ComputeSNAGridKokkosDevice::compute_array() { @@ -924,12 +906,6 @@ template ComputeSNAGridKokkosHost::ComputeSNAGridKokkosHost(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridKokkos(lmp, narg, arg) { ; } -template -void ComputeSNAGridKokkosHost::init() -{ - Base::init(); -} - template void ComputeSNAGridKokkosHost::compute_array() { diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index 735e1b03d0..754d4e36af 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -164,7 +164,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { ComputeSNAGridLocalKokkos(class LAMMPS *, int, char **); ~ComputeSNAGridLocalKokkos() override; - void init() override; void setup() override; void compute_local() override; @@ -320,7 +319,6 @@ class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos::~ComputeSNAGrid //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } -// Init - -template -void ComputeSNAGridLocalKokkos::init() -{ - if (host_flag) { - return; - } - ComputeSNAGridLocal::init(); - -} - // Setup template @@ -191,6 +179,7 @@ template void ComputeSNAGridLocalKokkos::compute_local() { if (host_flag) { + ComputeSNAGridLocal::compute_array(); return; } @@ -924,12 +913,6 @@ template ComputeSNAGridLocalKokkosDevice::ComputeSNAGridLocalKokkosDevice(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } -template -void ComputeSNAGridLocalKokkosDevice::init() -{ - Base::init(); -} - template void 
ComputeSNAGridLocalKokkosDevice::compute_local() { @@ -941,12 +924,6 @@ template ComputeSNAGridLocalKokkosHost::ComputeSNAGridLocalKokkosHost(class LAMMPS *lmp, int narg, char **arg) : ComputeSNAGridLocalKokkos(lmp, narg, arg) { ; } -template -void ComputeSNAGridLocalKokkosHost::init() -{ - Base::init(); -} - template void ComputeSNAGridLocalKokkosHost::compute_local() { From 536aa7cadffd8ea998ea0b89e7c1cc569964c2fe Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Tue, 21 Jan 2025 17:09:04 +0100 Subject: [PATCH 49/51] Removed debug comments/old printfs etc. --- .../compute_gaussian_grid_local_kokkos.cpp | 53 ------- .../compute_gaussian_grid_local_kokkos.h | 22 --- src/KOKKOS/compute_sna_grid_kokkos.cpp | 56 ------- src/KOKKOS/compute_sna_grid_kokkos.h | 91 ----------- src/KOKKOS/compute_sna_grid_kokkos_impl.h | 130 --------------- src/KOKKOS/compute_sna_grid_local_kokkos.cpp | 56 ------- src/KOKKOS/compute_sna_grid_local_kokkos.h | 58 ------- .../compute_sna_grid_local_kokkos_impl.h | 150 ------------------ src/ML-SNAP/compute_gaussian_grid_local.cpp | 4 - src/ML-SNAP/compute_grid.cpp | 3 - src/ML-SNAP/compute_grid_local.cpp | 8 - 11 files changed, 631 deletions(-) diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp index 99380e0d63..cfd7e5a582 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.cpp @@ -54,17 +54,12 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP host_flag = (execution_space == Host); - // TODO: Extract cutsq in double loop below, no need for cutsq_tmp - - //cutsq_tmp = cutsq[1][1]; - for (int i = 1; i <= atom->ntypes; i++) { for (int j = 1; j <= atom->ntypes; j++){ k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutsq[i][j]; //cutsq_tmp; k_cutsq.template modify(); } } - //printf(">>> 1\n"); // Set up element lists int n = atom->ntypes; MemKK::realloc_kokkos(d_radelem,"ComputeSNAGridKokkos::radelem",n); 
@@ -72,13 +67,11 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP MemKK::realloc_kokkos(d_prefacelem,"ComputeSNAGridKokkos::prefacelem",n+1); MemKK::realloc_kokkos(d_argfacelem,"ComputeSNAGridKokkos::argfacelem",n+1); MemKK::realloc_kokkos(d_map,"ComputeSNAGridKokkos::map",n+1); - //printf(">>> 2\n"); auto h_radelem = Kokkos::create_mirror_view(d_radelem); auto h_sigmaelem = Kokkos::create_mirror_view(d_sigmaelem); auto h_prefacelem = Kokkos::create_mirror_view(d_prefacelem); auto h_argfacelem = Kokkos::create_mirror_view(d_argfacelem); auto h_map = Kokkos::create_mirror_view(d_map); - //printf(">>> 3\n"); // start from index 1 because of how compute sna/grid is for (int i = 1; i <= atom->ntypes; i++) { h_radelem(i-1) = radelem[i]; @@ -86,21 +79,11 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP h_prefacelem(i-1) = prefacelem[i]; h_argfacelem(i-1) = argfacelem[i]; } - //printf(">>> 4\n"); - // In pair snap some things like `map` get allocated regardless of chem flag. - // In this compute, however, map does not get allocated in parent classes. 
- /* - for (int i = 1; i <= atom->ntypes; i++) { - h_map(i) = map[i]; - } - */ - //printf(">>> 5\n"); Kokkos::deep_copy(d_radelem,h_radelem); Kokkos::deep_copy(d_sigmaelem,h_sigmaelem); Kokkos::deep_copy(d_prefacelem, h_prefacelem); Kokkos::deep_copy(d_argfacelem, h_argfacelem); Kokkos::deep_copy(d_map,h_map); - //printf(">>> 6\n"); } @@ -109,14 +92,12 @@ ComputeGaussianGridLocalKokkos::ComputeGaussianGridLocalKokkos(LAMMP template ComputeGaussianGridLocalKokkos::~ComputeGaussianGridLocalKokkos() { - //printf(">>> ComputeGaussianGridLocalKokkos destruct begin, copymode %d\n", copymode); if (copymode) return; memoryKK->destroy_kokkos(k_cutsq,cutsq); memoryKK->destroy_kokkos(k_alocal,alocal); //gridlocal_allocated = 0; - //printf(">>> ComputeGaussianGridLocalKokkos end\n"); } /* ---------------------------------------------------------------------- */ @@ -125,25 +106,12 @@ template void ComputeGaussianGridLocalKokkos::setup() { - // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. - // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
- - //ComputeGrid::set_grid_global(); - //ComputeGrid::set_grid_local(); ComputeGridLocal::setup(); // allocate arrays - //printf(">>> rows cols kokkos init: %d %d\n", size_local_rows, size_local_cols); memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); - - //gridlocal_allocated = 1; - //array = gridall; - array_local = alocal; - d_alocal = k_alocal.template view(); - //d_grid = k_grid.template view(); - //d_gridall = k_gridall.template view(); } @@ -160,8 +128,6 @@ void ComputeGaussianGridLocalKokkos::init() template void ComputeGaussianGridLocalKokkos::compute_local() { - //printf(">>> compute_local Kokkos begin\n"); - if (host_flag) { return; } @@ -202,11 +168,6 @@ void ComputeGaussianGridLocalKokkos::compute_local() team_size_default = 1; // cost will increase with increasing team size //32;//max_neighs; if (triclinic){ - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ h0 = domain->h[0]; h1 = domain->h[1]; h2 = domain->h[2]; @@ -228,9 +189,7 @@ void ComputeGaussianGridLocalKokkos::compute_local() int vector_length = vector_length_default; int team_size = team_size_default; check_team_size_for(chunk_size,team_size,vector_length); - //printf(">>> Check 1 %d %d %d\n", chunk_size, team_size, vector_length); typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); - //printf(">>> Check 2\n"); Kokkos::parallel_for("ComputeGaussianGridLocalNeigh",policy_neigh,*this); } @@ -243,8 +202,6 @@ void ComputeGaussianGridLocalKokkos::compute_local() k_alocal.template modify(); k_alocal.template sync(); - //printf(">>> k_alocal: %f\n", k_alocal.h_view(0,6)); - } /* ---------------------------------------------------------------------- */ @@ -254,7 +211,6 @@ KOKKOS_INLINE_FUNCTION void 
ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianGridLocalNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { const int ii = team.league_rank(); - //printf("%d\n", ii); if (ii >= chunk_size) return; @@ -284,7 +240,6 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG // index ii already captures the proper grid point //int igrid = iz * (nx * ny) + iy * nx + ix; - //printf("%d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before // multiply grid integers by grid spacing delx, dely, delz @@ -302,11 +257,6 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; xgrid[2] = h2*xgrid[2] + lo2; @@ -348,13 +298,10 @@ void ComputeGaussianGridLocalKokkos::operator() (TagComputeGaussianG const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; if (rsq < rnd_cutsq(jtype, jtype) ) { - //printf("%f %f\n", d_prefacelem(jtype-1), d_argfacelem(jtype-1)); int icol = size_local_cols_base + jtype - 1; d_alocal(igrid, icol) += d_prefacelem(jtype-1) * exp(-rsq * d_argfacelem(jtype-1)); } } - - //printf("%f\n", d_alocal(igrid, 6)); } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h index deb5eaa8cb..34e12bc4af 100644 --- a/src/KOKKOS/compute_gaussian_grid_local_kokkos.h +++ b/src/KOKKOS/compute_gaussian_grid_local_kokkos.h @@ -58,8 
+58,6 @@ template class ComputeGaussianGridLocalKokkos : public Comput void operator() (TagComputeGaussianGridLocalNeigh, const typename Kokkos::TeamPolicy::member_type& team) const; private: - //double adof, mvv2e, mv2d, boltz; - Kokkos::View d_radelem; // element radii Kokkos::View d_sigmaelem; Kokkos::View d_prefacelem; @@ -73,21 +71,6 @@ template class ComputeGaussianGridLocalKokkos : public Comput Kokkos::MemoryTraits > t_fparams_rnd; t_fparams_rnd rnd_cutsq; - /* - typename AT::t_x_array x; - typename AT::t_v_array v; - typename ArrayTypes::t_float_1d rmass; - typename ArrayTypes::t_float_1d mass; - typename ArrayTypes::t_int_1d type; - typename ArrayTypes::t_int_1d mask; - */ - - //typename AT::t_neighbors_2d d_neighbors; - //typename AT::t_int_1d d_ilist; - //typename AT::t_int_1d d_numneigh; - - //DAT::tdual_float_2d k_result; - //typename AT::t_float_2d d_result; int max_neighs, inum, chunk_size, chunk_offset; int host_flag; @@ -103,11 +86,6 @@ template class ComputeGaussianGridLocalKokkos : public Comput typename AT::t_float_2d d_alocal; // triclinic vars - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ double h0, h1, h2, h3, h4, h5; double lo0, lo1, lo2; }; diff --git a/src/KOKKOS/compute_sna_grid_kokkos.cpp b/src/KOKKOS/compute_sna_grid_kokkos.cpp index 8a05ba7901..197234cf1d 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_kokkos.cpp @@ -23,59 +23,3 @@ template class ComputeSNAGridKokkosHost; #endif } - - - - -// The following chunk will compile but we're gonna try a wrapper approach like pair snap. 
-/* -#include "compute_sna_grid_kokkos.h" - -#include "atom_kokkos.h" -#include "atom_masks.h" -#include "comm.h" -#include "error.h" -#include "memory_kokkos.h" -#include "modify.h" -#include "neigh_list.h" -#include "neigh_request.h" -#include "neighbor_kokkos.h" -#include "sna_kokkos.h" -#include "update.h" - -using namespace LAMMPS_NS; - -// ---------------------------------------------------------------------- - -template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : - ComputeSNAGrid(lmp, narg, arg) -{ - - printf("^^^ inside ComputeSNAGridKokkos constructor\n"); - kokkosable = 1; - atomKK = (AtomKokkos *) atom; - execution_space = ExecutionSpaceFromDevice::space; - datamask_read = EMPTY_MASK; - datamask_modify = EMPTY_MASK; - -} - -// ---------------------------------------------------------------------- - -template -ComputeSNAGridKokkos::~ComputeSNAGridKokkos() -{ - if (copymode) return; - - -} - -namespace LAMMPS_NS { -template class ComputeSNAGridKokkos; -#ifdef LMP_KOKKOS_GPU -template class ComputeSNAGridKokkos; -#endif -} -*/ - diff --git a/src/KOKKOS/compute_sna_grid_kokkos.h b/src/KOKKOS/compute_sna_grid_kokkos.h index 5a81309a4e..8a7d87acbb 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_kokkos.h @@ -29,38 +29,13 @@ ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkosDevice); #include "compute_sna_grid.h" #include "kokkos_type.h" -//#include "pair_snap.h" -//#include "kokkos_type.h" -//#include "neigh_list_kokkos.h" #include "sna_kokkos.h" -//#include "pair_kokkos.h" namespace LAMMPS_NS { // Routines for both the CPU and GPU backend -//template -//struct TagPairSNAPComputeForce{}; - // GPU backend only -/* -struct TagPairSNAPComputeNeigh{}; -struct TagPairSNAPComputeCayleyKlein{}; -struct TagPairSNAPPreUi{}; -struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence -struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence -struct 
TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagPairSNAPComputeZi{}; -struct TagPairSNAPBeta{}; -struct TagPairSNAPComputeBi{}; -struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeYiWithZlist{}; -template -struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence -template -struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence -*/ -//struct TagPairSNAPPreUi{}; struct TagCSNAGridComputeNeigh{}; struct TagCSNAGridComputeCayleyKlein{}; struct TagCSNAGridPreUi{}; @@ -70,26 +45,11 @@ struct TagCSNAGridTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero yl template struct TagCSNAGridComputeZi{}; template struct TagCSNAGridComputeBi{}; struct TagCSNAGridLocalFill{}; // fill the gridlocal array -//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce struct TagComputeSNAGridLoop{}; struct TagComputeSNAGrid3D{}; -//struct TagCSNAGridTeam{}; // CPU backend only -/* -struct TagPairSNAPComputeNeighCPU{}; -struct TagPairSNAPPreUiCPU{}; -struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPTransformUiCPU{}; -struct TagPairSNAPComputeZiCPU{}; -struct TagPairSNAPBetaCPU{}; -struct TagPairSNAPComputeBiCPU{}; -struct TagPairSNAPZeroYiCPU{}; -struct TagPairSNAPComputeYiCPU{}; -struct TagPairSNAPComputeDuidrjCPU{}; -struct TagPairSNAPComputeDeidrjCPU{}; -*/ struct TagComputeSNAGridLoopCPU{}; //template @@ -180,7 +140,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { // operator function for example team policy //KOKKOS_INLINE_FUNCTION - //void operator() (TagCSNAGridTeam, const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagComputeSNAGridLoop, const int& ) const; @@ -191,9 +150,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; - // PrintNeigh - //void 
operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; - // 3D case - used by parallel_for KOKKOS_INLINE_FUNCTION void operator()(TagComputeSNAGrid3D, const int& iz, const int& iy, const int& ix) const; @@ -294,11 +250,6 @@ class ComputeSNAGridKokkos : public ComputeSNAGrid { class DomainKokkos *domainKK; // triclinic vars - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ double h0, h1, h2, h3, h4, h5; double lo0, lo1, lo2; @@ -344,45 +295,3 @@ class ComputeSNAGridKokkosHost : public ComputeSNAGridKokkos); -ComputeStyle(sna/grid/kk/device,ComputeSNAGridKokkos); -ComputeStyle(sna/grid/kk/host,ComputeSNAGridKokkos); -// clang-format on -#else - -// clang-format off -#ifndef LMP_COMPUTE_SNA_GRID_KOKKOS_H -#define LMP_COMPUTE_SNA_GRID_KOKKOS_H - -#include "compute_sna_grid.h" -#include "kokkos_type.h" - -namespace LAMMPS_NS { - -//template -//struct TagComputeCoordAtom{}; - -template -class ComputeSNAGridKokkos : public ComputeSNAGrid { - public: - typedef DeviceType device_type; - typedef ArrayTypes AT; - - ComputeSNAGridKokkos(class LAMMPS *, int, char **); - ~ComputeSNAGridKokkos() override; - - private: - -}; - -} - -#endif -#endif -*/ - diff --git a/src/KOKKOS/compute_sna_grid_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_kokkos_impl.h index 432dbe9f98..665a1b67e7 100644 --- a/src/KOKKOS/compute_sna_grid_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_kokkos_impl.h @@ -27,7 +27,6 @@ #include "neigh_list.h" #include "neigh_request.h" #include "neighbor_kokkos.h" -//#include "sna_kokkos.h" #include "domain.h" #include "domain_kokkos.h" #include "sna.h" @@ -131,14 +130,10 @@ ComputeSNAGridKokkos::ComputeSNAGridKokkos template ComputeSNAGridKokkos::~ComputeSNAGridKokkos() { - //printf(">>> ComputeSNAGridKokkos destruct begin copymode %d\n", 
copymode); if (copymode) return; - //printf(">>> After copymode\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); - //memoryKK->destroy_kokkos(k_grid,grid); memoryKK->destroy_kokkos(k_gridall, gridall); - //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } // Setup @@ -161,7 +156,6 @@ void ComputeSNAGridKokkos::setup() array = gridall; d_gridlocal = k_gridlocal.template view(); - //d_grid = k_grid.template view(); d_gridall = k_gridall.template view(); } @@ -199,23 +193,14 @@ void ComputeSNAGridKokkos::compute_array() // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. - //printf(">>> total_range: %d\n", total_range); chunk_size = MIN(chunksize, total_range); chunk_offset = 0; - //snaKK.grow_rij(chunk_size, ntotal); snaKK.grow_rij(chunk_size, max_neighs); - //chunk_size = total_range; - // Pre-compute ceil(chunk_size / vector_length) for code cleanliness const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; if (triclinic) { - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ h0 = domain->h[0]; h1 = domain->h[1]; h2 = domain->h[2]; @@ -232,7 +217,6 @@ void ComputeSNAGridKokkos::compute_array() if (chunk_size > total_range - chunk_offset) chunk_size = total_range - chunk_offset; - //printf(">>> chunk_offset: %d\n", chunk_offset); //ComputeNeigh { @@ -333,9 +317,6 @@ void ComputeSNAGridKokkos::compute_array() k_gridlocal.template modify(); k_gridlocal.template sync(); - //k_grid.template modify(); - //k_grid.template sync(); - k_gridall.template modify(); k_gridall.template sync(); } @@ -396,7 +377,6 @@ void ComputeSNAGridKokkos::operator() (Tag // index ii already captures the proper grid point //int igrid = iz * (nx * ny) + iy * nx + ix; - 
//printf("%d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before // multiply grid integers by grid spacing delx, dely, delz @@ -414,11 +394,6 @@ void ComputeSNAGridKokkos::operator() (Tag // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; xgrid[2] = h2*xgrid[2] + lo2; @@ -436,14 +411,6 @@ void ComputeSNAGridKokkos::operator() (Tag if (chemflag) ielem = d_map[itype]; //const double radi = d_radelem[ielem]; - // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. - // The purpose here is to transform for triclinic boxes. - /* - if (triclinic){ - printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); - } - */ - // Compute the number of neighbors, store rsq int ninside = 0; @@ -464,29 +431,6 @@ void ComputeSNAGridKokkos::operator() (Tag ninside++; } - /* - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), - [&] (const int j, int& count) { - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - - int jtype = type(j); - const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - - // don't include atoms that share location with grid point - if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { - jtype = -1; // use -1 to signal it's outside the radius - } - - type_cache[j] = jtype; - - if (jtype >= 0) - count++; - - }, ninside); - */ - d_ninside(ii) = ninside; // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. 
@@ -521,75 +465,6 @@ void ComputeSNAGridKokkos::operator() (Tag offset++; } } - - /* - int offset = 0; - for (int j = 0; j < ntotal; j++){ - const int jtype = type_cache[j]; - if (jtype >= 0) { - printf(">>> offset: %d\n", offset); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - int jtype = type(j); - int jelem = 0; - if (chemflag) jelem = d_map[jtype]; - snaKK.rij(ii,offset,0) = static_cast(dx); - snaKK.rij(ii,offset,1) = static_cast(dy); - snaKK.rij(ii,offset,2) = static_cast(dz); - // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp - // actually since the views here have values starting at 0, let's use jelem - snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); - snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - snaKK.inside(ii,offset) = j; - if (switchinnerflag) { - snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); - } - if (chemflag) - snaKK.element(ii,offset) = jelem; - else - snaKK.element(ii,offset) = 0; - offset++; - } - } - */ - - /* - Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), - [&] (const int j, int& offset, bool final) { - - const int jtype = type_cache[j]; - - if (jtype >= 0) { - if (final) { - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - int jtype = type(j); - int jelem = 0; - if (chemflag) jelem = d_map[jtype]; - snaKK.rij(ii,offset,0) = static_cast(dx); - snaKK.rij(ii,offset,1) = static_cast(dy); - snaKK.rij(ii,offset,2) = static_cast(dz); - // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp - // actually since the views here have values starting at 0, let's use jelem - snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); - snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - snaKK.inside(ii,offset) = j; - if 
(switchinnerflag) { - snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); - } - if (chemflag) - snaKK.element(ii,offset) = jelem; - else - snaKK.element(ii,offset) = 0; - } - offset++; - } - }); - */ } /* ---------------------------------------------------------------------- @@ -821,11 +696,6 @@ void ComputeSNAGridKokkos::operator() (Tag // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; xgrid[2] = h2*xgrid[2] + lo2; diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp index 087dbc5fd5..3835a56bf8 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.cpp +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.cpp @@ -23,59 +23,3 @@ template class ComputeSNAGridLocalKokkosHost; #endif } - - - - -// The following chunk will compile but we're gonna try a wrapper approach like pair snap. 
-/* -#include "compute_sna_grid_kokkos.h" - -#include "atom_kokkos.h" -#include "atom_masks.h" -#include "comm.h" -#include "error.h" -#include "memory_kokkos.h" -#include "modify.h" -#include "neigh_list.h" -#include "neigh_request.h" -#include "neighbor_kokkos.h" -#include "sna_kokkos.h" -#include "update.h" - -using namespace LAMMPS_NS; - -// ---------------------------------------------------------------------- - -template -ComputeSNAGridKokkos::ComputeSNAGridKokkos(LAMMPS *lmp, int narg, char **arg) : - ComputeSNAGrid(lmp, narg, arg) -{ - - printf("^^^ inside ComputeSNAGridKokkos constructor\n"); - kokkosable = 1; - atomKK = (AtomKokkos *) atom; - execution_space = ExecutionSpaceFromDevice::space; - datamask_read = EMPTY_MASK; - datamask_modify = EMPTY_MASK; - -} - -// ---------------------------------------------------------------------- - -template -ComputeSNAGridKokkos::~ComputeSNAGridKokkos() -{ - if (copymode) return; - - -} - -namespace LAMMPS_NS { -template class ComputeSNAGridKokkos; -#ifdef LMP_KOKKOS_GPU -template class ComputeSNAGridKokkos; -#endif -} -*/ - diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos.h b/src/KOKKOS/compute_sna_grid_local_kokkos.h index 754d4e36af..2ffc050b2d 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos.h @@ -29,38 +29,13 @@ ComputeStyle(sna/grid/local/kk/host,ComputeSNAGridLocalKokkosDevice #include "compute_sna_grid_local.h" #include "kokkos_type.h" -//#include "pair_snap.h" -//#include "kokkos_type.h" -//#include "neigh_list_kokkos.h" #include "sna_kokkos.h" -//#include "pair_kokkos.h" namespace LAMMPS_NS { // Routines for both the CPU and GPU backend -//template -//struct TagPairSNAPComputeForce{}; - // GPU backend only -/* -struct TagPairSNAPComputeNeigh{}; -struct TagPairSNAPComputeCayleyKlein{}; -struct TagPairSNAPPreUi{}; -struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence -struct TagPairSNAPComputeUiLarge{}; // less parallelism, no 
divergence -struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagPairSNAPComputeZi{}; -struct TagPairSNAPBeta{}; -struct TagPairSNAPComputeBi{}; -struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeYiWithZlist{}; -template -struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence -template -struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence -*/ -//struct TagPairSNAPPreUi{}; struct TagCSNAGridLocalComputeNeigh{}; struct TagCSNAGridLocalComputeCayleyKlein{}; struct TagCSNAGridLocalPreUi{}; @@ -70,25 +45,11 @@ struct TagCSNAGridLocalTransformUi{}; // re-order ulisttot from SoA to AoSoA, ze template struct TagCSNAGridLocalComputeZi{}; template struct TagCSNAGridLocalComputeBi{}; struct TagCSNAGridLocal2Fill{}; // fill the gridlocal array -//struct TagCSNAGridLocalFill2{}; // fill the gridlocal array using same kinda loop as ComputeForce struct TagComputeSNAGridLocalLoop{}; struct TagComputeSNAGridLocal3D{}; // CPU backend only -/* -struct TagPairSNAPComputeNeighCPU{}; -struct TagPairSNAPPreUiCPU{}; -struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPTransformUiCPU{}; -struct TagPairSNAPComputeZiCPU{}; -struct TagPairSNAPBetaCPU{}; -struct TagPairSNAPComputeBiCPU{}; -struct TagPairSNAPZeroYiCPU{}; -struct TagPairSNAPComputeYiCPU{}; -struct TagPairSNAPComputeDuidrjCPU{}; -struct TagPairSNAPComputeDeidrjCPU{}; -*/ struct TagComputeSNAGridLocalLoopCPU{}; //template @@ -184,9 +145,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { KOKKOS_INLINE_FUNCTION void operator() (TagCSNAGridLocalComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; - // PrintNeigh - //void operator() (TagPrintNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; - // 3D case - used by parallel_for KOKKOS_INLINE_FUNCTION void operator()(TagComputeSNAGridLocal3D, const int& iz, const int& iy, const int& ix) const; @@ -274,16 +232,6 @@ class 
ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { DAT::tdual_float_2d k_alocal; typename AT::t_float_2d d_alocal; - /* - DAT::tdual_float_2d k_grid; - DAT::tdual_float_2d k_gridall; - typename AT::t_float_2d d_grid; - typename AT::t_float_2d d_gridall; - - DAT::tdual_float_4d k_gridlocal; - typename AT::t_float_4d d_gridlocal; - */ - // Utility routine which wraps computing per-team scratch size requirements for // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj @@ -293,11 +241,6 @@ class ComputeSNAGridLocalKokkos : public ComputeSNAGridLocal { class DomainKokkos *domainKK; // triclinic vars - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ double h0, h1, h2, h3, h4, h5; double lo0, lo1, lo2; @@ -320,7 +263,6 @@ class ComputeSNAGridLocalKokkosDevice : public ComputeSNAGridLocalKokkos::ComputeSNAGridL template ComputeSNAGridLocalKokkos::~ComputeSNAGridLocalKokkos() { - //printf(">>> ComputeSNAGridLocalKokkos destruct begin copymode %d\n", copymode); if (copymode) return; - //printf(">>> After copymode\n"); memoryKK->destroy_kokkos(k_cutsq,cutsq); memoryKK->destroy_kokkos(k_alocal,alocal); - //memoryKK->destroy_kokkos(k_grid,grid); - //memoryKK->destroy_kokkos(k_gridall, gridall); - //memoryKK->destroy_kokkos(k_gridlocal, gridlocal); } // Setup @@ -148,28 +142,11 @@ template void ComputeSNAGridLocalKokkos::setup() { - // Do not call ComputeGrid::setup(), we don't wanna allocate the grid array there. - // Instead, call ComputeGrid::set_grid_global and set_grid_local to set the n indices. 
- - //ComputeGrid::set_grid_global(); - //ComputeGrid::set_grid_local(); - //ComputeSNAGridLocal::setup(); ComputeGridLocal::setup(); // allocate arrays - //memoryKK->create_kokkos(k_gridall, gridall, size_array_rows, size_array_cols, "grid:gridall"); memoryKK->create_kokkos(k_alocal, alocal, size_local_rows, size_local_cols, "grid:alocal"); - - // do not use or allocate gridlocal for now - - //gridlocal_allocated = 0; - //array = gridall; - array_local = alocal; - - //d_gridlocal = k_gridlocal.template view(); - //d_grid = k_grid.template view(); - //d_gridall = k_gridall.template view(); d_alocal = k_alocal.template view(); } @@ -183,8 +160,6 @@ void ComputeSNAGridLocalKokkos::compute_lo return; } - //printf(">>> ComputeSNAGridLocalKokkos::compute_local begin\n"); - copymode = 1; zlen = nzhi-nzlo+1; @@ -205,12 +180,10 @@ void ComputeSNAGridLocalKokkos::compute_lo ntotal = atomKK->nlocal + atomKK->nghost; // Allocate view for number of neighbors per grid point - //printf(">>> total_range: %d\n", total_range); MemKK::realloc_kokkos(d_ninside,"ComputeSNAGridLocalKokkos:ninside",total_range); // "chunksize" variable is default 32768 in compute_sna_grid.cpp, and set by user // `total_range` is the number of grid points which may be larger than chunk size. 
- //printf(">>> total_range: %d\n", total_range); chunk_size = MIN(chunksize, total_range); chunk_offset = 0; //snaKK.grow_rij(chunk_size, ntotal); @@ -222,11 +195,6 @@ void ComputeSNAGridLocalKokkos::compute_lo const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; if (triclinic) { - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ h0 = domain->h[0]; h1 = domain->h[1]; h2 = domain->h[2]; @@ -243,7 +211,6 @@ void ComputeSNAGridLocalKokkos::compute_lo if (chunk_size > total_range - chunk_offset) chunk_size = total_range - chunk_offset; - //printf(">>> chunk_offset: %d\n", chunk_offset); //ComputeNeigh { @@ -401,7 +368,6 @@ void ComputeSNAGridLocalKokkos::operator() // index ii already captures the proper grid point //int igrid = iz * (nx * ny) + iy * nx + ix; - //printf("%d %d\n", ii, igrid); // grid2x converts igrid to ix,iy,iz like we've done before // multiply grid integers by grid spacing delx, dely, delz @@ -419,11 +385,6 @@ void ComputeSNAGridLocalKokkos::operator() // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. 
- /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; xgrid[2] = h2*xgrid[2] + lo2; @@ -454,14 +415,6 @@ void ComputeSNAGridLocalKokkos::operator() if (chemflag) ielem = d_map[itype]; //const double radi = d_radelem[ielem]; - // We need a DomainKokkos::lamda2x parallel for loop here, but let's ignore for now. - // The purpose here is to transform for triclinic boxes. - /* - if (triclinic){ - printf("We are triclinic %f %f %f\n", xtmp, ytmp, ztmp); - } - */ - // Compute the number of neighbors, store rsq int ninside = 0; @@ -482,29 +435,6 @@ void ComputeSNAGridLocalKokkos::operator() ninside++; } - /* - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,ntotal), - [&] (const int j, int& count) { - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - - int jtype = type(j); - const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - - // don't include atoms that share location with grid point - if (rsq >= rnd_cutsq(itype,jtype) || rsq < 1e-20) { - jtype = -1; // use -1 to signal it's outside the radius - } - - type_cache[j] = jtype; - - if (jtype >= 0) - count++; - - }, ninside); - */ - d_ninside(ii) = ninside; // TODO: Adjust for multi-element, currently we set jelem = 0 regardless of type. 
@@ -539,75 +469,6 @@ void ComputeSNAGridLocalKokkos::operator() offset++; } } - - /* - int offset = 0; - for (int j = 0; j < ntotal; j++){ - const int jtype = type_cache[j]; - if (jtype >= 0) { - printf(">>> offset: %d\n", offset); - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - int jtype = type(j); - int jelem = 0; - if (chemflag) jelem = d_map[jtype]; - snaKK.rij(ii,offset,0) = static_cast(dx); - snaKK.rij(ii,offset,1) = static_cast(dy); - snaKK.rij(ii,offset,2) = static_cast(dz); - // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp - // actually since the views here have values starting at 0, let's use jelem - snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); - snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - snaKK.inside(ii,offset) = j; - if (switchinnerflag) { - snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); - } - if (chemflag) - snaKK.element(ii,offset) = jelem; - else - snaKK.element(ii,offset) = 0; - offset++; - } - } - */ - - /* - Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,ntotal), - [&] (const int j, int& offset, bool final) { - - const int jtype = type_cache[j]; - - if (jtype >= 0) { - if (final) { - const F_FLOAT dx = x(j,0) - xtmp; - const F_FLOAT dy = x(j,1) - ytmp; - const F_FLOAT dz = x(j,2) - ztmp; - int jtype = type(j); - int jelem = 0; - if (chemflag) jelem = d_map[jtype]; - snaKK.rij(ii,offset,0) = static_cast(dx); - snaKK.rij(ii,offset,1) = static_cast(dy); - snaKK.rij(ii,offset,2) = static_cast(dz); - // pair snap uses jelem here, but we use jtype, see compute_sna_grid.cpp - // actually since the views here have values starting at 0, let's use jelem - snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); - snaKK.rcutij(ii,offset) = static_cast((2.0 * d_radelem[jelem])*rcutfac); - snaKK.inside(ii,offset) = j; - if 
(switchinnerflag) { - snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); - } - if (chemflag) - snaKK.element(ii,offset) = jelem; - else - snaKK.element(ii,offset) = 0; - } - offset++; - } - }); - */ } /* ---------------------------------------------------------------------- @@ -839,22 +700,11 @@ void ComputeSNAGridLocalKokkos::operator() // Because calling a __host__ function("lamda2x") from a __host__ __device__ function("operator()") is not allowed // Using domainKK-> gives segfault, use domain-> instead since we're just accessing floats. - /* - xgrid[0] = domain->h[0]*xgrid[0] + domain->h[5]*xgrid[1] + domain->h[4]*xgrid[2] + domain->boxlo[0]; - xgrid[1] = domain->h[1]*xgrid[1] + domain->h[3]*xgrid[2] + domain->boxlo[1]; - xgrid[2] = domain->h[2]*xgrid[2] + domain->boxlo[2]; - */ xgrid[0] = h0*xgrid[0] + h5*xgrid[1] + h4*xgrid[2] + lo0; xgrid[1] = h1*xgrid[1] + h3*xgrid[2] + lo1; xgrid[2] = h2*xgrid[2] + lo2; } - //const F_FLOAT xtmp = xgrid[0]; - //const F_FLOAT ytmp = xgrid[1]; - //const F_FLOAT ztmp = xgrid[2]; - //d_gridall(igrid,0) = xtmp; - //d_gridall(igrid,1) = ytmp; - //d_gridall(igrid,2) = ztmp; const auto idxb_max = snaKK.idxb_max; diff --git a/src/ML-SNAP/compute_gaussian_grid_local.cpp b/src/ML-SNAP/compute_gaussian_grid_local.cpp index 81286f9d81..8a747a7908 100644 --- a/src/ML-SNAP/compute_gaussian_grid_local.cpp +++ b/src/ML-SNAP/compute_gaussian_grid_local.cpp @@ -89,14 +89,12 @@ ComputeGaussianGridLocal::ComputeGaussianGridLocal(LAMMPS *lmp, int narg, char * ComputeGaussianGridLocal::~ComputeGaussianGridLocal() { - //printf(">>> ComputeGaussianGridLocal begin destruct copymode %d\n", copymode); if (copymode) return; memory->destroy(radelem); memory->destroy(sigmaelem); memory->destroy(prefacelem); memory->destroy(argfacelem); memory->destroy(cutsq); - //printf(">>> ComputeGaussianGridLocal end destruct\n"); } /* 
---------------------------------------------------------------------- */ @@ -111,8 +109,6 @@ void ComputeGaussianGridLocal::init() void ComputeGaussianGridLocal::compute_local() { - //printf(">>> compute_local CPU\n"); - //printf(">>> size_local_cols_base, size_local_cols: %d %d\n", size_local_cols_base, size_local_cols); invoked_local = update->ntimestep; // compute gaussian for each gridpoint diff --git a/src/ML-SNAP/compute_grid.cpp b/src/ML-SNAP/compute_grid.cpp index dce2ab0283..12135c705d 100644 --- a/src/ML-SNAP/compute_grid.cpp +++ b/src/ML-SNAP/compute_grid.cpp @@ -88,7 +88,6 @@ void ComputeGrid::grid2x(int igrid, double *x) x[2] = iz * delz; if (triclinic) domain->lamda2x(x, x); - //printf(">>>>> ComputeGrid::grid2x\n"); } /* ---------------------------------------------------------------------- @@ -104,7 +103,6 @@ void ComputeGrid::assign_coords_all() gridall[igrid][1] = x[1]; gridall[igrid][2] = x[2]; } - //printf(">>>>> ComputeGrid::assign_coords_all\n"); } /* ---------------------------------------------------------------------- @@ -113,7 +111,6 @@ void ComputeGrid::assign_coords_all() void ComputeGrid::allocate() { - //printf(">>> ComputeGrid::allocate\n"); // allocate arrays memory->create(grid, size_array_rows, size_array_cols, "grid:grid"); memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall"); diff --git a/src/ML-SNAP/compute_grid_local.cpp b/src/ML-SNAP/compute_grid_local.cpp index 92bb556c50..80feb75be5 100644 --- a/src/ML-SNAP/compute_grid_local.cpp +++ b/src/ML-SNAP/compute_grid_local.cpp @@ -61,9 +61,7 @@ ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) : ComputeGridLocal::~ComputeGridLocal() { - //printf(">>> ComputeGridLocal begin destruct\n"); deallocate(); - //printf(">>> ComputeGridLocal end destruct\n"); } /* ---------------------------------------------------------------------- */ @@ -75,7 +73,6 @@ void ComputeGridLocal::setup() set_grid_local(); allocate(); assign_coords(); - //printf(">>> 
ComputeGridLocal setup nx ny nz %d %d %d %d %d %d\n", nxlo, nxhi, nylo, nyhi, nzlo, nzhi); } /* ---------------------------------------------------------------------- @@ -109,7 +106,6 @@ void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x) void ComputeGridLocal::allocate() { - //printf(">>> ComputeGridLocal::allocate %d %d\n", size_local_rows, size_local_cols); if (nxlo <= nxhi && nylo <= nyhi && nzlo <= nzhi) { gridlocal_allocated = 1; memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal"); @@ -123,14 +119,12 @@ void ComputeGridLocal::allocate() void ComputeGridLocal::deallocate() { - //printf(">>> ComputeGridLocal::deallocate begin gridlocal_allocated %d copymode %d\n", gridlocal_allocated, copymode); if (copymode) return; if (gridlocal_allocated) { gridlocal_allocated = 0; memory->destroy(alocal); } - //printf(">>> ComputeGridLocal:: deallocate end\n"); array_local = nullptr; } @@ -186,8 +180,6 @@ void ComputeGridLocal::set_grid_local() // the 2 equality if tests ensure a consistent decision // as to which proc owns it - //printf(">>> ComputeGridLocal set_grid_local\n"); - double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi; if (comm->layout != Comm::LAYOUT_TILED) { From c0be84356ebc49cdde896db41c704758e0486077 Mon Sep 17 00:00:00 2001 From: Lenz Fiedler Date: Wed, 22 Jan 2025 08:13:36 +0100 Subject: [PATCH 50/51] Removed debugging output, added contributor, added files to Install.sh --- src/KOKKOS/Install.sh | 1 + src/KOKKOS/compute_sna_grid_local_kokkos_impl.h | 3 +-- src/KOKKOS/pair_mliap_kokkos.cpp | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 3da88f2fc4..efbf7bfaff 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -125,6 +125,7 @@ action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp action compute_sna_grid_local_kokkos.h 
compute_sna_grid_local.h action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp +action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local_kokkos.cpp action compute_temp_deform_kokkos.cpp action compute_temp_deform_kokkos.h action compute_temp_kokkos.cpp diff --git a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h index 734706d2a3..01bb2b427b 100644 --- a/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h +++ b/src/KOKKOS/compute_sna_grid_local_kokkos_impl.h @@ -11,8 +11,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Christian Trott (SNL), Stan Moore (SNL), - Evan Weinberg (NVIDIA) + Contributing authors: Andrew Rohskopf (SNL) ------------------------------------------------------------------------- */ #include "compute_sna_grid_local_kokkos.h" diff --git a/src/KOKKOS/pair_mliap_kokkos.cpp b/src/KOKKOS/pair_mliap_kokkos.cpp index 8b9305d48c..6c98399416 100644 --- a/src/KOKKOS/pair_mliap_kokkos.cpp +++ b/src/KOKKOS/pair_mliap_kokkos.cpp @@ -233,7 +233,6 @@ void PairMLIAPKokkos::coeff(int narg, char **arg) { // map[i] = which element the Ith atom type is, -1 if not mapped // map[0] is not used - //printf(">>> ntypes: %d\n", atom->ntypes); for (int i = 1; i <= atom->ntypes; i++) { char* elemname = elemtypes[i-1]; int jelem; From 6b4ecfd719af983e65910dd023c65d403f7ad846 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 22 Jan 2025 15:33:09 -0700 Subject: [PATCH 51/51] Fix issues with GNU Make build --- src/.gitignore | 2 ++ src/KOKKOS/Install.sh | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/.gitignore b/src/.gitignore index c1f6b6e892..45f7a9f1a0 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -252,6 +252,8 @@ /*rheo*.cpp /*rheo*.h +/compute_gaussian_grid_local.cpp +/compute_gaussian_grid_local.h /compute_grid.cpp /compute_grid.h 
/compute_grid_local.cpp diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 191be47ff6..d34d5eb9ee 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -125,7 +125,8 @@ action compute_sna_grid_kokkos_impl.h compute_sna_grid.cpp action compute_sna_grid_local_kokkos.cpp compute_sna_grid_local.cpp action compute_sna_grid_local_kokkos.h compute_sna_grid_local.h action compute_sna_grid_local_kokkos_impl.h compute_sna_grid_local.cpp -action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local_kokkos.cpp +action compute_gaussian_grid_local_kokkos.cpp compute_gaussian_grid_local.cpp +action compute_gaussian_grid_local_kokkos.h compute_gaussian_grid_local.h action compute_temp_deform_kokkos.cpp action compute_temp_deform_kokkos.h action compute_temp_kokkos.cpp