diff --git a/doc/src/Speed_kokkos.rst b/doc/src/Speed_kokkos.rst index 569a24f1c2..8161e69a1c 100644 --- a/doc/src/Speed_kokkos.rst +++ b/doc/src/Speed_kokkos.rst @@ -285,6 +285,16 @@ one or more nodes, each with two GPUs: settings. Experimenting with its options can provide a speed-up for specific calculations. For example: +.. note:: + + The default binsize for :doc:`atom sorting ` on GPUs + is equal to the default CPU neighbor binsize (i.e. 2x smaller than the + default GPU neighbor binsize). When running simple pair-wise + potentials like Lennard Jones on GPUs, using a 2x larger binsize for + atom sorting (equal to the default GPU neighbor binsize) and a more + frequent sorting than default (e.g. sorting every 100 time steps + instead of 1000) may improve performance. + .. code-block:: bash mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj # Newton on, half neighbor list, set binsize = neighbor ghost cutoff diff --git a/doc/src/atom_modify.rst b/doc/src/atom_modify.rst index 9049a24fde..1e5a3d49ff 100644 --- a/doc/src/atom_modify.rst +++ b/doc/src/atom_modify.rst @@ -153,6 +153,13 @@ cache locality will be undermined. order of atoms in a :doc:`dump ` file will also typically change if sorting is enabled. +.. note:: + + When running simple pair-wise potentials like Lennard Jones on GPUs + with the KOKKOS package, using a larger binsize (e.g. 2x larger than + default) and a more frequent reordering than default (e.g. every 100 + time steps) may improve performance. + Restrictions """""""""""" diff --git a/doc/src/package.rst b/doc/src/package.rst index 76bf20a97f..14d978c3c9 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -71,7 +71,7 @@ Syntax *no_affinity* values = none *kokkos* args = keyword value ... zero or more keyword/value pairs may be appended - keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *gpu/aware* or *pair/only* + keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *sort* or *gpu/aware* or *pair/only* *neigh* value = *full* or *half* full = full neighbor list half = half neighbor list built in thread-safe manner @@ -102,6 +102,9 @@ Syntax *comm/pair/reverse* value = *no* or *device* *no* = perform communication pack/unpack in non-KOKKOS mode *device* = perform pack/unpack on device (e.g. on GPU) + *sort* value = *no* or *device* + *no* = perform atom sorting in non-KOKKOS mode + *device* = perform atom sorting on device (e.g. on GPU) *gpu/aware* = *off* or *on* *off* = do not use GPU-aware MPI *on* = use GPU-aware MPI (default) @@ -554,6 +557,17 @@ pack/unpack communicated data. When running small systems on a GPU, performing the exchange pack/unpack on the host CPU can give speedup since it reduces the number of CUDA kernel launches. +The *sort* keyword determines whether the host or device performs atom +sorting, see the :doc:`atom_modify sort ` command. The +value options for the *sort* keyword are *no* or *device* similar to the +*comm* keywords above. If a value of *host* is used it will be +automatically be changed to *no* since the *sort* keyword doesn't +support *host* mode. The value of *no* will also always be used when +running on the CPU, i.e. setting the value to *device* will have no +effect if the simulation is running on the CPU. Not all fix styles with +extra atom data support *device* mode and in that case a warning will be +given and atom sorting will run in *no* mode instead. + The *gpu/aware* keyword chooses whether GPU-aware MPI will be used. When this keyword is set to *on*, buffers in GPU memory are passed directly through MPI send/receive calls. This reduces overhead of first copying @@ -705,12 +719,12 @@ script or via the "-pk intel" :doc:`command-line switch `. For the KOKKOS package, the option defaults for GPUs are neigh = full, neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default -value, comm = device, neigh/transpose = off, gpu/aware = on. When -LAMMPS can safely detect that GPU-aware MPI is not available, the -default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the -option defaults are neigh = half, neigh/qeq = half, newton = on, -binsize = 0.0, and comm = no. The option neigh/thread = on when there -are 16K atoms or less on an MPI rank, otherwise it is "off". These +value, comm = device, sort = device, neigh/transpose = off, gpu/aware = +on. When LAMMPS can safely detect that GPU-aware MPI is not available, +the default value of gpu/aware becomes "off". For CPUs or Xeon Phis, the +option defaults are neigh = half, neigh/qeq = half, newton = on, binsize += 0.0, comm = no, and sort = no. The option neigh/thread = on when +there are 16K atoms or less on an MPI rank, otherwise it is "off". These settings are made automatically by the required "-k on" :doc:`command-line switch `. You can change them by using the package kokkos command in your input script or via the :doc:`-pk diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp index 0cc682eda9..f9fa31bf2a 100644 --- a/src/KOKKOS/atom_kokkos.cpp +++ b/src/KOKKOS/atom_kokkos.cpp @@ -22,6 +22,11 @@ #include "kokkos.h" #include "memory_kokkos.h" #include "update.h" +#include "kokkos_base.h" +#include "modify.h" +#include "fix.h" + +#include using namespace LAMMPS_NS; @@ -103,6 +108,15 @@ AtomKokkos::~AtomKokkos() /* ---------------------------------------------------------------------- */ +void AtomKokkos::init() +{ + Atom::init(); + + sort_classic = lmp->kokkos->sort_classic; +} + +/* ---------------------------------------------------------------------- */ + void AtomKokkos::sync(const ExecutionSpace space, unsigned int mask) { if (space == Device && lmp->kokkos->auto_sync) avecKK->modified(Host, mask); @@ -140,8 +154,37 @@ void AtomKokkos::allocate_type_arrays() void AtomKokkos::sort() { - int i, m, n, ix, iy, iz, ibin, empty; + // check if all fixes with atom-based arrays support sort on device + if (!sort_classic) { + int flag = 1; + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) { + auto fix_iextra = modify->fix[atom->extra_grow[iextra]]; + if (!fix_iextra->sort_device) { + flag = 0; + break; + } + } + if (!flag) { + if (comm->me == 0) { + error->warning(FLERR,"Fix with atom-based arrays not compatible with Kokkos sorting on device, " + "switching to classic host sorting"); + } + sort_classic = true; + } + } + + if (sort_classic) { + sync(Host, ALL_MASK); + Atom::sort(); + modified(Host, ALL_MASK); + } else sort_device(); +} + +/* ---------------------------------------------------------------------- */ + +void AtomKokkos::sort_device() +{ // set next timestep for sorting to take place nextsort = (update->ntimestep / sortfreq) * sortfreq + sortfreq; @@ -151,94 +194,37 @@ void AtomKokkos::sort() if (domain->box_change) setup_sort_bins(); if (nbins == 1) return; - // reallocate per-atom vectors if needed - - if (atom->nmax > maxnext) { - memory->destroy(next); - memory->destroy(permute); - maxnext = atom->nmax; - memory->create(next, maxnext, "atom:next"); - memory->create(permute, maxnext, "atom:permute"); - } - - // ensure there is one extra atom location at end of arrays for swaps - - if (nlocal == nmax) avec->grow(0); - // for triclinic, atoms must be in box coords (not lamda) to match bbox if (domain->triclinic) domain->lamda2x(nlocal); - sync(Host, ALL_MASK); + auto d_x = k_x.d_view; + sync(Device, X_MASK); - // bin atoms in reverse order so linked list will be in forward order + // sort - for (i = 0; i < nbins; i++) binhead[i] = -1; + int max_bins[3]; + max_bins[0] = nbinx; + max_bins[1] = nbiny; + max_bins[2] = nbinz; - HAT::t_x_array_const h_x = k_x.view(); - for (i = nlocal - 1; i >= 0; i--) { - ix = static_cast((h_x(i, 0) - bboxlo[0]) * bininvx); - iy = static_cast((h_x(i, 1) - bboxlo[1]) * bininvy); - iz = static_cast((h_x(i, 2) - bboxlo[2]) * bininvz); - ix = MAX(ix, 0); - iy = MAX(iy, 0); - iz = MAX(iz, 0); - ix = MIN(ix, nbinx - 1); - iy = MIN(iy, nbiny - 1); - iz = MIN(iz, nbinz - 1); - ibin = iz * nbiny * nbinx + iy * nbinx + ix; - next[i] = binhead[ibin]; - binhead[ibin] = i; - } + using KeyViewType = DAT::t_x_array; + using BinOp = BinOp3DLAMMPS; + BinOp binner(max_bins, bboxlo, bboxhi); + Kokkos::BinSort Sorter(d_x, 0, nlocal, binner, false); + Sorter.create_permute_vector(LMPDeviceType()); - // permute = desired permutation of atoms - // permute[I] = J means Ith new atom will be Jth old atom + avecKK->sort_kokkos(Sorter); - n = 0; - for (m = 0; m < nbins; m++) { - i = binhead[m]; - while (i >= 0) { - permute[n++] = i; - i = next[i]; + if (atom->nextra_grow) { + for (int iextra = 0; iextra < atom->nextra_grow; iextra++) { + auto fix_iextra = modify->fix[atom->extra_grow[iextra]]; + KokkosBase *kkbase = dynamic_cast(fix_iextra); + + kkbase->sort_kokkos(Sorter); } } - // current = current permutation, just reuse next vector - // current[I] = J means Ith current atom is Jth old atom - - int *current = next; - for (i = 0; i < nlocal; i++) current[i] = i; - - // reorder local atom list, when done, current = permute - // perform "in place" using copy() to extra atom location at end of list - // inner while loop processes one cycle of the permutation - // copy before inner-loop moves an atom to end of atom list - // copy after inner-loop moves atom at end of list back into list - // empty = location in atom list that is currently empty - - for (i = 0; i < nlocal; i++) { - if (current[i] == permute[i]) continue; - avec->copy(i, nlocal, 0); - empty = i; - while (permute[empty] != i) { - avec->copy(permute[empty], empty, 0); - empty = current[empty] = permute[empty]; - } - avec->copy(nlocal, empty, 0); - current[empty] = permute[empty]; - } - - // sanity check that current = permute - - //int flag = 0; - //for (i = 0; i < nlocal; i++) - // if (current[i] != permute[i]) flag = 1; - //int flagall; - //MPI_Allreduce(&flag,&flagall,1,MPI_INT,MPI_SUM,world); - //if (flagall) errorX->all(FLERR,"Atom sort did not operate correctly"); - - modified(Host, ALL_MASK); - // convert back to lamda coords if (domain->triclinic) domain->x2lamda(nlocal); @@ -250,7 +236,6 @@ void AtomKokkos::sort() void AtomKokkos::grow(unsigned int mask) { - if (mask & SPECIAL_MASK) { memoryKK->destroy_kokkos(k_special, special); sync(Device, mask); diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h index 2dd0198111..d1bb41a7b8 100644 --- a/src/KOKKOS/atom_kokkos.h +++ b/src/KOKKOS/atom_kokkos.h @@ -22,6 +22,8 @@ namespace LAMMPS_NS { class AtomKokkos : public Atom { public: + bool sort_classic; + DAT::tdual_tagint_1d k_tag; DAT::tdual_int_1d k_type, k_mask; DAT::tdual_imageint_1d k_image; @@ -108,6 +110,7 @@ class AtomKokkos : public Atom { return local; } + void init() override; void allocate_type_arrays() override; void sync(const ExecutionSpace space, unsigned int mask); void modified(const ExecutionSpace space, unsigned int mask); @@ -119,6 +122,7 @@ class AtomKokkos : public Atom { virtual void deallocate_topology(); void sync_modify(ExecutionSpace, unsigned int, unsigned int) override; private: + void sort_device(); class AtomVec *new_avec(const std::string &, int, int &) override; }; diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp index f132298c2d..dd6be164c0 100644 --- a/src/KOKKOS/atom_vec_angle_kokkos.cpp +++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp @@ -155,6 +155,35 @@ void AtomVecAngleKokkos::grow_pointers() h_angle_atom3 = atomKK->k_angle_atom3.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecAngleKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_molecule); + Sorter.sort(LMPDeviceType(), d_num_bond); + Sorter.sort(LMPDeviceType(), d_bond_type); + Sorter.sort(LMPDeviceType(), d_bond_atom); + Sorter.sort(LMPDeviceType(), d_nspecial); + Sorter.sort(LMPDeviceType(), d_special); + Sorter.sort(LMPDeviceType(), d_num_angle); + Sorter.sort(LMPDeviceType(), d_angle_type); + Sorter.sort(LMPDeviceType(), d_angle_atom1); + Sorter.sort(LMPDeviceType(), d_angle_atom2); + Sorter.sort(LMPDeviceType(), d_angle_atom3); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_angle_kokkos.h b/src/KOKKOS/atom_vec_angle_kokkos.h index a1c20c103b..44f1d824b2 100644 --- a/src/KOKKOS/atom_vec_angle_kokkos.h +++ b/src/KOKKOS/atom_vec_angle_kokkos.h @@ -34,6 +34,7 @@ class AtomVecAngleKokkos : public AtomVecKokkos, public AtomVecAngle { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, const int & iswap, const DAT::tdual_xfloat_2d &buf, diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp index e37779ace5..1ea8377a68 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp +++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp @@ -100,6 +100,24 @@ void AtomVecAtomicKokkos::grow_pointers() h_f = atomKK->k_f.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecAtomicKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h index f72af73537..07631dda98 100644 --- a/src/KOKKOS/atom_vec_atomic_kokkos.h +++ b/src/KOKKOS/atom_vec_atomic_kokkos.h @@ -35,6 +35,7 @@ class AtomVecAtomicKokkos : public AtomVecKokkos, public AtomVecAtomic { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp index dcbe1876f4..c45bdedf38 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.cpp +++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp @@ -126,6 +126,30 @@ void AtomVecBondKokkos::grow_pointers() h_bond_atom = atomKK->k_bond_atom.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecBondKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_molecule); + Sorter.sort(LMPDeviceType(), d_num_bond); + Sorter.sort(LMPDeviceType(), d_bond_type); + Sorter.sort(LMPDeviceType(), d_bond_atom); + Sorter.sort(LMPDeviceType(), d_nspecial); + Sorter.sort(LMPDeviceType(), d_special); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h index fc3f02e916..5ed59432de 100644 --- a/src/KOKKOS/atom_vec_bond_kokkos.h +++ b/src/KOKKOS/atom_vec_bond_kokkos.h @@ -34,6 +34,7 @@ class AtomVecBondKokkos : public AtomVecKokkos, public AtomVecBond { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp index a9975c1bb4..22fc63ff91 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.cpp +++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp @@ -106,6 +106,25 @@ void AtomVecChargeKokkos::grow_pointers() h_q = atomKK->k_q.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecChargeKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_q); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h index 072b5e6894..397a5ee4c0 100644 --- a/src/KOKKOS/atom_vec_charge_kokkos.h +++ b/src/KOKKOS/atom_vec_charge_kokkos.h @@ -35,6 +35,7 @@ class AtomVecChargeKokkos : public AtomVecKokkos, public AtomVecCharge { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/atom_vec_dipole_kokkos.cpp b/src/KOKKOS/atom_vec_dipole_kokkos.cpp index b2357ccb41..ad06570cdc 100644 --- a/src/KOKKOS/atom_vec_dipole_kokkos.cpp +++ b/src/KOKKOS/atom_vec_dipole_kokkos.cpp @@ -107,6 +107,26 @@ void AtomVecDipoleKokkos::grow_pointers() h_mu = atomKK->k_mu.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecDipoleKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_q); + Sorter.sort(LMPDeviceType(), d_mu); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_dipole_kokkos.h b/src/KOKKOS/atom_vec_dipole_kokkos.h index f9abfc9a2a..97ec92c6c6 100644 --- a/src/KOKKOS/atom_vec_dipole_kokkos.h +++ b/src/KOKKOS/atom_vec_dipole_kokkos.h @@ -35,6 +35,7 @@ class AtomVecDipoleKokkos : public AtomVecKokkos, public AtomVecDipole { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp index 6fa3277350..eda26a92dc 100644 --- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp +++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp @@ -135,6 +135,30 @@ void AtomVecDPDKokkos::grow_pointers() h_duChem = atomKK->k_duChem.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecDPDKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK & ~DPDRHO_MASK & ~DUCHEM_MASK & ~DVECTOR_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_dpdTheta); + Sorter.sort(LMPDeviceType(), d_uCond); + Sorter.sort(LMPDeviceType(), d_uMech); + Sorter.sort(LMPDeviceType(), d_uChem); + Sorter.sort(LMPDeviceType(), d_uCG); + Sorter.sort(LMPDeviceType(), d_uCGnew); + + atomKK->modified(Device, ALL_MASK & ~F_MASK & ~DPDRHO_MASK & ~DUCHEM_MASK & ~DVECTOR_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h index c605246eba..a76d7f908a 100644 --- a/src/KOKKOS/atom_vec_dpd_kokkos.h +++ b/src/KOKKOS/atom_vec_dpd_kokkos.h @@ -35,6 +35,7 @@ class AtomVecDPDKokkos : public AtomVecKokkos, public AtomVecDPD { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, const int & iswap, const DAT::tdual_xfloat_2d &buf, diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp index bb61c7fb46..829ebc75e6 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.cpp +++ b/src/KOKKOS/atom_vec_full_kokkos.cpp @@ -225,6 +225,48 @@ void AtomVecFullKokkos::grow_pointers() h_improper_atom4 = atomKK->k_improper_atom4.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecFullKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_q); + Sorter.sort(LMPDeviceType(), d_molecule); + Sorter.sort(LMPDeviceType(), d_num_bond); + Sorter.sort(LMPDeviceType(), d_bond_type); + Sorter.sort(LMPDeviceType(), d_bond_atom); + Sorter.sort(LMPDeviceType(), d_nspecial); + Sorter.sort(LMPDeviceType(), d_special); + Sorter.sort(LMPDeviceType(), d_num_angle); + Sorter.sort(LMPDeviceType(), d_angle_type); + Sorter.sort(LMPDeviceType(), d_angle_atom1); + Sorter.sort(LMPDeviceType(), d_angle_atom2); + Sorter.sort(LMPDeviceType(), d_angle_atom3); + Sorter.sort(LMPDeviceType(), d_num_dihedral); + Sorter.sort(LMPDeviceType(), d_dihedral_type); + Sorter.sort(LMPDeviceType(), d_dihedral_atom1); + Sorter.sort(LMPDeviceType(), d_dihedral_atom2); + Sorter.sort(LMPDeviceType(), d_dihedral_atom3); + Sorter.sort(LMPDeviceType(), d_dihedral_atom4); + Sorter.sort(LMPDeviceType(), d_num_improper); + Sorter.sort(LMPDeviceType(), d_improper_type); + Sorter.sort(LMPDeviceType(), d_improper_atom1); + Sorter.sort(LMPDeviceType(), d_improper_atom2); + Sorter.sort(LMPDeviceType(), d_improper_atom3); + Sorter.sort(LMPDeviceType(), d_improper_atom4); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h index e6fcfd7e40..4937ef4152 100644 --- a/src/KOKKOS/atom_vec_full_kokkos.h +++ b/src/KOKKOS/atom_vec_full_kokkos.h @@ -34,6 +34,7 @@ class AtomVecFullKokkos : public AtomVecKokkos, public AtomVecFull { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp index 03311d1c32..4e01ab5794 100644 --- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp +++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp @@ -51,6 +51,16 @@ void AtomVecHybridKokkos::grow(int n) f = atom->f; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecHybridKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + for (int k = 0; k < nstyles; k++) + (dynamic_cast(styles[k]))->sort_kokkos(Sorter); +} + /* ---------------------------------------------------------------------- */ int AtomVecHybridKokkos::pack_comm_kokkos(const int &/*n*/, const DAT::tdual_int_2d &/*k_sendlist*/, diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.h b/src/KOKKOS/atom_vec_hybrid_kokkos.h index 862b43d80b..6f81c93673 100644 --- a/src/KOKKOS/atom_vec_hybrid_kokkos.h +++ b/src/KOKKOS/atom_vec_hybrid_kokkos.h @@ -34,6 +34,7 @@ class AtomVecHybridKokkos : public AtomVecKokkos, public AtomVecHybrid { AtomVecHybridKokkos(class LAMMPS *); void grow(int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, const int & iswap, diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h index dfb4aecfcf..310f1f4d48 100644 --- a/src/KOKKOS/atom_vec_kokkos.h +++ b/src/KOKKOS/atom_vec_kokkos.h @@ -20,6 +20,8 @@ #include "kokkos_type.h" #include +#include + namespace LAMMPS_NS { union d_ubuf { @@ -38,6 +40,11 @@ class AtomVecKokkos : virtual public AtomVec { AtomVecKokkos(class LAMMPS *); ~AtomVecKokkos() override; + using KeyViewType = DAT::t_x_array; + using BinOp = BinOp3DLAMMPS; + virtual void + sort_kokkos(Kokkos::BinSort &Sorter) = 0; + virtual void sync(ExecutionSpace space, unsigned int mask) = 0; virtual void modified(ExecutionSpace space, unsigned int mask) = 0; virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0; @@ -117,7 +124,6 @@ class AtomVecKokkos : virtual public AtomVec { ExecutionSpace space, DAT::tdual_int_1d &k_indices) = 0; - int no_comm_vel_flag,no_border_vel_flag; int unpack_exchange_indices_flag; int size_exchange; diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp index 1bb75a1906..471dd0ad58 100644 --- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp +++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp @@ -217,6 +217,47 @@ void AtomVecMolecularKokkos::grow_pointers() h_improper_atom4 = atomKK->k_improper_atom4.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecMolecularKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_molecule); + Sorter.sort(LMPDeviceType(), d_num_bond); + Sorter.sort(LMPDeviceType(), d_bond_type); + Sorter.sort(LMPDeviceType(), d_bond_atom); + Sorter.sort(LMPDeviceType(), d_nspecial); + Sorter.sort(LMPDeviceType(), d_special); + Sorter.sort(LMPDeviceType(), d_num_angle); + Sorter.sort(LMPDeviceType(), d_angle_type); + Sorter.sort(LMPDeviceType(), d_angle_atom1); + Sorter.sort(LMPDeviceType(), d_angle_atom2); + Sorter.sort(LMPDeviceType(), d_angle_atom3); + Sorter.sort(LMPDeviceType(), d_num_dihedral); + Sorter.sort(LMPDeviceType(), d_dihedral_type); + Sorter.sort(LMPDeviceType(), d_dihedral_atom1); + Sorter.sort(LMPDeviceType(), d_dihedral_atom2); + Sorter.sort(LMPDeviceType(), d_dihedral_atom3); + Sorter.sort(LMPDeviceType(), d_dihedral_atom4); + Sorter.sort(LMPDeviceType(), d_num_improper); + Sorter.sort(LMPDeviceType(), d_improper_type); + Sorter.sort(LMPDeviceType(), d_improper_atom1); + Sorter.sort(LMPDeviceType(), d_improper_atom2); + Sorter.sort(LMPDeviceType(), d_improper_atom3); + Sorter.sort(LMPDeviceType(), d_improper_atom4); + + atomKK->modified(Device, ALL_MASK & ~F_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.h b/src/KOKKOS/atom_vec_molecular_kokkos.h index af8a2258e1..eb976e9073 100644 --- a/src/KOKKOS/atom_vec_molecular_kokkos.h +++ b/src/KOKKOS/atom_vec_molecular_kokkos.h @@ -34,6 +34,7 @@ class AtomVecMolecularKokkos : public AtomVecKokkos, public AtomVecMolecular { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, const int & iswap, const DAT::tdual_xfloat_2d &buf, diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.cpp b/src/KOKKOS/atom_vec_sphere_kokkos.cpp index 40af56489b..a9b64fc835 100644 --- a/src/KOKKOS/atom_vec_sphere_kokkos.cpp +++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp @@ -123,6 +123,27 @@ void AtomVecSphereKokkos::grow_pointers() h_torque = atomKK->k_torque.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecSphereKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, ALL_MASK & ~F_MASK & ~TORQUE_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_radius); + Sorter.sort(LMPDeviceType(), d_rmass); + Sorter.sort(LMPDeviceType(), d_omega); + + atomKK->modified(Device, ALL_MASK & ~F_MASK & ~TORQUE_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.h b/src/KOKKOS/atom_vec_sphere_kokkos.h index 32357fb600..34529320d9 100644 --- a/src/KOKKOS/atom_vec_sphere_kokkos.h +++ b/src/KOKKOS/atom_vec_sphere_kokkos.h @@ -35,6 +35,7 @@ class AtomVecSphereKokkos : public AtomVecKokkos, public AtomVecSphere { void grow(int) override; void grow_pointers() override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist, const int & iswap, diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp index 662072ead9..f5b8697352 100644 --- a/src/KOKKOS/atom_vec_spin_kokkos.cpp +++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp @@ -129,6 +129,25 @@ void AtomVecSpinKokkos::grow_pointers() h_fm_long = atomKK->k_fm_long.h_view; } +/* ---------------------------------------------------------------------- + sort atom arrays on device +------------------------------------------------------------------------- */ + +void AtomVecSpinKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + atomKK->sync(Device, TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|X_MASK|V_MASK|SP_MASK); + + Sorter.sort(LMPDeviceType(), d_tag); + Sorter.sort(LMPDeviceType(), d_type); + Sorter.sort(LMPDeviceType(), d_mask); + Sorter.sort(LMPDeviceType(), d_image); + Sorter.sort(LMPDeviceType(), d_x); + Sorter.sort(LMPDeviceType(), d_v); + Sorter.sort(LMPDeviceType(), d_sp); + + atomKK->modified(Device, TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|X_MASK|V_MASK|SP_MASK); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/atom_vec_spin_kokkos.h b/src/KOKKOS/atom_vec_spin_kokkos.h index 6a48d195a2..d14d01fb62 100644 --- a/src/KOKKOS/atom_vec_spin_kokkos.h +++ b/src/KOKKOS/atom_vec_spin_kokkos.h @@ -34,7 +34,7 @@ class AtomVecSpinKokkos : public AtomVecKokkos, public AtomVecSpin { AtomVecSpinKokkos(class LAMMPS *); void grow(int) override; void grow_pointers() override; - // input lists to be checked + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) override; diff --git a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp index 3a2447461e..1280a4d9a7 100644 --- a/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.cpp @@ -49,6 +49,7 @@ FixACKS2ReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : FixACKS2ReaxFF(lmp, narg, arg) { kokkosable = 1; + sort_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -1912,6 +1913,25 @@ void FixACKS2ReaxFFKokkos::copy_arrays(int i, int j, int delflag) k_s_hist_X.template modify(); } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixACKS2ReaxFFKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_s_hist.sync_device(); + k_s_hist_X.sync_device(); + + Sorter.sort(LMPDeviceType(), k_s_hist.d_view); + Sorter.sort(LMPDeviceType(), k_s_hist_X.d_view); + + k_s_hist.modify_device(); + k_s_hist_X.modify_device(); +} + /* ---------------------------------------------------------------------- pack values in local atom-based array for exchange with another proc ------------------------------------------------------------------------- */ diff --git a/src/KOKKOS/fix_acks2_reaxff_kokkos.h b/src/KOKKOS/fix_acks2_reaxff_kokkos.h index de49e8e72f..3e89cb4d43 100644 --- a/src/KOKKOS/fix_acks2_reaxff_kokkos.h +++ b/src/KOKKOS/fix_acks2_reaxff_kokkos.h @@ -27,6 +27,7 @@ FixStyle(acks2/reax/kk/host,FixACKS2ReaxFFKokkos); #include "fix_acks2_reaxff.h" #include "kokkos_type.h" +#include "kokkos_base.h" #include "neigh_list.h" #include "neigh_list_kokkos.h" @@ -57,7 +58,7 @@ struct TagACKS2ZeroQGhosts{}; struct TagACKS2CalculateQ{}; template -class FixACKS2ReaxFFKokkos : public FixACKS2ReaxFF { +class FixACKS2ReaxFFKokkos : public FixACKS2ReaxFF, public KokkosBase { public: typedef DeviceType device_type; typedef double value_type; @@ -252,6 +253,7 @@ class FixACKS2ReaxFFKokkos : public FixACKS2ReaxFF { void grow_arrays(int); void copy_arrays(int, int, int); + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_exchange(int, double *); int unpack_exchange(int, double *); void get_chi_field(); diff --git a/src/KOKKOS/fix_langevin_kokkos.cpp b/src/KOKKOS/fix_langevin_kokkos.cpp index 14eb0f1ab7..4d7a3e8820 100644 --- a/src/KOKKOS/fix_langevin_kokkos.cpp +++ b/src/KOKKOS/fix_langevin_kokkos.cpp @@ -45,6 +45,7 @@ FixLangevinKokkos::FixLangevinKokkos(LAMMPS *lmp, int narg, char **a { kokkosable = 1; fuse_integrate_flag = 1; + sort_device = 1; atomKK = (AtomKokkos *) atom; int ntypes = atomKK->ntypes; @@ -898,6 +899,25 @@ void FixLangevinKokkos::copy_arrays(int i, int j, int /*delflag*/) } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixLangevinKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_franprev.sync_device(); + k_lv.sync_device(); + + Sorter.sort(LMPDeviceType(), k_franprev.d_view); + Sorter.sort(LMPDeviceType(), k_lv.d_view); + + k_franprev.modify_device(); + k_lv.modify_device(); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/fix_langevin_kokkos.h b/src/KOKKOS/fix_langevin_kokkos.h index 0bd628270e..4fc22a1df1 100644 --- a/src/KOKKOS/fix_langevin_kokkos.h +++ b/src/KOKKOS/fix_langevin_kokkos.h @@ -25,6 +25,7 @@ FixStyle(langevin/kk/host,FixLangevinKokkos); #include "fix_langevin.h" #include "kokkos_type.h" +#include "kokkos_base.h" #include "Kokkos_Random.hpp" #include "comm_kokkos.h" @@ -61,7 +62,7 @@ namespace LAMMPS_NS { template struct FixLangevinKokkosTallyEnergyFunctor; template - class FixLangevinKokkos : public FixLangevin { + class FixLangevinKokkos : public FixLangevin, public KokkosBase { public: FixLangevinKokkos(class LAMMPS *, int, char **); ~FixLangevinKokkos() override; @@ -74,6 +75,7 @@ namespace LAMMPS_NS { void reset_dt() override; void grow_arrays(int) override; void copy_arrays(int i, int j, int delflag) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; double compute_scalar() override; void end_of_step() override; diff --git a/src/KOKKOS/fix_minimize_kokkos.cpp b/src/KOKKOS/fix_minimize_kokkos.cpp index 07c78e86a3..90ef0f4525 100644 --- a/src/KOKKOS/fix_minimize_kokkos.cpp +++ b/src/KOKKOS/fix_minimize_kokkos.cpp @@ -27,6 +27,8 @@ using namespace FixConst; FixMinimizeKokkos::FixMinimizeKokkos(LAMMPS *lmp, int narg, char **arg) : FixMinimize(lmp, narg, arg) { + kokkosable = 1; + sort_device = 1; atomKK = (AtomKokkos *) atom; } @@ -217,6 +219,21 @@ void FixMinimizeKokkos::copy_arrays(int i, int j, int /*delflag*/) k_vectors.modify(); } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +void FixMinimizeKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_vectors.sync_device(); + + Sorter.sort(LMPDeviceType(), k_vectors.d_view); + + k_vectors.modify_device(); +} + /* ---------------------------------------------------------------------- pack values in local atom-based arrays for exchange with another proc ------------------------------------------------------------------------- */ diff --git a/src/KOKKOS/fix_minimize_kokkos.h b/src/KOKKOS/fix_minimize_kokkos.h index e84cbd1ec2..121711b4e4 100644 --- a/src/KOKKOS/fix_minimize_kokkos.h +++ b/src/KOKKOS/fix_minimize_kokkos.h @@ -25,10 +25,11 @@ FixStyle(MINIMIZE/kk/host,FixMinimizeKokkos); #include "fix_minimize.h" #include "kokkos_type.h" +#include "kokkos_base.h" namespace LAMMPS_NS { -class FixMinimizeKokkos : public FixMinimize { +class FixMinimizeKokkos : public FixMinimize, public KokkosBase { friend class MinLineSearchKokkos; public: @@ -38,6 +39,7 @@ class FixMinimizeKokkos : public FixMinimize { void grow_arrays(int) override; void copy_arrays(int, int, int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp index 198ab555f3..b4a852ba70 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.cpp +++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp @@ -32,7 +32,7 @@ FixNeighHistoryKokkos::FixNeighHistoryKokkos(LAMMPS *lmp, int narg, FixNeighHistory(lmp, narg, arg) { kokkosable = 1; - exchange_comm_device = 1; + exchange_comm_device = sort_device = 1; atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; @@ -325,6 +325,28 @@ void FixNeighHistoryKokkos::copy_arrays(int i, int j, int /*delflag* k_valuepartner.modify_host(); } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixNeighHistoryKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_npartner.sync_device(); + k_partner.sync_device(); + k_valuepartner.sync_device(); + + Sorter.sort(LMPDeviceType(), k_npartner.d_view); + Sorter.sort(LMPDeviceType(), k_partner.d_view); + Sorter.sort(LMPDeviceType(), k_valuepartner.d_view); + + k_npartner.modify_device(); + k_partner.modify_device(); + k_valuepartner.modify_device(); +} + /* ---------------------------------------------------------------------- pack values in local atom-based array for exchange with another proc ------------------------------------------------------------------------- */ diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h index 6f29c817b8..9c07a953c4 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.h +++ b/src/KOKKOS/fix_neigh_history_kokkos.h @@ -48,6 +48,7 @@ class FixNeighHistoryKokkos : public FixNeighHistory, public KokkosBase { void post_neighbor() override; void grow_arrays(int) override; void copy_arrays(int, int, int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; double memory_usage() override; diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index db0f7456dd..518677c643 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -58,7 +58,7 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : { kokkosable = 1; comm_forward = comm_reverse = 2; // fused - forward_comm_device = exchange_comm_device = 1; + forward_comm_device = exchange_comm_device = sort_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -1338,6 +1338,25 @@ void FixQEqReaxFFKokkos::copy_arrays(int i, int j, int /*delflag*/) k_t_hist.template modify(); } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixQEqReaxFFKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_s_hist.sync_device(); + k_t_hist.sync_device(); + + Sorter.sort(LMPDeviceType(), k_s_hist.d_view); + Sorter.sort(LMPDeviceType(), k_t_hist.d_view); + + k_s_hist.modify_device(); + k_t_hist.modify_device(); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index 29faefe56b..9bc38b0492 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -280,6 +280,7 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { void grow_arrays(int) override; void copy_arrays(int, int, int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; void get_chi_field() override; diff --git a/src/KOKKOS/fix_shake_kokkos.cpp b/src/KOKKOS/fix_shake_kokkos.cpp index b00195e5fd..1ea3ed1c5a 100644 --- a/src/KOKKOS/fix_shake_kokkos.cpp +++ b/src/KOKKOS/fix_shake_kokkos.cpp @@ -53,7 +53,7 @@ FixShakeKokkos::FixShakeKokkos(LAMMPS *lmp, int narg, char **arg) : FixShake(lmp, narg, arg) { kokkosable = 1; - forward_comm_device = exchange_comm_device = 1; + forward_comm_device = exchange_comm_device = sort_device = 1; maxexchange = 9; atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; @@ -1484,6 +1484,28 @@ void FixShakeKokkos::copy_arrays(int i, int j, int delflag) k_shake_type.modify_host(); } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_shake_flag.sync_device(); + k_shake_atom.sync_device(); + k_shake_type.sync_device(); + + Sorter.sort(LMPDeviceType(), k_shake_flag.d_view); + Sorter.sort(LMPDeviceType(), k_shake_atom.d_view); + Sorter.sort(LMPDeviceType(), k_shake_type.d_view); + + k_shake_flag.modify_device(); + k_shake_atom.modify_device(); + k_shake_type.modify_device(); +} + /* ---------------------------------------------------------------------- initialize one atom's array values, called when atom is created ------------------------------------------------------------------------- */ diff --git a/src/KOKKOS/fix_shake_kokkos.h b/src/KOKKOS/fix_shake_kokkos.h index 650ad52287..185e69ce86 100644 --- a/src/KOKKOS/fix_shake_kokkos.h +++ b/src/KOKKOS/fix_shake_kokkos.h @@ -61,6 +61,7 @@ class FixShakeKokkos : public FixShake, public KokkosBase { void grow_arrays(int) override; void copy_arrays(int, int, int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; void set_arrays(int) override; void update_arrays(int, int) override; void set_molecule(int, tagint, int, double *, double *, double *) override; diff --git a/src/KOKKOS/fix_wall_gran_kokkos.cpp b/src/KOKKOS/fix_wall_gran_kokkos.cpp index 1569065bbe..f870b0f240 100644 --- a/src/KOKKOS/fix_wall_gran_kokkos.cpp +++ b/src/KOKKOS/fix_wall_gran_kokkos.cpp @@ -32,7 +32,7 @@ FixWallGranKokkos::FixWallGranKokkos(LAMMPS *lmp, int narg, char **a FixWallGranOld(lmp, narg, arg) { kokkosable = 1; - exchange_comm_device = 1; + exchange_comm_device = sort_device = 1; maxexchange = size_history; atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; @@ -313,6 +313,22 @@ void FixWallGranKokkos::copy_arrays(int i, int j, int delflag) } } +/* ---------------------------------------------------------------------- + sort local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixWallGranKokkos::sort_kokkos(Kokkos::BinSort &Sorter) +{ + // always sort on the device + + k_history_one.sync_device(); + + Sorter.sort(LMPDeviceType(), k_history_one.d_view); + + k_history_one.modify_device(); +} + /* ---------------------------------------------------------------------- */ template diff --git a/src/KOKKOS/fix_wall_gran_kokkos.h b/src/KOKKOS/fix_wall_gran_kokkos.h index 4d80528fb8..c7d566ec72 100644 --- a/src/KOKKOS/fix_wall_gran_kokkos.h +++ b/src/KOKKOS/fix_wall_gran_kokkos.h @@ -47,6 +47,7 @@ class FixWallGranKokkos : public FixWallGranOld, public KokkosBase { void post_force(int) override; void grow_arrays(int) override; void copy_arrays(int, int, int) override; + void sort_kokkos(Kokkos::BinSort &Sorter) override; int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 8b45c786e5..91ea6d37ac 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -93,6 +93,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) reverse_pair_comm_changed = 0; forward_fix_comm_changed = 0; reverse_comm_changed = 0; + sort_changed = 0; delete memory; memory = new MemoryKokkos(lmp); @@ -250,6 +251,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0; + sort_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else { @@ -264,6 +266,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1; + sort_classic = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } @@ -478,6 +481,14 @@ void KokkosLMP::accelerator(int narg, char **arg) } else error->all(FLERR,"Illegal package kokkos command"); reverse_comm_changed = 0; iarg += 2; + } else if (strcmp(arg[iarg],"sort") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + else if (strcmp(arg[iarg+1],"no") == 0) sort_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) sort_classic = 1; + else if (strcmp(arg[iarg+1],"device") == 0) sort_classic = 0; + else error->all(FLERR,"Illegal package kokkos command"); + sort_changed = 0; + iarg += 2; } else if ((strcmp(arg[iarg],"gpu/aware") == 0) || (strcmp(arg[iarg],"cuda/aware") == 0)) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); @@ -533,6 +544,13 @@ void KokkosLMP::accelerator(int narg, char **arg) } } + if (lmp->pair_only_flag) { + if (sort_classic == 0) { + sort_classic = 1; + sort_changed = 1; + } + } + // if "gpu/aware on" and "pair/only off", and comm flags were changed previously, change them back if (gpu_aware_flag && !lmp->pair_only_flag) { @@ -562,6 +580,13 @@ void KokkosLMP::accelerator(int narg, char **arg) } } + if (lmp->pair_only_flag) { + if (sort_changed) { + sort_classic = 0; + sort_changed = 0; + } + } + #endif // set newton flags diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 08b6730e50..adfc1fc646 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -33,6 +33,7 @@ class KokkosLMP : protected Pointers { int reverse_pair_comm_classic; int forward_fix_comm_classic; int reverse_comm_classic; + int sort_classic; int exchange_comm_on_host; int forward_comm_on_host; int reverse_comm_on_host; @@ -42,6 +43,7 @@ class KokkosLMP : protected Pointers { int reverse_pair_comm_changed; int forward_fix_comm_changed; int reverse_comm_changed; + int sort_changed; int nthreads,ngpus; int auto_sync; int gpu_aware_flag; diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index 463b271269..7d9ecb5d80 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -17,6 +17,8 @@ #include "kokkos_type.h" +#include + namespace LAMMPS_NS { class KokkosBase { @@ -51,6 +53,11 @@ class KokkosBase { virtual void unpack_exchange_kokkos(DAT::tdual_xfloat_2d & /*k_buf*/, DAT::tdual_int_1d & /*indices*/, int /*nrecv*/, ExecutionSpace /*space*/) {} + + using KeyViewType = DAT::t_x_array; + using BinOp = BinOp3DLAMMPS; + virtual void + sort_kokkos(Kokkos::BinSort & /*Sorter*/) {} }; } diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 69ebc91fec..555c7fa9ae 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -473,6 +473,64 @@ struct alignas(2*sizeof(F_FLOAT)) s_FLOAT2 { }; typedef struct s_FLOAT2 F_FLOAT2; +template +struct BinOp3DLAMMPS { + int max_bins_[3] = {}; + double mul_[3] = {}; + double min_[3] = {}; + + BinOp3DLAMMPS() = default; + + BinOp3DLAMMPS(int max_bins__[], typename KeyViewType::const_value_type min[], + typename KeyViewType::const_value_type max[]) { + max_bins_[0] = max_bins__[0]; + max_bins_[1] = max_bins__[1]; + max_bins_[2] = max_bins__[2]; + mul_[0] = static_cast(max_bins__[0]) / + (static_cast(max[0]) - static_cast(min[0])); + mul_[1] = static_cast(max_bins__[1]) / + (static_cast(max[1]) - static_cast(min[1])); + mul_[2] = static_cast(max_bins__[2]) / + (static_cast(max[2]) - static_cast(min[2])); + min_[0] = static_cast(min[0]); + min_[1] = static_cast(min[1]); + min_[2] = static_cast(min[2]); + } + + template + KOKKOS_INLINE_FUNCTION int bin(ViewType& keys, const int& i) const { + int ix = static_cast ((keys(i, 0) - min_[0]) * mul_[0]); + int iy = static_cast ((keys(i, 1) - min_[1]) * mul_[1]); + int iz = static_cast ((keys(i, 2) - min_[2]) * mul_[2]); + ix = MAX(ix,0); + iy = MAX(iy,0); + iz = MAX(iz,0); + ix = MIN(ix,max_bins_[0]-1); + iy = MIN(iy,max_bins_[1]-1); + iz = MIN(iz,max_bins_[2]-1); + const int ibin = iz*max_bins_[1]*max_bins_[0] + iy*max_bins_[0] + ix; + return ibin; + } + + KOKKOS_INLINE_FUNCTION + int max_bins() const { return max_bins_[0] * max_bins_[1] * max_bins_[2]; } + + template + KOKKOS_INLINE_FUNCTION bool operator()(ViewType& keys, iType1& i1, + iType2& i2) const { + if (keys(i1, 2) > keys(i2, 2)) + return true; + else if (keys(i1, 2) == keys(i2, 2)) { + if (keys(i1, 1) > keys(i2, 1)) + return true; + else if (keys(i1, 1) == keys(i2, 1)) { + if (keys(i1, 0) > keys(i2, 0)) return true; + } + } + return false; + } +}; + #ifndef PREC_POS #define PREC_POS PRECISION #endif diff --git a/src/atom.h b/src/atom.h index d2e8030108..810a2829ed 100644 --- a/src/atom.h +++ b/src/atom.h @@ -312,7 +312,7 @@ class Atom : protected Pointers { void create_avec(const std::string &, int, char **, int); virtual AtomVec *new_avec(const std::string &, int, int &); - void init(); + virtual void init(); void setup(); std::string get_style(); diff --git a/src/fix.cpp b/src/fix.cpp index 02adcbd016..25469387ae 100644 --- a/src/fix.cpp +++ b/src/fix.cpp @@ -109,7 +109,7 @@ Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) : datamask_modify = ALL_MASK; kokkosable = copymode = 0; - forward_comm_device = exchange_comm_device = 0; + forward_comm_device = exchange_comm_device = sort_device = 0; fuse_integrate_flag = 0; } diff --git a/src/fix.h b/src/fix.h index 9676651afb..30373ab6f2 100644 --- a/src/fix.h +++ b/src/fix.h @@ -133,6 +133,7 @@ class Fix : protected Pointers { int forward_comm_device; // 1 if forward comm on Device int exchange_comm_device; // 1 if exchange comm on Device int fuse_integrate_flag; // 1 if can fuse initial integrate with final integrate + int sort_device; // 1 if sort on Device ExecutionSpace execution_space; unsigned int datamask_read, datamask_modify;