Update docs

Stan Moore
2023-12-19 10:46:41 -07:00
parent ab29200c60
commit 86f87e0f7b
2 changed files with 24 additions and 23 deletions


@@ -474,13 +474,13 @@ If the *neigh/thread* keyword is set to *off*, then the KOKKOS package
 threads only over atoms. However, for small systems, this may not expose
 enough parallelism to keep a GPU busy. When this keyword is set to *on*,
 the KOKKOS package threads over both atoms and neighbors of atoms. When
-using *neigh/thread* *on*, a full neighbor list must also be used. Using
-*neigh/thread* *on* may be slower for large systems, so this option
-is turned on by default only when there are 16K atoms or less owned by
-an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
-potentials support this keyword yet, and only thread over atoms. Many
-simple pairwise potentials such as Lennard-Jones do support threading
-over both atoms and neighbors.
+using *neigh/thread* *on*, the :doc:`newton pair <newton>` setting must
+be "off". Using *neigh/thread* *on* may be slower for large systems, so
+this option is turned on by default only when running on one or more
+GPUs and there are 16k atoms or less owned by an MPI rank. Not all
+KOKKOS-enabled potentials support this keyword yet, and only thread over
+atoms. Many simple pairwise potentials such as Lennard-Jones do support
+threading over both atoms and neighbors.

 If the *neigh/transpose* keyword is set to *off*, then the KOKKOS
 package will use the same memory layout for building the neighbor list on
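A standalone sketch of the neigh/thread default this hunk documents, with the GPU check, the 16000-atom threshold, and the newton-pair condition taken from the source change later in this commit (the helper name and signature are illustrative only, not LAMMPS API):

// Illustrative restatement of the condition added in pair_compute_neighlist().
bool neigh_thread_default_on(int ngpus, int inum, bool newton_pair_off) {
  return ngpus > 0          // only when running on one or more GPUs
      && inum <= 16000      // 16k atoms or less owned by this MPI rank
      && newton_pair_off;   // the doc text above requires newton pair "off"
}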
@@ -732,7 +732,7 @@ comm = device, sort = device, neigh/transpose = off, gpu/aware = on. When
 LAMMPS can safely detect that GPU-aware MPI is not available, the default value
 of gpu/aware becomes "off". For CPUs or Xeon Phis, the option defaults are
 neigh = half, neigh/qeq = half, newton = on, binsize = 0.0, comm = no, and sort
-= no. The option neigh/thread = on when there are 16K atoms or less on an MPI
+= no. For GPUs, the option neigh/thread = on when there are 16k atoms or less on an MPI
 rank, otherwise it is "off". These settings are made automatically by the
 required "-k on" :doc:`command-line switch <Run_options>`. You can change them
 by using the package kokkos command in your input script or via the :doc:`-pk


@@ -935,8 +935,10 @@ template<class PairStyle, unsigned NEIGHFLAG, int ZEROFLAG = 0, class Specialisation = void>
 EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
   EV_FLOAT ev;
+  const int inum = list->inum;
   if (!fpair->lmp->kokkos->neigh_thread_set)
-    if (list->inum <= 16384)
+    if (fpair->lmp->kokkos->ngpus && inum <= 16000)
       if (NEIGHFLAG == FULL || !fpair->newton_pair)
         fpair->lmp->kokkos->neigh_thread = 1;
@@ -947,26 +949,26 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
   static int lastcall = -1;
 #if defined(LMP_KOKKOS_GPU)
-#if defined(KOKKOS_ENABLE_HIP)
-  int max_vectorsize = 64;
-#else
-  int max_vectorsize = 32;
-#endif
   if (!vectorsize || lastcall < fpair->lmp->neighbor->lastcall) {
     lastcall = fpair->lmp->update->ntimestep;
     vectorsize = GetMaxNeighs(list);
     vectorsize = MathSpecial::powint(2,(int(log2(vectorsize) + 0.5))); // round to nearest power of 2
+#if defined(KOKKOS_ENABLE_HIP)
+    int max_vectorsize = 64;
+#else
+    int max_vectorsize = 32;
+#endif
     vectorsize = MIN(vectorsize,max_vectorsize);
     int teamsize_max_for,teamsize_max_reduce;
     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
       PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
-      GetMaxTeamSize<typename PairStyle::device_type>(ff, list->inum, teamsize_max_for, teamsize_max_reduce);
+      GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
     } else {
       PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
-      GetMaxTeamSize<typename PairStyle::device_type>(ff, list->inum, teamsize_max_for, teamsize_max_reduce);
+      GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
     }
     int teamsize_max = teamsize_max_for;
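The rounding line in this hunk picks the power of two nearest (in log2 space) to the largest neighbor count, then clamps it to the warp/wavefront width. A self-contained sketch of that arithmetic, with a bit shift standing in for the MathSpecial::powint construction (the helper name is illustrative):

#include <cmath>
#include <cstdio>
#include <algorithm>

// Round max_neighs to the nearest power of 2, then clamp to the hardware
// vector width: 32 lanes for CUDA warps, 64 for HIP wavefronts.
int pick_vectorsize(int max_neighs, int max_vectorsize) {
  int v = 1 << static_cast<int>(std::log2(static_cast<double>(max_neighs)) + 0.5);
  return std::min(v, max_vectorsize);
}

int main() {
  printf("%d\n", pick_vectorsize(37, 32)); // log2(37) ~ 5.21 -> 2^5 = 32
  printf("%d\n", pick_vectorsize(12, 32)); // log2(12) ~ 3.58 -> 2^4 = 16
  printf("%d\n", pick_vectorsize(90, 32)); // log2(90) ~ 6.49 -> 2^6 = 64, clamped to 32
}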
@@ -979,7 +981,6 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
     atoms_per_team = 1;
 #endif
-  const int inum = list->inum;
   const int num_teams = inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
   if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
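The num_teams expression is the usual integer ceiling division, now reusing the inum hoisted to the top of the function. A hypothetical free-function restatement:

// Illustrative: teams needed to cover inum atoms at atoms_per_team each.
int num_teams(int inum, int atoms_per_team) {
  return inum / atoms_per_team + (inum % atoms_per_team ? 1 : 0);
}
// num_teams(100, 8) == 13: twelve full teams plus one for the 4 leftover atoms.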
@@ -996,13 +997,13 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&PairStyle::EnabledNeighFlags) != 0, NeighListKokkos<typename PairStyle::device_type>*> list) {
   } else {
     if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
       PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
-      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
+      else Kokkos::parallel_for(inum,ff);
       ff.contribute();
     } else {
       PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
-      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(list->inum,ff,ev);
-      else Kokkos::parallel_for(list->inum,ff);
+      if (fpair->eflag || fpair->vflag) Kokkos::parallel_reduce(inum,ff,ev);
+      else Kokkos::parallel_for(inum,ff);
       ff.contribute();
     }
   }
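The substitution in this last hunk is cosmetic; the dispatch pattern itself is standard Kokkos: run the same index range with parallel_reduce when an energy/virial tally is requested, and parallel_for otherwise. A minimal self-contained sketch of that pattern, using a plain double in place of the EV_FLOAT reduction type (standalone demo, not LAMMPS code):

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    const int inum = 1000;   // stands in for list->inum
    const bool eflag = true; // stands in for fpair->eflag || fpair->vflag
    if (eflag) {
      // Reduction path: each index contributes to an accumulated scalar,
      // analogous to the energy/virial tally above.
      double ev = 0.0;
      Kokkos::parallel_reduce(inum, KOKKOS_LAMBDA(const int i, double& sum) {
        sum += 1.0;          // per-atom energy contribution would go here
      }, ev);
      printf("ev = %g\n", ev);
    } else {
      // Force-only path: no accumulation needed.
      Kokkos::parallel_for(inum, KOKKOS_LAMBDA(const int i) {
        (void)i;             // per-atom force update would go here
      });
    }
  }
  Kokkos::finalize();
  return 0;
}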