From 7e5782a81b2e0e51a92d1adaf3683100ece6ed4a Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 08:21:16 -0600
Subject: [PATCH 01/15] Fix deallocation issue in
 pair_lj_charmm_coul_charmm_kokkos

---
 src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp
index 66064d58b2..982ec9d99e 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp
+++ b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp
@@ -69,7 +69,7 @@ PairLJCharmmCoulCharmmKokkos<DeviceType>::~PairLJCharmmCoulCharmmKokkos()
   if (allocated) {
     memoryKK->destroy_kokkos(k_eatom,eatom);
     memoryKK->destroy_kokkos(k_vatom,vatom);
-    k_cutsq = DAT::tdual_ffloat_2d();
+    memoryKK->destroy_kokkos(k_cutsq,cutsq);
   }
 }
 

From 0e8facdcbbc69ada65fa51c4aa3a835f1a924a2a Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 08:58:58 -0600
Subject: [PATCH 02/15] Makefile.kokkos: fix (standard_in) 1: syntax error
 (kokkos PR4173)

---
 lib/kokkos/Makefile.kokkos | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 013d2b3ede..2a984eefb6 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -406,8 +406,8 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW
 KOKKOS_INTERNAL_USE_TM            := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))
 
 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc ))
-KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc )
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
   $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )

From be98d0bbd95a67400c5900504fc585a1b6e69811 Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 09:36:16 -0600
Subject: [PATCH 03/15] Fix memory issue in fix_nvt_sllod_kokkos

---
 src/KOKKOS/fix_nvt_sllod_kokkos.cpp |  7 ++++---
 src/KOKKOS/fix_nvt_sllod_kokkos.h   | 19 +++++++++++--------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
index d0af72f17f..5ba0e6b666 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -65,8 +65,6 @@ void FixNVTSllodKokkos<DeviceType>::init()
 {
   FixNHKokkos<DeviceType>::init();
 
-  vdelu = typename ArrayTypes<DeviceType>::t_v_array("nvt/sllod/kk:vdelu", atomKK->nlocal);
-
   if (!this->temperature->tempbias)
     this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias");
 
@@ -100,7 +98,7 @@ void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
   //   calculate temperature since some computes require temp
   //   computed on current nlocal atoms to remove bias
 
-  if (nondeformbias){
+  if (nondeformbias) {
     atomKK->sync(this->temperature->execution_space,this->temperature->datamask_read);
     this->temperature->compute_scalar();
     atomKK->modified(this->temperature->execution_space,this->temperature->datamask_modify);
@@ -115,6 +113,9 @@ void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
 
   d_h_two = Few<double, 6>(h_two);
 
+  if (vdelu.extent(0) < atomKK->nmax)
+    vdelu = typename AT::t_v_array(Kokkos::NoInit("nvt/sllod/kk:vdelu"), atomKK->nmax);
+
   this->copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixNVTSllod_temp1>(0,nlocal),*this);
   this->copymode = 0;
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h
index 6057ce44d0..84e57ab2c3 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.h
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h
@@ -35,6 +35,9 @@ struct TagFixNVTSllod_temp2{};
 template<class DeviceType>
 class FixNVTSllodKokkos : public FixNHKokkos<DeviceType> {
  public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
   FixNVTSllodKokkos(class LAMMPS *, int, char **);
   ~FixNVTSllodKokkos() {}
   void init();
@@ -51,14 +54,14 @@ class FixNVTSllodKokkos : public FixNHKokkos<DeviceType> {
   void nh_v_temp();
 
  protected:
-  typename ArrayTypes<DeviceType>::t_x_array x;
-  typename ArrayTypes<DeviceType>::t_v_array v;
-  typename ArrayTypes<DeviceType>::t_v_array vdelu;
-  typename ArrayTypes<DeviceType>::t_f_array_const f;
-  typename ArrayTypes<DeviceType>::t_float_1d rmass;
-  typename ArrayTypes<DeviceType>::t_float_1d mass;
-  typename ArrayTypes<DeviceType>::t_int_1d type;
-  typename ArrayTypes<DeviceType>::t_int_1d mask;
+  typename AT::t_x_array x;
+  typename AT::t_v_array v;
+  typename AT::t_v_array vdelu;
+  typename AT::t_f_array_const f;
+  typename AT::t_float_1d rmass;
+  typename AT::t_float_1d mass;
+  typename AT::t_int_1d type;
+  typename AT::t_int_1d mask;
 
   Few<double, 6> d_h_two;
 

From d38549e05fcb6577af834dcdb4098529f3e25e65 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 11:03:57 -0600
Subject: [PATCH 04/15] Update ancient Kokkos Arch in Makefile.kokkos_cuda_mpi

---
 src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
index 3971cc6c06..c6071cf747 100644
--- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
+++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
@@ -23,7 +23,7 @@ ARCHIVE =	ar
 ARFLAGS =	-rc
 SHLIBFLAGS =	-shared
 KOKKOS_DEVICES = Cuda
-KOKKOS_ARCH = Kepler35
+KOKKOS_ARCH = Volta70
 
 # ---------------------------------------------------------------------
 # LAMMPS-specific settings, all OPTIONAL

From 49b0623d6b27923e04d22552c98566cf1f527828 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 13:26:46 -0600
Subject: [PATCH 05/15] Fix issue with Kokkos granular and pair/only on

---
 src/GRANULAR/fix_freeze.h                     |  2 +-
 src/KOKKOS/fix_freeze_kokkos.cpp              | 49 ++-----------------
 src/KOKKOS/fix_freeze_kokkos.h                |  6 +--
 src/KOKKOS/fix_neigh_history_kokkos.cpp       | 15 +++++-
 src/KOKKOS/fix_neigh_history_kokkos.h         | 10 +++-
 src/KOKKOS/fix_nve_sphere_kokkos.cpp          |  2 +-
 src/KOKKOS/fix_nve_sphere_kokkos.h            |  2 +
 src/KOKKOS/pair_gran_hooke_history_kokkos.cpp | 12 ++++-
 src/KOKKOS/pair_gran_hooke_history_kokkos.h   |  4 +-
 9 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/src/GRANULAR/fix_freeze.h b/src/GRANULAR/fix_freeze.h
index 2fc5fda71e..5846bfd769 100644
--- a/src/GRANULAR/fix_freeze.h
+++ b/src/GRANULAR/fix_freeze.h
@@ -30,7 +30,7 @@ class FixFreeze : public Fix {
   int setmask();
   void init();
   void setup(int);
-  void post_force(int);
+  virtual void post_force(int);
   void post_force_respa(int, int, int);
   double compute_vector(int);
 
diff --git a/src/KOKKOS/fix_freeze_kokkos.cpp b/src/KOKKOS/fix_freeze_kokkos.cpp
index 190a054f8b..9a486a3fcc 100644
--- a/src/KOKKOS/fix_freeze_kokkos.cpp
+++ b/src/KOKKOS/fix_freeze_kokkos.cpp
@@ -28,41 +28,16 @@ FixFreezeKokkos<DeviceType>::FixFreezeKokkos(LAMMPS *lmp, int narg, char **arg)
   atomKK = (AtomKokkos *)atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 
-  datamask_read = F_MASK | MASK_MASK;
+  datamask_read = F_MASK | MASK_MASK | TORQUE_MASK;
   datamask_modify = F_MASK | TORQUE_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-int FixFreezeKokkos<DeviceType>::setmask()
-{
-  return FixFreeze::setmask();
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void FixFreezeKokkos<DeviceType>::init()
-{
-  FixFreeze::init();
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void FixFreezeKokkos<DeviceType>::setup(int vflag)
-{
-  FixFreeze::setup(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 void FixFreezeKokkos<DeviceType>::post_force(int /*vflag*/)
 {
   atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);
 
   f = atomKK->k_f.view<DeviceType>();
   torque = atomKK->k_torque.view<DeviceType>();
@@ -80,28 +55,10 @@ void FixFreezeKokkos<DeviceType>::post_force(int /*vflag*/)
   foriginal[0] = original.values[0];
   foriginal[1] = original.values[1];
   foriginal[2] = original.values[2];
+
+  atomKK->modified(execution_space,datamask_modify);
 }
 
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void FixFreezeKokkos<DeviceType>::post_force_respa(int vflag, int /*ilevel*/, int /*iloop*/)
-{
-  post_force(vflag);
-}
-
-/* ----------------------------------------------------------------------
-   return components of total force on fix group before force was changed
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-double FixFreezeKokkos<DeviceType>::compute_vector(int n)
-{
-  return FixFreeze::compute_vector(n);
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void FixFreezeKokkos<DeviceType>::operator()(const int i, OriginalForce &original) const {
diff --git a/src/KOKKOS/fix_freeze_kokkos.h b/src/KOKKOS/fix_freeze_kokkos.h
index dcfc14bd3d..67d4f3272c 100644
--- a/src/KOKKOS/fix_freeze_kokkos.h
+++ b/src/KOKKOS/fix_freeze_kokkos.h
@@ -31,6 +31,7 @@ namespace LAMMPS_NS {
 template<class DeviceType>
 class FixFreezeKokkos : public FixFreeze {
  public:
+  typedef DeviceType device_type;
   struct OriginalForce {
     double values[3];
 
@@ -58,12 +59,7 @@ class FixFreezeKokkos : public FixFreeze {
   };
 
   FixFreezeKokkos(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
   void post_force(int);
-  void post_force_respa(int, int, int);
-  double compute_vector(int);
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const int i, OriginalForce &original) const;
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp
index 611a8a40ef..4837148ee0 100644
--- a/src/KOKKOS/fix_neigh_history_kokkos.cpp
+++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp
@@ -87,6 +87,9 @@ void FixNeighHistoryKokkos<DeviceType>::pre_exchange()
 {
   copymode = 1;
 
+  k_firstflag.sync<DeviceType>();
+  k_firstvalue.sync<DeviceType>();
+
   h_resize() = 1;
   while (h_resize() > 0) {
     FixNeighHistoryKokkosZeroPartnerCountFunctor<DeviceType> zero(this);
@@ -168,6 +171,9 @@ void FixNeighHistoryKokkos<DeviceType>::post_neighbor()
 {
   tag = atomKK->k_tag.view<DeviceType>();
 
+  k_firstflag.sync<DeviceType>();
+  k_firstvalue.sync<DeviceType>();
+
   int inum = pair->list->inum;
   NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(pair->list);
   d_numneigh = k_list->d_numneigh;
@@ -185,8 +191,10 @@ void FixNeighHistoryKokkos<DeviceType>::post_neighbor()
 
   if (maxatom < nlocal || k_list->maxneighs > (int)d_firstflag.extent(1)) {
     maxatom = nall;
-    d_firstflag = Kokkos::View<int**>("neighbor_history:firstflag",maxatom,k_list->maxneighs);
-    d_firstvalue = Kokkos::View<LMP_FLOAT**>("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum);
+    k_firstflag = DAT::tdual_int_2d("neighbor_history:firstflag",maxatom,k_list->maxneighs);
+    k_firstvalue = DAT::tdual_float_2d("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum);
+    d_firstflag = k_firstflag.view<DeviceType>();
+    d_firstvalue = k_firstvalue.view<DeviceType>();
   }
 
   copymode = 1;
@@ -194,6 +202,9 @@ void FixNeighHistoryKokkos<DeviceType>::post_neighbor()
   FixNeighHistoryKokkosPostNeighborFunctor<DeviceType> f(this);
   Kokkos::parallel_for(inum,f);
 
+  k_firstflag.modify<DeviceType>();
+  k_firstvalue.modify<DeviceType>();
+
   copymode = 0;
 }
 
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h
index acc1e9c408..0442b46cbd 100644
--- a/src/KOKKOS/fix_neigh_history_kokkos.h
+++ b/src/KOKKOS/fix_neigh_history_kokkos.h
@@ -50,10 +50,13 @@ class FixNeighHistoryKokkos : public FixNeighHistory {
   KOKKOS_INLINE_FUNCTION
   void post_neighbor_item(const int &ii) const;
 
-  typename Kokkos::View<int**> d_firstflag;
-  typename Kokkos::View<LMP_FLOAT**> d_firstvalue;
+  typename DAT::tdual_int_2d k_firstflag;
+  typename DAT::tdual_float_2d k_firstvalue;
 
  private:
+  typename ArrayTypes<DeviceType>::t_int_2d d_firstflag;
+  typename ArrayTypes<DeviceType>::t_float_2d d_firstvalue;
+
   typename ArrayTypes<DeviceType>::tdual_int_1d k_npartner;
   typename ArrayTypes<DeviceType>::tdual_tagint_2d k_partner;
   typename ArrayTypes<DeviceType>::tdual_float_2d k_valuepartner;
@@ -74,6 +77,7 @@ class FixNeighHistoryKokkos : public FixNeighHistory {
 
 template <class DeviceType>
 struct FixNeighHistoryKokkosZeroPartnerCountFunctor {
+  typedef DeviceType device_type;
   FixNeighHistoryKokkos<DeviceType> c;
   FixNeighHistoryKokkosZeroPartnerCountFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
   KOKKOS_INLINE_FUNCTION
@@ -84,6 +88,7 @@ struct FixNeighHistoryKokkosZeroPartnerCountFunctor {
 
 template <class DeviceType>
 struct FixNeighHistoryKokkosPreExchangeFunctor {
+  typedef DeviceType device_type;
   FixNeighHistoryKokkos<DeviceType> c;
   FixNeighHistoryKokkosPreExchangeFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
   KOKKOS_INLINE_FUNCTION
@@ -94,6 +99,7 @@ struct FixNeighHistoryKokkosPreExchangeFunctor {
 
 template <class DeviceType>
 struct FixNeighHistoryKokkosPostNeighborFunctor {
+  typedef DeviceType device_type;
   FixNeighHistoryKokkos<DeviceType> c;
   FixNeighHistoryKokkosPostNeighborFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
   KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.cpp b/src/KOKKOS/fix_nve_sphere_kokkos.cpp
index 787171b6ce..1c5011c91f 100644
--- a/src/KOKKOS/fix_nve_sphere_kokkos.cpp
+++ b/src/KOKKOS/fix_nve_sphere_kokkos.cpp
@@ -31,7 +31,7 @@ FixNVESphereKokkos<DeviceType>::FixNVESphereKokkos(LAMMPS *lmp, int narg, char *
   atomKK = (AtomKokkos *)atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 
-  datamask_read = F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK;
+  datamask_read = X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK;
   datamask_modify = X_MASK | V_MASK | OMEGA_MASK;
 }
 
diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.h b/src/KOKKOS/fix_nve_sphere_kokkos.h
index 888a1baa0d..f3e3df13d4 100644
--- a/src/KOKKOS/fix_nve_sphere_kokkos.h
+++ b/src/KOKKOS/fix_nve_sphere_kokkos.h
@@ -56,6 +56,7 @@ class FixNVESphereKokkos : public FixNVESphere {
 
 template <class DeviceType>
 struct FixNVESphereKokkosInitialIntegrateFunctor {
+  typedef DeviceType device_type;
   FixNVESphereKokkos<DeviceType> c;
   FixNVESphereKokkosInitialIntegrateFunctor(FixNVESphereKokkos<DeviceType> *c_ptr): c(*c_ptr) { c.cleanup_copy(); }
   KOKKOS_INLINE_FUNCTION
@@ -66,6 +67,7 @@ struct FixNVESphereKokkosInitialIntegrateFunctor {
 
 template <class DeviceType>
 struct FixNVESphereKokkosFinalIntegrateFunctor {
+  typedef DeviceType device_type;
   FixNVESphereKokkos<DeviceType> c;
   FixNVESphereKokkosFinalIntegrateFunctor(FixNVESphereKokkos<DeviceType> *c_ptr): c(*c_ptr) { c.cleanup_copy(); }
   KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp
index fab33d0ec7..b47b5f5a47 100644
--- a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp
+++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp
@@ -165,8 +165,11 @@ void PairGranHookeHistoryKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       d_neighbors.extent(1) != d_neighbors_touch.extent(1))
     d_neighbors_touch = typename AT::t_neighbors_2d("pair:neighbors_touch",d_neighbors.extent(0),d_neighbors.extent(1));
 
-  d_firsttouch = fix_historyKK->d_firstflag;
-  d_firstshear = fix_historyKK->d_firstvalue;
+  fix_historyKK->k_firstflag.template sync<DeviceType>();
+  fix_historyKK->k_firstvalue.template sync<DeviceType>();
+
+  d_firsttouch = fix_historyKK->k_firstflag.template view<DeviceType>();
+  d_firstshear = fix_historyKK->k_firstvalue.template view<DeviceType>();
 
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryReduce>(0,inum),*this);
 
@@ -258,6 +261,11 @@ void PairGranHookeHistoryKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     }
   }
 
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
   if (vflag_global) {
     virial[0] += ev.v[0];
     virial[1] += ev.v[1];
diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.h b/src/KOKKOS/pair_gran_hooke_history_kokkos.h
index 6b887c0df4..37fb208a70 100644
--- a/src/KOKKOS/pair_gran_hooke_history_kokkos.h
+++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.h
@@ -92,8 +92,8 @@ class PairGranHookeHistoryKokkos : public PairGranHookeHistory {
   typename AT::t_int_1d_randomread d_ilist;
   typename AT::t_int_1d_randomread d_numneigh;
 
-  typename Kokkos::View<int**> d_firsttouch;
-  typename Kokkos::View<LMP_FLOAT**> d_firstshear;
+  typename AT::t_int_2d d_firsttouch;
+  typename AT::t_float_2d d_firstshear;
 
   typename AT::t_neighbors_2d d_neighbors_touch;
   typename AT::t_int_1d d_numneigh_touch;

From ddbb8f1aa64539c70bffb99332367623a26016f0 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 13:36:32 -0600
Subject: [PATCH 06/15] Remove unnecessary data transfer in
 fix_nve_sphere_kokkos

---
 src/KOKKOS/fix_nve_sphere_kokkos.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.cpp b/src/KOKKOS/fix_nve_sphere_kokkos.cpp
index 1c5011c91f..12b170acee 100644
--- a/src/KOKKOS/fix_nve_sphere_kokkos.cpp
+++ b/src/KOKKOS/fix_nve_sphere_kokkos.cpp
@@ -31,8 +31,8 @@ FixNVESphereKokkos<DeviceType>::FixNVESphereKokkos(LAMMPS *lmp, int narg, char *
   atomKK = (AtomKokkos *)atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
 
-  datamask_read = X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK;
-  datamask_modify = X_MASK | V_MASK | OMEGA_MASK;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -61,8 +61,7 @@ void FixNVESphereKokkos<DeviceType>::init()
 template<class DeviceType>
 void FixNVESphereKokkos<DeviceType>::initial_integrate(int /*vflag*/)
 {
-  atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);
+  atomKK->sync(execution_space, X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK);
 
   x = atomKK->k_x.view<DeviceType>();
   v = atomKK->k_v.view<DeviceType>();
@@ -78,6 +77,8 @@ void FixNVESphereKokkos<DeviceType>::initial_integrate(int /*vflag*/)
 
   FixNVESphereKokkosInitialIntegrateFunctor<DeviceType> f(this);
   Kokkos::parallel_for(nlocal,f);
+
+  atomKK->modified(execution_space,  X_MASK | V_MASK | OMEGA_MASK);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -109,8 +110,7 @@ void FixNVESphereKokkos<DeviceType>::initial_integrate_item(const int i) const
 template<class DeviceType>
 void FixNVESphereKokkos<DeviceType>::final_integrate()
 {
-  atomKK->sync(execution_space,datamask_read);
-  atomKK->modified(execution_space,datamask_modify);
+  atomKK->sync(execution_space, V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK);
 
   v = atomKK->k_v.view<DeviceType>();
   omega = atomKK->k_omega.view<DeviceType>();
@@ -125,6 +125,8 @@ void FixNVESphereKokkos<DeviceType>::final_integrate()
 
   FixNVESphereKokkosFinalIntegrateFunctor<DeviceType> f(this);
   Kokkos::parallel_for(nlocal,f);
+
+  atomKK->modified(execution_space, V_MASK | OMEGA_MASK);
 }
 
 /* ---------------------------------------------------------------------- */

From f63d0202be6b5c3c674143e5ebf5aa2ceb3b02ba Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 26 Aug 2021 14:54:48 -0600
Subject: [PATCH 07/15] Rely on auto_sync in verlet setup

---
 src/KOKKOS/verlet_kokkos.cpp | 54 ++++++++----------------------------
 1 file changed, 11 insertions(+), 43 deletions(-)

diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index 909c6930cf..5e9ab757c6 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -92,40 +92,22 @@ void VerletKokkos::setup(int flag)
   // acquire ghosts
   // build neighbor lists
 
-  atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
+  lmp->kokkos->auto_sync = 1;
 
-  atomKK->setup();
+  atom->setup();
   modify->setup_pre_exchange();
-      // debug
-  atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
-  if (triclinic) domain->x2lamda(atomKK->nlocal);
+  if (triclinic) domain->x2lamda(atom->nlocal);
   domain->pbc();
-
-  atomKK->sync(Host,ALL_MASK);
-
-
   domain->reset_box();
   comm->setup();
   if (neighbor->style) neighbor->setup_bins();
-
   comm->exchange();
-
-  if (atomKK->sortfreq > 0) atomKK->sort();
-
+  if (atom->sortfreq > 0) atom->sort();
   comm->borders();
-
-  if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);
-
-  atomKK->sync(Host,ALL_MASK);
-
+  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
   domain->image_check();
   domain->box_too_small_check();
   modify->setup_pre_neighbor();
-
-  atomKK->modified(Host,ALL_MASK);
-
   neighbor->build(1);
   modify->setup_post_neighbor();
   neighbor->ncalls = 0;
@@ -144,7 +126,7 @@ void VerletKokkos::setup(int flag)
   }
   else if (force->pair) force->pair->compute_dummy(eflag,vflag);
 
-  if (atomKK->molecular != Atom::ATOMIC) {
+  if (atom->molecular != Atom::ATOMIC) {
     if (force->bond) {
       atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
       force->bond->compute(eflag,vflag);
@@ -200,35 +182,21 @@ void VerletKokkos::setup_minimal(int flag)
   // acquire ghosts
   // build neighbor lists
 
+  lmp->kokkos->auto_sync = 1;
+
   if (flag) {
-    atomKK->sync(Host,ALL_MASK);
-    atomKK->modified(Host,ALL_MASK);
-
     modify->setup_pre_exchange();
-      // debug
-      atomKK->sync(Host,ALL_MASK);
-      atomKK->modified(Host,ALL_MASK);
-
-    if (triclinic) domain->x2lamda(atomKK->nlocal);
+    if (triclinic) domain->x2lamda(atom->nlocal);
     domain->pbc();
-
-    atomKK->sync(Host,ALL_MASK);
-
     domain->reset_box();
     comm->setup();
     if (neighbor->style) neighbor->setup_bins();
     comm->exchange();
     comm->borders();
-    if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost);
-
-    atomKK->sync(Host,ALL_MASK);
-
+    if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
     domain->image_check();
     domain->box_too_small_check();
     modify->setup_pre_neighbor();
-
-    atomKK->modified(Host,ALL_MASK);
-
     neighbor->build(1);
     modify->setup_post_neighbor();
     neighbor->ncalls = 0;
@@ -247,7 +215,7 @@ void VerletKokkos::setup_minimal(int flag)
   }
   else if (force->pair) force->pair->compute_dummy(eflag,vflag);
 
-  if (atomKK->molecular != Atom::ATOMIC) {
+  if (atom->molecular != Atom::ATOMIC) {
     if (force->bond) {
       atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
       force->bond->compute(eflag,vflag);

From 90f82a8ef191b0fef12ec39397577733d7d4604a Mon Sep 17 00:00:00 2001
From: Jacob Gissinger <jrgiss05@gmail.com>
Date: Fri, 27 Aug 2021 17:03:11 -0400
Subject: [PATCH 08/15] memory leak

---
 src/REACTION/fix_bond_react.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp
index 192331889d..7b891d42fe 100644
--- a/src/REACTION/fix_bond_react.cpp
+++ b/src/REACTION/fix_bond_react.cpp
@@ -1990,7 +1990,10 @@ int FixBondReact::check_constraints()
       *ptr = satisfied[i] ? '1' : '0';
     }
     double verdict = input->variable->evaluate_boolean(evalstr);
-    if (verdict == 0.0) return 0;
+    if (verdict == 0.0) {
+      memory->destroy(satisfied);
+      return 0;
+    }
   }
 
   // let's also check chirality within 'check_constraint'
@@ -2012,7 +2015,10 @@ int FixBondReact::check_constraints()
           }
         }
       }
-      if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) return 0;
+      if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) {
+        memory->destroy(satisfied);
+        return 0;
+      }
     }
   }
 

From c779798f3f3d0729cd1a44406f1386d3e5916c71 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 27 Aug 2021 19:38:48 -0400
Subject: [PATCH 09/15] properly disable clang-format processing

---
 src/KOKKOS/compute_temp_deform_kokkos.h | 2 +-
 src/KOKKOS/pppm_kokkos.cpp              | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h
index 8b53c1f633..0292c6776d 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.h
+++ b/src/KOKKOS/compute_temp_deform_kokkos.h
@@ -1,4 +1,3 @@
-// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -20,6 +19,7 @@ ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos<LMPHostType>);
 // clang-format on
 #else
 
+// clang-format off
 #ifndef LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H
 #define LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H
 
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index a7f58f2525..d71d7d1bad 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1,3 +1,4 @@
+// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories

From 89556f0bcb3827f3c4560c2607fc4e1a8d5a84a7 Mon Sep 17 00:00:00 2001
From: Mike Brown <michael.w.brown@intel.com>
Date: Sat, 28 Aug 2021 17:01:58 -0700
Subject: [PATCH 10/15] Override any OpenCL fast math JIT settings for
 born/coul/wolf{/cs}/gpu to resolve numerical deviations seen with some OpenCL
 implementations.

---
 lib/gpu/lal_base_charge.cpp       | 16 +++++++++++-----
 lib/gpu/lal_base_charge.h         |  7 +++++--
 lib/gpu/lal_born_coul_wolf.cpp    |  2 +-
 lib/gpu/lal_born_coul_wolf_cs.cpp |  2 +-
 lib/gpu/lal_device.cpp            | 10 ++++++++++
 lib/gpu/lal_device.h              |  1 +
 6 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp
index 9045420425..84fbddd4e9 100644
--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@@ -56,7 +56,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
                              const int max_nbors, const int maxspecial,
                              const double cell_size, const double gpu_split,
                              FILE *_screen, const void *pair_program,
-                             const char *k_name) {
+                             const char *k_name, const int disable_fast_math) {
   screen=_screen;
 
   int gpu_nbor=0;
@@ -83,7 +83,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name);
+  compile_kernels(*ucl_device,pair_program,k_name,disable_fast_math);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
     nbor->packing(true);
@@ -321,14 +321,20 @@ double BaseChargeT::host_memory_usage_atomic() const {
 
 template <class numtyp, class acctyp>
 void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                  const char *kname) {
+                                  const char *kname,
+                                  const int disable_fast_math) {
   if (_compiled)
     return;
 
   std::string s_fast=std::string(kname)+"_fast";
   if (pair_program) delete pair_program;
   pair_program=new UCL_Program(dev);
-  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
+  std::string device_compile_string;
+  if (disable_fast_math)
+    device_compile_string = device->compile_string_nofast();
+  else
+    device_compile_string = device->compile_string();
+  std::string oclstring = device_compile_string+" -DEVFLAG=1";
   pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
   k_pair_fast.set_function(*pair_program,s_fast.c_str());
   k_pair.set_function(*pair_program,kname);
@@ -336,7 +342,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
   q_tex.get_texture(*pair_program,"q_tex");
 
   #if defined(LAL_OCL_EV_JIT)
-  oclstring = device->compile_string()+" -DEVFLAG=0";
+  oclstring = device_compile_string+" -DEVFLAG=0";
   if (pair_program_noev) delete pair_program_noev;
   pair_program_noev=new UCL_Program(dev);
   pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h
index 6b8761092a..307c5c079f 100644
--- a/lib/gpu/lal_base_charge.h
+++ b/lib/gpu/lal_base_charge.h
@@ -44,6 +44,7 @@ class BaseCharge {
     * \param cell_size cutoff + skin
     * \param gpu_split fraction of particles handled by device
     * \param k_name name for the kernel for force calculation
+    * \param disable_fast_math override any fast math opts for kernel JIT
     *
     * Returns:
     * -  0 if successful
@@ -54,7 +55,8 @@ class BaseCharge {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const double cell_size,
                   const double gpu_split, FILE *screen,
-                  const void *pair_program, const char *k_name);
+                  const void *pair_program, const char *k_name,
+                  const int disable_fast_math = 0);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead(const int add_kernels=0);
@@ -198,7 +200,8 @@ class BaseCharge {
   double _gpu_overhead, _driver_overhead;
   UCL_D_Vec<int> *_nbor_data;
 
-  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
+  void compile_kernels(UCL_Device &dev, const void *pair_string,
+                       const char *k, const int disable_fast_math);
 
   virtual int loop(const int eflag, const int vflag) = 0;
 };
diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp
index e6caebbab8..9aac866353 100644
--- a/lib/gpu/lal_born_coul_wolf.cpp
+++ b/lib/gpu/lal_born_coul_wolf.cpp
@@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
                         const double alf, const double e_shift, const double f_shift) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,born_coul_wolf,"k_born_coul_wolf");
+                            _screen,born_coul_wolf,"k_born_coul_wolf",1);
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_born_coul_wolf_cs.cpp b/lib/gpu/lal_born_coul_wolf_cs.cpp
index 8deceeb1f4..abd4da439a 100644
--- a/lib/gpu/lal_born_coul_wolf_cs.cpp
+++ b/lib/gpu/lal_born_coul_wolf_cs.cpp
@@ -42,7 +42,7 @@ int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_r
                         const double alf, const double e_shift, const double f_shift) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs");
+                            _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs",1);
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index e2b5b9cdb5..0ff7125089 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -420,6 +420,30 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) {
   return 0;
 }
 
+/// Return a copy of the OpenCL JIT compile options with fast math disabled
+/** Strips the first "-cl-fast-relaxed-math" flag and rewrites the first
+  * character of the first "-DFAST_MATH=" value to '0'. The stored
+  * _ocl_compile_string itself is not modified. **/
+template <class numtyp, class acctyp>
+std::string DeviceT::compile_string_nofast() {
+  const std::string fast_flag = "-cl-fast-relaxed-math";
+  const std::string fast_def = "-DFAST_MATH=";
+  std::string no_fast = _ocl_compile_string;
+  size_t p = no_fast.find(fast_flag);
+  if (p != std::string::npos) {
+    // also consume one separating blank, if present, so that the flag is
+    // removed even when it is the last option in the string
+    size_t len = fast_flag.size();
+    if ((p + len < no_fast.size()) && (no_fast[p + len] == ' ')) ++len;
+    no_fast.erase(p,len);
+  }
+  p = no_fast.find(fast_def);
+  // only overwrite when a value character actually follows the '='
+  if ((p != std::string::npos) && (p + fast_def.size() < no_fast.size()))
+    no_fast[p + fast_def.size()]='0';
+  return no_fast;
+}
+
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                   const bool rot, const int nlocal,
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 1db6ae3127..933a3508b5 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -312,6 +312,7 @@ class Device {
   }
 
   inline std::string compile_string() { return _ocl_compile_string; }
+  std::string compile_string_nofast();
   inline std::string ocl_config_name() { return _ocl_config_name; }
 
   template <class t>

From fb72e00081bef2f61c7cb4689829c9c12e57939f Mon Sep 17 00:00:00 2001
From: Mike Brown <michael.w.brown@intel.com>
Date: Sat, 28 Aug 2021 17:18:05 -0700
Subject: [PATCH 11/15] Fix (the fix) for _MM_SCALE preprocessor defines for
 future Intel compilers.

---
 src/INTEL/intel_preprocess.h | 4 ----
 src/INTEL/intel_simd.h       | 7 +++++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h
index 27daa5f3d2..c7bd60b00b 100644
--- a/src/INTEL/intel_preprocess.h
+++ b/src/INTEL/intel_preprocess.h
@@ -20,10 +20,6 @@
 #define USE_OMP_SIMD
 #define __INTEL_COMPILER __INTEL_LLVM_COMPILER
 #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
-#define _MM_SCALE_1 1
-#define _MM_SCALE_2 2
-#define _MM_SCALE_4 4
-#define _MM_SCALE_8 8
 #endif
 
 #ifdef __INTEL_COMPILER
diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h
index 2affa6a394..d75b2b9175 100644
--- a/src/INTEL/intel_simd.h
+++ b/src/INTEL/intel_simd.h
@@ -35,6 +35,13 @@ authors for more details.
 
 #ifdef __AVX512F__
 
+#ifndef _MM_SCALE_1  // fallback: defined here only when _MM_SCALE_1 is not already defined
+#define _MM_SCALE_1 1
+#define _MM_SCALE_2 2
+#define _MM_SCALE_4 4
+#define _MM_SCALE_8 8
+#endif  // _MM_SCALE_1
+
 namespace ip_simd {
 
   typedef __mmask16 SIMD_mask;

From 39d8b239ff1c58394dc49d31e8f3c0c43a1baf80 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 29 Aug 2021 17:56:47 -0400
Subject: [PATCH 12/15] don't report bogus timings

---
 lib/gpu/lal_base_ellipsoid.cpp | 11 +++++++----
 lib/gpu/lal_device.cpp         | 12 +++++++-----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp
index 2e22b2f602..fa060bea5a 100644
--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@@ -224,7 +224,9 @@ void BaseEllipsoidT::output_times() {
 
   #ifdef USE_OPENCL
   // Workaround for timing issue on Intel OpenCL
+  if (times[0] > 80e6) times[0]=0.0;
   if (times[3] > 80e6) times[3]=0.0;
+  if (times[6] > 80e6) times[6]=0.0;
   #endif
 
   if (device->replica_me()==0)
@@ -237,17 +239,18 @@ void BaseEllipsoidT::output_times() {
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (device->procs_per_gpu()==1 && times[3]>0) {
-        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/replica_size);
+      if (device->procs_per_gpu()==1 && (times[3] > 0.0)) {
+        if (times[0] > 0.0)
+          fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/replica_size);
-        if (nbor->gpu_nbor()>0)
+        if (nbor->gpu_nbor() > 0.0)
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/replica_size);
         fprintf(screen,"LJ calc:         %.4f s.\n",times[4]/replica_size);
       }
-      if (times[6]>0)
+      if (times[6] > 0.0)
         fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Lanes / atom:    %d.\n",_threads_per_atom);
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index e2b5b9cdb5..50046d8bdd 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -777,28 +777,30 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer<numtyp,acctyp> &ans,
 
   #ifdef USE_OPENCL
   // Workaround for timing issue on Intel OpenCL
+  if (times[0] > 80e6) times[0]=0.0;
   if (times[3] > 80e6) times[3]=0.0;
   if (times[5] > 80e6) times[5]=0.0;
   #endif
 
   if (replica_me()==0)
-    if (screen && times[6]>0.0) {
+    if (screen && (times[6] > 0.0)) {
       fprintf(screen,"\n\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
       fprintf(screen,"      Device Time Info (average): ");
       fprintf(screen,"\n-------------------------------------");
       fprintf(screen,"--------------------------------\n");
 
-      if (time_device() && times[3]>0) {
-        fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
+      if (time_device() && (times[3] > 0.0)) {
+        if (times[0] > 0.0)
+          fprintf(screen,"Data Transfer:   %.4f s.\n",times[0]/_replica_size);
         fprintf(screen,"Neighbor copy:   %.4f s.\n",times[1]/_replica_size);
-        if (nbor.gpu_nbor()>0)
+        if (nbor.gpu_nbor() > 0.0)
           fprintf(screen,"Neighbor build:  %.4f s.\n",times[2]/_replica_size);
         else
           fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
         fprintf(screen,"Force calc:      %.4f s.\n",times[3]/_replica_size);
       }
-      if (times[5]>0)
+      if (times[5] > 0.0)
         fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size);
       fprintf(screen,"Average split:   %.4f.\n",avg_split);
       fprintf(screen,"Lanes / atom:    %d.\n",threads_per_atom);

From 664a07a3fe16e5bd2455bda218172f2c810bf143 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 29 Aug 2021 17:57:30 -0400
Subject: [PATCH 13/15] disallow GPU neighbor list with hybrid pair styles
 (which still has problems)

---
 src/GPU/fix_gpu.cpp | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp
index 71ab3f4cb4..843bff2a35 100644
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@@ -13,25 +13,26 @@
 ------------------------------------------------------------------------- */
 
 #include "fix_gpu.h"
-#include <cstring>
 
 #include "atom.h"
+#include "citeme.h"
 #include "comm.h"
+#include "domain.h"
+#include "error.h"
 #include "force.h"
+#include "gpu_extra.h"
+#include "input.h"
+#include "modify.h"
+#include "neighbor.h"
 #include "pair.h"
 #include "pair_hybrid.h"
 #include "pair_hybrid_overlay.h"
 #include "respa.h"
-#include "input.h"
 #include "timer.h"
-#include "modify.h"
-#include "update.h"
-#include "domain.h"
 #include "universe.h"
-#include "gpu_extra.h"
-#include "neighbor.h"
-#include "citeme.h"
-#include "error.h"
+#include "update.h"
+
+#include <cstring>
 
 #if (LAL_USE_OMP == 1)
 #include <omp.h>
@@ -275,12 +276,15 @@ void FixGPU::init()
     error->warning(FLERR,"Using package gpu without any pair style defined");
 
   // make sure fdotr virial is not accumulated multiple times
+  // also disallow GPU neighbor lists for hybrid styles
 
   if (force->pair_match("^hybrid",0) != nullptr) {
     PairHybrid *hybrid = (PairHybrid *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (!utils::strmatch(hybrid->keywords[i],"/gpu$"))
         force->pair->no_virial_fdotr_compute = 1;
+    if (_gpu_mode != GPU_FORCE)
+      error->all(FLERR, "Must not use GPU neighbor lists with hybrid pair style");
   }
 
   // rRESPA support
@@ -295,8 +299,7 @@ void FixGPU::setup(int vflag)
 {
   if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
     if (neighbor->exclude_setting() != 0)
-      error->all(FLERR,
-                 "Cannot use neigh_modify exclude with GPU neighbor builds");
+      error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds");
 
   if (utils::strmatch(update->integrate_style,"^verlet")) post_force(vflag);
   else {

From 284ed98fb8c3979b625261ceb4443bec7ddfdd2c Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 29 Aug 2021 22:08:49 -0400
Subject: [PATCH 14/15] fix spelling error and reformat paragraph

---
 doc/src/pair_snap.rst | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst
index e00169b306..1bc17fa8c8 100644
--- a/doc/src/pair_snap.rst
+++ b/doc/src/pair_snap.rst
@@ -191,22 +191,19 @@ pair_coeff command, to avoid ambiguity in the number of coefficients.
 
 The keywords *chunksize* and *parallelthresh* are only applicable when
 using the pair style *snap* with the KOKKOS package on GPUs and are
-ignored otherwise.
-The *chunksize* keyword controls
-the number of atoms in each pass used to compute the bispectrum
-components and is used to avoid running out of memory. For example
-if there are 8192 atoms in the simulation and the *chunksize*
-is set to 4096, the bispectrum calculation will be broken up
-into two passes (running on a single GPU).
-The *parallelthresh* keyword controls
-a crossover threshold for performing extra parallelism. For
-small systems, exposing additional parallism can be beneficial when
-there is not enough work to fully saturate the GPU threads otherwise.
-However, the extra parallelism also leads to more divergence
-and can hurt performance when the system is already large enough to
-saturate the GPU threads. Extra parallelism will be performed if the
-*chunksize* (or total number of atoms per GPU) is smaller than
-*parallelthresh*.
+ignored otherwise.  The *chunksize* keyword controls the number of atoms
+in each pass used to compute the bispectrum components and is used to
+avoid running out of memory.  For example, if there are 8192 atoms in the
+simulation and the *chunksize* is set to 4096, the bispectrum
+calculation will be broken up into two passes (running on a single GPU).
+The *parallelthresh* keyword controls a crossover threshold for
+performing extra parallelism.  For small systems, exposing additional
+parallelism can be beneficial when there is not enough work to fully
+saturate the GPU threads otherwise.  However, the extra parallelism also
+leads to more divergence and can hurt performance when the system is
+already large enough to saturate the GPU threads.  Extra parallelism
+will be performed if the *chunksize* (or total number of atoms per GPU)
+is smaller than *parallelthresh*.
 
 Detailed definitions for all the other keywords
 are given on the :doc:`compute sna/atom <compute_sna_atom>` doc page.

From 00c3c5cf06b013e4cad7eb052ab5634aa3a16e15 Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Mon, 30 Aug 2021 12:43:07 -0600
Subject: [PATCH 15/15] Port changes from #2903 to Kokkos

---
 src/KOKKOS/fix_property_atom_kokkos.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp
index 841b791c5c..97931f5d67 100644
--- a/src/KOKKOS/fix_property_atom_kokkos.cpp
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@@ -45,23 +45,23 @@ FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg)
 void FixPropertyAtomKokkos::grow_arrays(int nmax)
 {
   for (int m = 0; m < nvalue; m++) {
-    if (style[m] == MOLECULE) {
+    if (styles[m] == MOLECULE) {
       memory->grow(atom->molecule,nmax,"atom:molecule");
       size_t nbytes = (nmax-nmax_old) * sizeof(tagint);
       memset(&atom->molecule[nmax_old],0,nbytes);
-    } else if (style[m] == CHARGE) {
+    } else if (styles[m] == CHARGE) {
       memory->grow(atom->q,nmax,"atom:q");
       size_t nbytes = (nmax-nmax_old) * sizeof(double);
       memset(&atom->q[nmax_old],0,nbytes);
-    } else if (style[m] == RMASS) {
+    } else if (styles[m] == RMASS) {
       memory->grow(atom->rmass,nmax,"atom:rmass");
       size_t nbytes = (nmax-nmax_old) * sizeof(double);
       memset(&atom->rmass[nmax_old],0,nbytes);
-    } else if (style[m] == INTEGER) {
+    } else if (styles[m] == INTEGER) {
       memory->grow(atom->ivector[index[m]],nmax,"atom:ivector");
       size_t nbytes = (nmax-nmax_old) * sizeof(int);
       memset(&atom->ivector[index[m]][nmax_old],0,nbytes);
-    } else if (style[m] == DOUBLE) {
+    } else if (styles[m] == DOUBLE) {
       atomKK->sync(Device,DVECTOR_MASK);
       memoryKK->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.extent(0),nmax,
                           "atom:dvector");