From f560cd6dd596857d14d3663e3c225fbaec2096d0 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sat, 21 Mar 2020 01:08:09 -0400
Subject: [PATCH 01/14] make certain, the molecular flag is always initialized

---
 src/atom_vec.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp
index 7b89c2fd79..c4dd53ad18 100644
--- a/src/atom_vec.cpp
+++ b/src/atom_vec.cpp
@@ -36,6 +36,7 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
   forceclearflag = 0;
   size_data_bonus = 0;
   maxexchange = 0;
+  molecular = 0;
 
   kokkosable = 0;
 

From 79b84c0847d0f9504527ce0b5dcb99688af2943c Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 22 Mar 2020 15:44:28 -0400
Subject: [PATCH 02/14] more thorough checking if BUILD_OMP may be enabled by
 default. we need the OpenMP runtime, too.

---
 cmake/CMakeLists.txt | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 4dd079eaae..36bed2d649 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -227,7 +227,15 @@ pkg_depends(USER-LB MPI)
 pkg_depends(USER-PHONON KSPACE)
 pkg_depends(USER-SCAFACOS MPI)
 
+# detect if we may enable OpenMP support by default
+set(BUILD_OMP_DEFAULT OFF)
 find_package(OpenMP QUIET)
+if(OpenMP_FOUND)
+  check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE)
+  if(HAVE_OMP_H_INCLUDE)
+    set(BUILD_OMP_DEFAULT ON)
+  endif()
+endif()
 
 # TODO: this is a temporary workaround until a better solution is found. AK 2019-05-30
 # GNU GCC 9.x uses settings incompatible with our use of 'default(none)' in OpenMP pragmas
@@ -237,14 +245,14 @@ find_package(OpenMP QUIET)
 if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.99.9))
   option(BUILD_OMP "Build with OpenMP support" OFF)
 else()
-  option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND})
+  option(BUILD_OMP "Build with OpenMP support" ${BUILD_OMP_DEFAULT})
 endif()
 
 if(BUILD_OMP)
   find_package(OpenMP REQUIRED)
   check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE)
   if(NOT HAVE_OMP_H_INCLUDE)
-    message(FATAL_ERROR "Cannot find required 'omp.h' header file")
+    message(FATAL_ERROR "Cannot find the 'omp.h' header file required for full OpenMP support")
   endif()
   set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")

From e1f01d3e6514eecb83584c1e2b14c88569db70bb Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 22 Mar 2020 21:14:26 -0400
Subject: [PATCH 03/14] use consistent naming for c++11 style kspace thread

---
 src/USER-INTEL/verlet_lrt_intel.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index a3c05c46bc..bd143c4c94 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -185,7 +185,7 @@ void VerletLRTIntel::setup(int flag)
   _kspace_done = 0;
   pthread_mutex_unlock(&_kmutex);
   #elif defined(_LMP_INTEL_LRT_11)
-  kspace_thread.join();
+  _kspace_thread.join();
   #endif
 
   if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag);
@@ -298,9 +298,9 @@ void VerletLRTIntel::run(int n)
     pthread_cond_signal(&_kcond);
     pthread_mutex_unlock(&_kmutex);
     #elif defined(_LMP_INTEL_LRT_11)
-    std::thread kspace_thread;
+    std::thread _kspace_thread;
     if (kspace_compute_flag)
-      kspace_thread=std::thread([=] {
+      _kspace_thread=std::thread([=] {
         _intel_kspace->compute_first(eflag, vflag);
         timer->stamp(Timer::KSPACE);
       } );
@@ -332,7 +332,7 @@ void VerletLRTIntel::run(int n)
     pthread_mutex_unlock(&_kmutex);
     #elif defined(_LMP_INTEL_LRT_11)
     if (kspace_compute_flag)
-      kspace_thread.join();
+      _kspace_thread.join();
     #endif
 
     if (kspace_compute_flag) {

From 98bfbbd57630fb21421d73c2478571068e343a5c Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 22 Mar 2020 22:21:12 -0400
Subject: [PATCH 04/14] fix typo in CMake module

---
 cmake/Modules/Packages/USER-INTEL.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/Packages/USER-INTEL.cmake b/cmake/Modules/Packages/USER-INTEL.cmake
index d0941a0a12..9ae4333ee2 100644
--- a/cmake/Modules/Packages/USER-INTEL.cmake
+++ b/cmake/Modules/Packages/USER-INTEL.cmake
@@ -31,7 +31,7 @@ if(PKG_USER-INTEL)
     endif()
   endif()
   if(INTEL_LRT_MODE STREQUAL "C++11")
-    add_definitions(-DLMP_INTEL_USERLRT -DLMP_INTEL_LRT11)
+    add_definitions(-DLMP_INTEL_USELRT -DLMP_INTEL_LRT11)
   endif()
 
   if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")

From 6e1f18961c87595058f2f8c83414ba0717eb1e13 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Mon, 23 Mar 2020 09:42:46 -0400
Subject: [PATCH 05/14] Convert characters to UTF-8

---
 src/KSPACE/msm.cpp                    | 4 ++--
 src/USER-INTEL/pair_tersoff_intel.cpp | 2 +-
 src/USER-SMTBQ/pair_smtbq.cpp         | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp
index 126236a328..1520c2c607 100644
--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
@@ -2920,7 +2920,7 @@ void MSM::compute_phis_and_dphis(const double &dx, const double &dy,
 
 /* ----------------------------------------------------------------------
    compute phi using interpolating polynomial
-   see Eq 7 from Parallel Computing 35 (2009) 164–177
+   see Eq 7 from Parallel Computing 35 (2009) 164-177
    and Hardy's thesis
 ------------------------------------------------------------------------- */
 
@@ -2999,7 +2999,7 @@ inline double MSM::compute_phi(const double &xi)
 /* ----------------------------------------------------------------------
    compute the derivative of phi
    phi is an interpolating polynomial
-   see Eq 7 from Parallel Computing 35 (2009) 164–177
+   see Eq 7 from Parallel Computing 35 (2009) 164-177
    and Hardy's thesis
 ------------------------------------------------------------------------- */
 
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index 76d06b02dd..8784029320 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Markus Höhnerbach (RWTH)
+   Contributing author: Markus HÃ¶hnerbach (RWTH)
 ------------------------------------------------------------------------- */
 
 #include <cmath>
diff --git a/src/USER-SMTBQ/pair_smtbq.cpp b/src/USER-SMTBQ/pair_smtbq.cpp
index f61fc1a72e..fcf6d141f2 100644
--- a/src/USER-SMTBQ/pair_smtbq.cpp
+++ b/src/USER-SMTBQ/pair_smtbq.cpp
@@ -13,7 +13,7 @@
 
 /* ----------------------------------------------------------------------
    The SMTBQ code has been developed with the financial support of  CNRS and
-   of the Regional Council of Burgundy (Convention n¡ 2010-9201AAO037S03129)
+   of the Regional Council of Burgundy (Convention nÂ¡ 2010-9201AAO037S03129)
 
    Copyright (2015)
    Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO
@@ -943,7 +943,7 @@ void PairSMTBQ::compute(int eflag, int vflag)
      3 -> Short int. Ox-Ox
      4 -> Short int. SMTB (repulsion)
      5 -> Covalent energy SMTB
-     6 -> Somme des Q(i)²
+     6 -> Somme des Q(i)Â²
      ------------------------------------------------------------------------- */
 
   /* -------------- N-body forces Calcul --------------- */
@@ -3022,7 +3022,7 @@ void PairSMTBQ::groupQEqAllParallel_QEq()
 
   ngp = igp = 0; nelt[ngp] = 0;
 
-  // On prend un oxygène
+  // On prend un oxygÃ¨ne
   //   printf ("[me %d] On prend un oxygene\n",me);
 
   for (ii = 0; ii < inum; ii++) {

From bcfc606efb2622e9afdfae6984f6b01a187f0997 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <eweinberg@nvidia.com>
Date: Mon, 23 Mar 2020 13:20:56 -0700
Subject: [PATCH 06/14] SNAP optimizations, kernel fusion, large reduction of
 memory usage on the GPU, misc. performance optimizations.

---
 src/KOKKOS/pair_snap_kokkos.h      |  12 +-
 src/KOKKOS/pair_snap_kokkos_impl.h | 111 ++---
 src/KOKKOS/sna_kokkos.h            |  20 +-
 src/KOKKOS/sna_kokkos_impl.h       | 676 ++++++++++++++---------------
 4 files changed, 349 insertions(+), 470 deletions(-)

diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index e53dec4d86..b57ef2d9e5 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -37,15 +37,13 @@ struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeNeigh{};
 struct TagPairSNAPPreUi{};
 struct TagPairSNAPComputeUi{};
-struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
 struct TagPairSNAPComputeUiCPU{};
 struct TagPairSNAPComputeZi{};
 struct TagPairSNAPComputeBi{};
 struct TagPairSNAPZeroYi{};
 struct TagPairSNAPComputeYi{};
-struct TagPairSNAPComputeDuidrj{};
+struct TagPairSNAPComputeFusedDeidrj{};
 struct TagPairSNAPComputeDuidrjCPU{};
-struct TagPairSNAPComputeDeidrj{};
 struct TagPairSNAPComputeDeidrjCPU{};
 
 template<class DeviceType>
@@ -83,9 +81,6 @@ public:
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
 
@@ -102,14 +97,11 @@ public:
   void operator() (TagPairSNAPComputeYi,const int& ii) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const;
+  void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;
 
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index 1156d11c31..d807f149a9 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -30,7 +30,6 @@
 #include "kokkos.h"
 #include "sna.h"
 
-
 #define MAXLINE 1024
 #define MAXWORD 3
 
@@ -255,26 +254,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
       // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
         // 2 is for double buffer
-      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
 
+      const int tile_size = (twojmax+1)*(twojmax+1);
       typedef Kokkos::View< SNAcomplex*,
                             Kokkos::DefaultExecutionSpace::scratch_memory_space,
                             Kokkos::MemoryTraits<Kokkos::Unmanaged> >
               ScratchViewType;
-      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1));
+      int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size );
+
+      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
       policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
 
       Kokkos::parallel_for("ComputeUi",policy_ui,*this);
 
-      // ComputeUitot
-      vector_length = 1;
-      team_size = 128;
-      team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
-      if (team_size*vector_length > team_size_max)
-        team_size = team_size_max/vector_length;
-
-      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
-      Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
     }
 
 
@@ -316,7 +308,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
     Kokkos::parallel_for("ComputeYi",policy_yi,*this);
 
-    //ComputeDuidrj
+    //ComputeDuidrj and Deidrj
     if (lmp->kokkos->ngpus == 0) { // CPU
       int vector_length = 1;
       int team_size = 1;
@@ -324,53 +316,37 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
       snaKK.set_dir(-1); // technically doesn't do anything
       Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
-    } else { // GPU, utilize scratch memory and splitting over dimensions
 
-      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+
+      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
+    } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
+
+      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::team_size_max(*this);
       int vector_length = 32;
       int team_size = 2; // need to cap b/c of shared memory reqs
       if (team_size*vector_length > team_size_max)
         team_size = team_size_max/vector_length;
 
-      // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
+      // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
       // 2 is for double buffer
-      typedef Kokkos::View< SNAcomplex*,
-                          Kokkos::DefaultExecutionSpace::scratch_memory_space,
-                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >
-            ScratchViewType;
+      const int tile_size = (twojmax+1)*(twojmax/2+1);
+
+      typedef Kokkos::View< SNAcomplex*,
+                            Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+              ScratchViewType;
+      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size);
+
+      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
+      policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
 
-      int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
-      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-      policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
-      // Need to call three times, once for each direction
       for (int k = 0; k < 3; k++) {
         snaKK.set_dir(k);
-        Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this);
+        Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
       }
     }
 
-    //ComputeDeidrj
-    if (lmp->kokkos->ngpus == 0) { // CPU
-      int vector_length = 1;
-      int team_size = 1;
-
-      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
-      Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
-
-    } else { // GPU, different loop strategy internally
-
-      int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
-      int vector_length = 32; // coalescing disaster right now, will fix later
-      int team_size = 8;
-      if (team_size*vector_length > team_size_max)
-        team_size = team_size_max/vector_length;
-
-      typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-
-      Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
-    }
-
     //ComputeForce
     if (eflag) {
       if (neighflag == HALF) {
@@ -642,25 +618,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
   my_sna.compute_ui(team,ii,jj);
 }
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  // Extract the quantum number
-  const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
-  if (idx >= my_sna.idxu_max) return;
-
-  // Extract the atomic index
-  const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
-  if (ii >= chunk_size) return;
-
-  // Extract the number of neighbors neighbor number
-  const int ninside = d_ninside(ii);
-
-  my_sna.compute_uitot(team,idx,ii,ninside);
-}
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
@@ -718,7 +675,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const {
+void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
   SNAKokkos<DeviceType> my_sna = snaKK;
 
   // Extract the atom number
@@ -730,7 +687,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
   const int ninside = d_ninside(ii);
   if (jj >= ninside) return;
 
-  my_sna.compute_duidrj(team,ii,jj);
+  my_sna.compute_fused_deidrj(team,ii,jj);
 }
 
 template<class DeviceType>
@@ -750,24 +707,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const t
   my_sna.compute_duidrj_cpu(team,ii,jj);
 }
 
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
-
-  my_sna.compute_deidrj(team,ii,jj);
-}
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 48d9114fbf..a6d9db3218 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -135,14 +135,10 @@ inline
   KOKKOS_INLINE_FUNCTION
   void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
+  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
-  void compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, int); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
   void compute_zi(const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void zero_yi(const int&,const int&); // ForceSNAP
@@ -155,12 +151,10 @@ inline
   // functions for derivatives
 
   KOKKOS_INLINE_FUNCTION
-  void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
+  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
   void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   double compute_sfac(double, double); // add_uarraytot, compute_duarray
@@ -251,10 +245,6 @@ inline
   KOKKOS_INLINE_FUNCTION
   void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui
 
-  KOKKOS_INLINE_FUNCTION
-  void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                      double, double, double,
-                      double, double); // compute_ui
   KOKKOS_INLINE_FUNCTION
   void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
                       double, double, double,
@@ -267,12 +257,8 @@ inline
 inline
   int compute_ncoeff();           // SNAKokkos()
   KOKKOS_INLINE_FUNCTION
-  void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       double, double, double, // compute_duidrj
-                       double, double, double, double, double);
-  KOKKOS_INLINE_FUNCTION
   void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       double, double, double, // compute_duidrj
+                       double, double, double, // compute_duidrj_cpu
                        double, double, double, double, double);
 
   // Sets the style for the switching function
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index ef3312bd16..1daf8fd05c 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -19,6 +19,7 @@
 #include <cmath>
 #include <cstring>
 #include <cstdlib>
+#include <type_traits>
 
 namespace LAMMPS_NS {
 
@@ -231,11 +232,22 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
   zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom);
 
   //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
-  ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
+#ifdef KOKKOS_ENABLE_CUDA
+  if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
+    // dummy allocation
+    ulist = t_sna_3c_ll("sna:ulist",1,1,1);
+    dulist = t_sna_4c_ll("sna:dulist",1,1,1);
+  } else {
+#endif
+    ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
+    dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
+#ifdef KOKKOS_ENABLE_CUDA
+  }
+#endif
+
   //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
   ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom);
 
-  //dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
   dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
 }
 
@@ -269,14 +281,14 @@ void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>
 }
 
 /* ----------------------------------------------------------------------
-   compute Ui by summing over bispectrum components
+   compute Ui by computing Wigner U-functions for one neighbor and
+   accumulating to the total. GPU only.
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
 {
-  double rsq, r, x, y, z, z0, theta0;
 
   // utot(j,ma,mb) = 0 for all j,ma,ma
   // utot(j,ma,ma) = 1 for all j,ma
@@ -284,22 +296,143 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
   //   compute r0 = (x,y,z,z0)
   //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
 
-  x = rij(iatom,jnbor,0);
-  y = rij(iatom,jnbor,1);
-  z = rij(iatom,jnbor,2);
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
+  // get shared memory offset
+  const int max_m_tile = (twojmax+1)*(twojmax+1);
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * max_m_tile;
 
-  theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
+  // double buffer
+  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+
+  const double x = rij(iatom,jnbor,0);
+  const double y = rij(iatom,jnbor,1);
+  const double z = rij(iatom,jnbor,2);
+
+  const double wj_local = wj(iatom, jnbor);
+  const double rcut = rcutij(iatom, jnbor);
+
+  const double rsq = x * x + y * y + z * z;
+  const double r = sqrt(rsq);
+
+  const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
   //    theta0 = (r - rmin0) * rscale0;
-  z0 = r / tan(theta0);
+  const double cs = cos(theta0);
+  const double sn = sin(theta0);
+  const double z0 = r * cs / sn; // r / tan(theta0)
 
-  compute_uarray(team, iatom, jnbor, x, y, z, z0, r);
+  // Compute cutoff function
+  const double sfac = compute_sfac(r, rcut) * wj_local;
 
-  // if we're on the GPU, accumulating into uarraytot is done in a separate kernel.
-  // if we're not, it's more efficient to include it in compute_uarray.
+  // compute Cayley-Klein parameters for unit quaternion,
+  // pack into complex number
+  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  const SNAcomplex a = { r0inv * z0, -r0inv * z };
+  const SNAcomplex b = { r0inv * y, -r0inv * x };
+
+  // VMK Section 4.8.2
+
+  // All writes go to global memory and shared memory
+  // so we can avoid all global memory reads!
+  Kokkos::single(Kokkos::PerThread(team), [=]() {
+    //ulist(0,iatom,jnbor) = { 1.0, 0.0 };
+    buf1[0] = {1.,0.};
+    Kokkos::atomic_add(&(ulisttot(0,iatom).re), sfac);
+  });
+
+  for (int j = 1; j <= twojmax; j++) {
+    const int jju = idxu_block[j];
+    const int jjup = idxu_block[j-1];
+
+    // fill in left side of matrix layer from previous layer
+
+    // Flatten loop over ma, mb, need to figure out total
+    // number of iterations
+
+    // for (int ma = 0; ma <= j; ma++)
+    const int n_ma = j+1;
+    // for (int mb = 0; 2*mb <= j; mb++)
+    const int n_mb = j/2+1;
+
+    // the last (j / 2) can be avoided due to symmetry
+    const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
+
+    //for (int m = 0; m < total_iters; m++) {
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
+      [&] (const int m) {
+
+      // ma fast, mb slow
+      int ma = m % n_ma;
+      int mb = m / n_ma;
+
+      // index into global memory array
+      const int jju_index = jju+m;
+      //const int jjup_index = jjup+mb*j+ma;
+
+      // index into shared memory buffer for this level
+      const int jju_shared_idx = m;
+
+      // index into shared memory buffer for next level
+      const int jjup_shared_idx = jju_shared_idx - mb;
+
+      SNAcomplex u_accum = {0., 0.};
+
+      // VMK recursion relation: grab contribution which is multiplied by b*
+      const double rootpq2 = -rootpqarray(ma, j - mb);
+      const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
+      //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
+      caconjxpy(b, u_up2, u_accum);
+      
+      // VMK recursion relation: grab contribution which is multiplied by a*
+      const double rootpq1 = rootpqarray(j - ma, j - mb);
+      const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
+      //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.);
+      caconjxpy(a, u_up1, u_accum);
+
+      //ulist(jju_index,iatom,jnbor) = u_accum;
+      // back up into shared memory for next iter
+      buf2[jju_shared_idx] = u_accum;
+
+      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re);
+      Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im);
+
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
+      // if j is even (-> physical j integer), last element maps to self, skip
+      //if (!(m == total_iters - 1 && j % 2 == 0)) { 
+      if (m < total_iters - 1 || j % 2 == 1) {
+        const int sign_factor = (((ma+mb)%2==0)?1:-1);
+        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
+        const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
+        
+
+        if (sign_factor == 1) {
+          u_accum.im = -u_accum.im;
+        } else {
+          u_accum.re = -u_accum.re;
+        }
+        //ulist(jjup_flip,iatom,jnbor) = u_accum;
+        buf2[jju_shared_flip] = u_accum;
+
+        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re);
+        Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im);
+      }
+    });
+    // In CUDA backend,
+    // ThreadVectorRange has a __syncwarp (appropriately masked for 
+    // vector lengths < 32) implict at the end
+
+    // swap double buffers
+    auto tmp = buf1; buf1 = buf2; buf2 = tmp;
+    
+
+  }
 }
 
+/* ----------------------------------------------------------------------
+   compute Ui by summing over bispectrum components. CPU only.
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@@ -327,37 +460,6 @@ void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<Dev
 
 }
 
-/* ----------------------------------------------------------------------
-   compute UiTot by summing over neighbors
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int idx, int iatom, int ninside)
-{
-  // fuse initialize in, avoid this load?
-  SNAcomplex utot = ulisttot(idx, iatom);
-  for (int jnbor = 0; jnbor < ninside; jnbor++) {
-
-    const auto x = rij(iatom,jnbor,0);
-    const auto y = rij(iatom,jnbor,1);
-    const auto z = rij(iatom,jnbor,2);
-    const auto rsq = x * x + y * y + z * z;
-    const auto r = sqrt(rsq);
-
-    const double wj_local = wj(iatom, jnbor);
-    const double rcut = rcutij(iatom, jnbor);
-    const double sfac = compute_sfac(r, rcut) * wj_local;
-
-    auto ulist_local = ulist(idx, iatom, jnbor);
-    utot.re += sfac * ulist_local.re;
-    utot.im += sfac * ulist_local.im;
-  }
-
-  ulisttot(idx, iatom) = utot;
-
-}
-
 /* ----------------------------------------------------------------------
    compute Zi by summing over products of Ui
    not updated yet
@@ -509,72 +611,203 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
 }
 
 /* ----------------------------------------------------------------------
-   compute dEidRj
+   Fused calculation of the derivative of Ui w.r.t. atom j
+   and of dEidRj. GPU only.
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
 {
-  t_scalar3<double> final_sum;
+  // get shared memory offset
+  const int max_m_tile = (twojmax+1)*(twojmax/2+1);
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * max_m_tile;
 
-  // Like in ComputeUi/ComputeDuidrj, regular loop over j.
-  for (int j = 0; j <= twojmax; j++) {
-    int jju = idxu_block(j);
+  // double buffer for ulist
+  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
 
-    // Flatten loop over ma, mb, reduce w/in
+  // double buffer for dulist
+  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
 
+  const double x = rij(iatom,jnbor,0);
+  const double y = rij(iatom,jnbor,1);
+  const double z = rij(iatom,jnbor,2);
+  const double rsq = x * x + y * y + z * z;
+  const double r = sqrt(rsq);
+  const double rcut = rcutij(iatom, jnbor);
+  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
+  const double theta0 = (r - rmin0) * rscale0;
+  const double cs = cos(theta0);
+  const double sn = sin(theta0);
+  const double z0 = r * cs / sn;
+  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+
+  const double wj_local = wj(iatom, jnbor);
+  const double sfac = wj_local * compute_sfac(r, rcut);
+  const double dsfac = wj_local * compute_dsfac(r, rcut);
+
+  const double rinv = 1.0 / r;
+
+  // extract a single unit vector
+  const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
+
+  // Compute Cayley-Klein parameters for unit quaternion
+  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
+
+  const SNAcomplex a = { r0inv * z0, -r0inv * z };
+  const SNAcomplex b = { r0inv * y, -r0inv * x };
+
+  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+  const double dr0inv = dr0invdr * u;
+  const double dz0 = dz0dr * u;
+
+  const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
+                              - z * dr0inv + (dir == 2 ? - r0inv : 0.) };
+
+  const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
+                              -x * dr0inv + (dir==0?-r0inv:0.) };
+
+  // Accumulate the full contribution to dedr on the fly
+  const double du_prod = dsfac * u; // chain rule
+  const SNAcomplex y_local = ylist(0, iatom);
+
+  // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
+  double dedr_full_sum = 0.5 * du_prod * y_local.re;
+
+  // single has a warp barrier at the end
+  Kokkos::single(Kokkos::PerThread(team), [=]() {
+    //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
+    ulist_buf1[0] = {1., 0.};
+    dulist_buf1[0] = {0., 0.};
+  });
+
+  for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    int jjup = idxu_block[j-1];
+
+    // flatten the loop over ma,mb
+
+    // for (int ma = 0; ma <= j; ma++)
     const int n_ma = j+1;
     // for (int mb = 0; 2*mb <= j; mb++)
     const int n_mb = j/2+1;
 
     const int total_iters = n_ma * n_mb;
 
-    t_scalar3<double> sum;
+    double dedr_sum = 0.; // j-local sum
 
     //for (int m = 0; m < total_iters; m++) {
     Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m, t_scalar3<double>& sum_tmp) {
+      [&] (const int m, double& sum_tmp) {
 
       // ma fast, mb slow
       int ma = m % n_ma;
       int mb = m / n_ma;
 
-      // get index
-      const int jju_index = jju+mb+mb*j+ma;
-
-      // get ylist, rescale last element by 0.5
-      SNAcomplex y_local = ylist(jju_index,iatom);
-
-      const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0);
-      const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1);
-      const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2);
+      const int jju_index = jju+m;
 
+      // Load y_local, apply the symmetry scaling factor
+      // The "secret" of the shared memory optimization is it eliminates
+      // all global memory reads to duidrj in lieu of caching values in
+      // shared memory and otherwise always writing, making the kernel
+      // ultimately compute bound. We take advantage of that by adding
+      // some reads back in.
+      auto y_local = ylist(jju_index,iatom);
       if (j % 2 == 0 && 2*mb == j) {
         if (ma == mb) { y_local = 0.5*y_local; }
-        else if (ma > mb) { y_local = { 0., 0. }; }
+        else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
         // else the ma < mb gets "double counted", cancelling the 0.5.
       }
 
-      sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im;
-      sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im;
-      sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im;
+      // index into shared memory
+      const int jju_shared_idx = m;
+      const int jjup_shared_idx = jju_shared_idx - mb;
 
-    }, sum); // end loop over flattened ma,mb
+      // Need to compute and accumulate both u and du (mayhaps, we could probably
+      // balance some read and compute by reading u each time).
+      SNAcomplex u_accum = { 0., 0. };
+      SNAcomplex du_accum = { 0., 0. };
 
-    final_sum.x += sum.x;
-    final_sum.y += sum.y;
-    final_sum.z += sum.z;
+      const double rootpq2 = -rootpqarray(ma, j - mb);
+      const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
+      caconjxpy(b, u_up2, u_accum);
+
+      const double rootpq1 = rootpqarray(j - ma, j - mb);
+      const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
+      caconjxpy(a, u_up1, u_accum);
+
+      // Next, spin up du_accum
+      const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
+      caconjxpy(da, u_up1, du_accum);
+      caconjxpy(a, du_up1, du_accum);
+
+      const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
+      caconjxpy(db, u_up2, du_accum);
+      caconjxpy(b, du_up2, du_accum);
+
+      // No need to save u_accum to global memory
+      // Cache u_accum, du_accum to scratch memory.
+      ulist_buf2[jju_shared_idx] = u_accum;
+      dulist_buf2[jju_shared_idx] = du_accum;
+
+      // Directly accumulate deidrj into sum_tmp
+      //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
+      const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
+      sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
+
+      // copy left side to right side with inversion symmetry VMK 4.4(2)
+      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+      if (j%2==1 && mb+1==n_mb) {
+        int sign_factor = (((ma+mb)%2==0)?1:-1); 
+        //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
+        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
+
+        if (sign_factor == 1) {
+          u_accum.im = -u_accum.im;
+          du_accum.im = -du_accum.im;
+        } else {
+          u_accum.re = -u_accum.re;
+          du_accum.re = -du_accum.re;
+        }
+
+        // We don't need the second half of the tile for the deidrj accumulation.
+        // That's taken care of by the symmetry factor above.
+        //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
+
+        // We do need it for ortho polynomial generation, though
+        ulist_buf2[jju_shared_flip] = u_accum;
+        dulist_buf2[jju_shared_flip] = du_accum;
+      }
+
+    }, dedr_sum);
+
+    // swap buffers
+    auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
+    tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
+
+    // Accumulate dedr. This "should" be in a single, but 
+    // a Kokkos::single call implies a warp sync, and we may
+    // as well avoid that. This does no harm as long as the
+    // final assignment is in a single block.
+    //Kokkos::single(Kokkos::PerThread(team), [=]() {
+      dedr_full_sum += dedr_sum; 
+    //});
   }
 
+  // Store the accumulated dedr.
   Kokkos::single(Kokkos::PerThread(team), [&] () {
-    dedr(iatom,jnbor,0) = final_sum.x*2.0;
-    dedr(iatom,jnbor,1) = final_sum.y*2.0;
-    dedr(iatom,jnbor,2) = final_sum.z*2.0;
+      dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
   });
-
 }
 
+/* ----------------------------------------------------------------------
+   compute dEidRj, CPU path only.
+------------------------------------------------------------------------- */
+
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@@ -624,6 +857,7 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
 
 /* ----------------------------------------------------------------------
    compute Bi by summing conj(Ui)*Zi
+   not updated yet
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -708,28 +942,6 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
    calculate derivative of Ui w.r.t. atom j
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
-{
-  double rsq, r, x, y, z, z0, theta0, cs, sn;
-  double dz0dr;
-
-  x = rij(iatom,jnbor,0);
-  y = rij(iatom,jnbor,1);
-  z = rij(iatom,jnbor,2);
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
-  double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
-  theta0 = (r - rmin0) * rscale0;
-  cs = cos(theta0);
-  sn = sin(theta0);
-  z0 = r * cs / sn;
-  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
-
-  compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
-}
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@@ -774,119 +986,6 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
    compute Wigner U-functions for one neighbor
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
-                         double x, double y, double z,
-                         double z0, double r)
-{
-  // define size of scratch memory buffer
-  const int max_m_tile = (twojmax+1)*(twojmax+1);
-  const int team_rank = team.team_rank();
-
-  // get scratch memory double buffer
-  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-
-  // compute Cayley-Klein parameters for unit quaternion,
-  // pack into complex number
-  double r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  SNAcomplex a = { r0inv * z0, -r0inv * z };
-  SNAcomplex b = { r0inv * y, -r0inv * x };
-
-  // VMK Section 4.8.2
-
-  // All writes go to global memory and shared memory
-  // so we can avoid all global memory reads
-  Kokkos::single(Kokkos::PerThread(team), [=]() {
-    ulist(0,iatom,jnbor) = { 1.0, 0.0 };
-    buf1[max_m_tile*team_rank] = {1.,0.};
-  });
-
-  for (int j = 1; j <= twojmax; j++) {
-    const int jju = idxu_block[j];
-    int jjup = idxu_block[j-1];
-
-    // fill in left side of matrix layer from previous layer
-
-    // Flatten loop over ma, mb, need to figure out total
-    // number of iterations
-
-    // for (int ma = 0; ma <= j; ma++)
-    const int n_ma = j+1;
-    // for (int mb = 0; 2*mb <= j; mb++)
-    const int n_mb = j/2+1;
-
-    const int total_iters = n_ma * n_mb;
-
-    //for (int m = 0; m < total_iters; m++) {
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m) {
-
-      // ma fast, mb slow
-      int ma = m % n_ma;
-      int mb = m / n_ma;
-
-      // index into global memory array
-      const int jju_index = jju+mb+mb*j+ma;
-
-      // index into shared memory buffer for previous level
-      const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
-
-      // index into shared memory buffer for next level
-      const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
-
-      SNAcomplex u_accum = {0., 0.};
-
-      // VMK recursion relation: grab contribution which is multiplied by a*
-      const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
-      caconjxpy(a, u_up1, u_accum);
-
-      // VMK recursion relation: grab contribution which is multiplied by b*
-      const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
-      caconjxpy(b, u_up2, u_accum);
-
-      ulist(jju_index,iatom,jnbor) = u_accum;
-
-      // We no longer accumulate into ulisttot in this kernel.
-      // Instead, we have a separate kernel which avoids atomics.
-      // Running two separate kernels is net faster.
-
-      // back up into shared memory for next iter
-      if (j != twojmax) buf2[jju_shared_idx] = u_accum;
-
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
-      // We can avoid this if we're on the last row for an integer j
-      if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) {
-
-        int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
-        const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
-        const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
-
-        if (sign_factor == 1) {
-          u_accum.im = -u_accum.im;
-        } else {
-          u_accum.re = -u_accum.re;
-        }
-        ulist(jjup_flip,iatom,jnbor) = u_accum;
-        if (j != twojmax) buf2[jju_shared_flip] = u_accum;
-      }
-    });
-    // In CUDA backend,
-    // ThreadVectorRange has a __syncwarp (appropriately masked for
-    // vector lengths < 32) implicit at the end
-
-    // swap double buffers
-    auto tmp = buf1; buf1 = buf2; buf2 = tmp;
-    //std::swap(buf1, buf2); // throws warnings
-
-  }
-}
-
-// CPU version
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@@ -976,152 +1075,9 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
 
 /* ----------------------------------------------------------------------
    compute derivatives of Wigner U-functions for one neighbor
-   see comments in compute_uarray()
+   see comments in compute_uarray_cpu()
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
-                          double x, double y, double z,
-                          double z0, double r, double dz0dr,
-                          double wj, double rcut)
-{
-
-  // get shared memory offset
-  const int max_m_tile = (twojmax+1)*(twojmax+1);
-  const int team_rank = team.team_rank();
-
-  // double buffer for ulist
-  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-
-  // double buffer for dulist
-  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
-
-  const double sfac = wj * compute_sfac(r, rcut);
-  const double dsfac = wj * compute_dsfac(r, rcut);
-
-  const double rinv = 1.0 / r;
-
-  // extract a single unit vector
-  const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
-
-  // Compute Cayley-Klein parameters for unit quaternion
-
-  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
-
-  const SNAcomplex a = { r0inv * z0, -r0inv * z };
-  const SNAcomplex b = { r0inv * y, -r0inv * x };
-
-  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
-  const double dr0inv = dr0invdr * u;
-  const double dz0 = dz0dr * u;
-
-  const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
-                              - z * dr0inv + (dir == 2 ? - r0inv : 0.) };
-
-  const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
-                              -x * dr0inv + (dir==0?-r0inv:0.) };
-
-  // single has a warp barrier at the end
-  Kokkos::single(Kokkos::PerThread(team), [=]() {
-    dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
-    ulist_buf1[max_m_tile*team_rank] = {1., 0.};
-    dulist_buf1[max_m_tile*team_rank] = {0., 0.};
-  });
-
-
-  for (int j = 1; j <= twojmax; j++) {
-    int jju = idxu_block[j];
-    int jjup = idxu_block[j-1];
-
-    // flatten the loop over ma,mb
-
-    // for (int ma = 0; ma <= j; ma++)
-    const int n_ma = j+1;
-    // for (int mb = 0; 2*mb <= j; mb++)
-    const int n_mb = j/2+1;
-
-    const int total_iters = n_ma * n_mb;
-
-    //for (int m = 0; m < total_iters; m++) {
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m) {
-
-      // ma fast, mb slow
-      int ma = m % n_ma;
-      int mb = m / n_ma;
-
-      const int jju_index = jju+mb+mb*j+ma;
-
-      // index into shared memory
-      const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
-      const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
-
-      // Need to compute and accumulate both u and du (mayhaps, we could probably
-      // balance some read and compute by reading u each time).
-      SNAcomplex u_accum = { 0., 0. };
-      SNAcomplex du_accum = { 0., 0. };
-
-      const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
-      caconjxpy(a, u_up1, u_accum);
-
-      const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
-      caconjxpy(b, u_up2, u_accum);
-
-      // No need to save u_accum to global memory
-      if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum;
-
-      // Next, spin up du_accum
-      const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
-      caconjxpy(da, u_up1, du_accum);
-      caconjxpy(a, du_up1, du_accum);
-
-      const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
-      caconjxpy(db, u_up2, du_accum);
-      caconjxpy(b, du_up2, du_accum);
-
-      dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
-
-      if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum;
-
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
-
-      int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
-      const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
-      const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
-
-      if (sign_factor == 1) {
-        //ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re;
-        //ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im;
-        u_accum.im = -u_accum.im;
-        du_accum.im = -du_accum.im;
-      } else {
-        //ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re;
-        //ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im;
-        u_accum.re = -u_accum.re;
-        du_accum.re = -du_accum.re;
-      }
-
-      dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
-
-      if (j != twojmax) {
-        ulist_buf2[jju_shared_flip] = u_accum;
-        dulist_buf2[jju_shared_flip] = du_accum;
-      }
-
-    });
-
-    // swap buffers
-    auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
-    tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
-  }
-}
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@@ -1680,11 +1636,17 @@ double SNAKokkos<DeviceType>::memory_usage()
   bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
   bytes += idxcg_max * sizeof(double);                   // cglist
 
-  bytes += natom * idxu_max * sizeof(double) * 2;        // ulist
+#ifdef KOKKOS_ENABLE_CUDA
+  if (!std::is_same<DeviceType,Kokkos::Cuda>::value) {
+#endif
+    bytes += natom * idxu_max * sizeof(double) * 2;        // ulist
+    bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
+#ifdef KOKKOS_ENABLE_CUDA
+  }
+#endif
   bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot
   if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
     bytes += natom * idxu_max * sizeof(double) * 2;        // ulisttot_lr
-  bytes += natom * idxu_max * 3 * sizeof(double) * 2;    // dulist
 
   bytes += natom * idxz_max * sizeof(double) * 2;        // zlist
   bytes += natom * idxb_max * sizeof(double);            // blist

From 36095bbfdf39c85b6932bd0ec03ac062209e497e Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 23 Mar 2020 21:15:00 -0600
Subject: [PATCH 07/14] Tweak comment

---
 src/KOKKOS/sna_kokkos_impl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 1daf8fd05c..182a49bfb1 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -333,7 +333,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
   // VMK Section 4.8.2
 
   // All writes go to global memory and shared memory
-  // so we can avoid all global memory reads!
+  // so we can avoid all global memory reads
   Kokkos::single(Kokkos::PerThread(team), [=]() {
     //ulist(0,iatom,jnbor) = { 1.0, 0.0 };
     buf1[0] = {1.,0.};

From 387c2d05077ac5556367677fb65ba7004a1a65c3 Mon Sep 17 00:00:00 2001
From: Evan Weinberg <weinbe2@bu.edu>
Date: Mon, 23 Mar 2020 23:31:12 -0400
Subject: [PATCH 08/14] Cleaned up comment.

---
 src/KOKKOS/sna_kokkos_impl.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 182a49bfb1..e7b7087951 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -462,7 +462,6 @@ void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<Dev
 
 /* ----------------------------------------------------------------------
    compute Zi by summing over products of Ui
-   not updated yet
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>

From 5fa99cb07281675ebde5ada638d6597ba23a95f5 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 23 Mar 2020 21:33:11 -0600
Subject: [PATCH 09/14] Comment cleanup

---
 src/KOKKOS/sna_kokkos_impl.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index e7b7087951..e6c34a245b 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -856,7 +856,6 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
 
 /* ----------------------------------------------------------------------
    compute Bi by summing conj(Ui)*Zi
-   not updated yet
 ------------------------------------------------------------------------- */
 
 template<class DeviceType>

From 0060473cee2824527c44a1839ae7a870f42f9b08 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 24 Mar 2020 11:35:21 -0400
Subject: [PATCH 10/14] fix up some escaped '*' characters in "code-block"
 sections that do not need to be escaped

---
 doc/src/bond_oxdna.rst          |  2 +-
 doc/src/pair_bop.rst            |  4 ++--
 doc/src/pair_class2.rst         |  8 ++++----
 doc/src/pair_coeff.rst          | 12 ++++++------
 doc/src/pair_cosine_squared.rst |  2 +-
 doc/src/pair_meam_sw_spline.rst |  4 +++-
 doc/src/pair_nb3b_harmonic.rst  |  4 +++-
 doc/src/pair_polymorphic.rst    |  4 ++--
 doc/src/pair_python.rst         | 12 ++++++------
 doc/src/pair_spin_magelec.rst   |  2 +-
 10 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/doc/src/bond_oxdna.rst b/doc/src/bond_oxdna.rst
index 8e69b298bf..71e5105436 100644
--- a/doc/src/bond_oxdna.rst
+++ b/doc/src/bond_oxdna.rst
@@ -32,7 +32,7 @@ Examples
    bond_coeff * 2.0 0.25 0.7564
 
    bond_style oxrna2/fene
-   bond_coeff \* 2.0 0.25 0.76107
+   bond_coeff * 2.0 0.25 0.76107
 
 Description
 """""""""""
diff --git a/doc/src/pair_bop.rst b/doc/src/pair_bop.rst
index ce7b69f41f..5cd045931a 100644
--- a/doc/src/pair_bop.rst
+++ b/doc/src/pair_bop.rst
@@ -132,9 +132,9 @@ and Te.  If your LAMMPS simulation has 4 atoms types and you want the
 1st 3 to be Cd, and the 4th to be Te, you would use the following
 pair_coeff command:
 
-.. parsed-literal::
+.. code-block:: LAMMPS
 
-   pair_coeff \* \* CdTe Cd Cd Cd Te
+   pair_coeff * * CdTe Cd Cd Cd Te
 
 The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types.
 The first three Cd arguments map LAMMPS atom types 1,2,3 to the Cd
diff --git a/doc/src/pair_class2.rst b/doc/src/pair_class2.rst
index d1c673ab97..8131799181 100644
--- a/doc/src/pair_class2.rst
+++ b/doc/src/pair_class2.rst
@@ -60,18 +60,18 @@ Examples
 .. code-block:: LAMMPS
 
    pair_style lj/class2 10.0
-   pair_coeff \* \* 100.0 2.5
-   pair_coeff 1 2\* 100.0 2.5 9.0
+   pair_coeff * * 100.0 2.5
+   pair_coeff 1 2* 100.0 2.5 9.0
 
    pair_style lj/class2/coul/cut 10.0
    pair_style lj/class2/coul/cut 10.0 8.0
-   pair_coeff \* \* 100.0 3.0
+   pair_coeff * * 100.0 3.0
    pair_coeff 1 1 100.0 3.5 9.0
    pair_coeff 1 1 100.0 3.5 9.0 9.0
 
    pair_style lj/class2/coul/long 10.0
    pair_style lj/class2/coul/long 10.0 8.0
-   pair_coeff \* \* 100.0 3.0
+   pair_coeff * * 100.0 3.0
    pair_coeff 1 1 100.0 3.5 9.0
 
 Description
diff --git a/doc/src/pair_coeff.rst b/doc/src/pair_coeff.rst
index 26910c1746..1886ce1118 100644
--- a/doc/src/pair_coeff.rst
+++ b/doc/src/pair_coeff.rst
@@ -19,11 +19,11 @@ Examples
 .. code-block:: LAMMPS
 
    pair_coeff 1 2 1.0 1.0 2.5
-   pair_coeff 2 \* 1.0 1.0
-   pair_coeff 3\* 1\*2 1.0 1.0 2.5
-   pair_coeff \* \* 1.0 1.0
-   pair_coeff \* \* nialhjea 1 1 2
-   pair_coeff \* 3 morse.table ENTRY1
+   pair_coeff 2 * 1.0 1.0
+   pair_coeff 3* 1*2 1.0 1.0 2.5
+   pair_coeff * * 1.0 1.0
+   pair_coeff * * nialhjea 1 1 2
+   pair_coeff * 3 morse.table ENTRY1
    pair_coeff 1 2 lj/cut 1.0 1.0 2.5 (for pair_style hybrid)
 
 Description
@@ -55,7 +55,7 @@ pairs, then overwrite the coeffs for just the I,J = 2,3 pair:
 
 .. code-block:: LAMMPS
 
-   pair_coeff \* \* 1.0 1.0 2.5
+   pair_coeff * * 1.0 1.0 2.5
    pair_coeff 2 3 2.0 1.0 1.12
 
 A line in a data file that specifies pair coefficients uses the exact
diff --git a/doc/src/pair_cosine_squared.rst b/doc/src/pair_cosine_squared.rst
index 4f19ddd1e4..b7fa29bbd5 100644
--- a/doc/src/pair_cosine_squared.rst
+++ b/doc/src/pair_cosine_squared.rst
@@ -31,7 +31,7 @@ Examples
 .. code-block:: LAMMPS
 
    pair_style cosine/squared 3.0
-   pair_coeff \* \* 1.0 1.3
+   pair_coeff * * 1.0 1.3
    pair_coeff 1 3 1.0 1.3 2.0
    pair_coeff 1 3 1.0 1.3 wca
    pair_coeff 1 3 1.0 1.3 2.0 wca
diff --git a/doc/src/pair_meam_sw_spline.rst b/doc/src/pair_meam_sw_spline.rst
index ebb51795c2..0cf4c57975 100644
--- a/doc/src/pair_meam_sw_spline.rst
+++ b/doc/src/pair_meam_sw_spline.rst
@@ -75,7 +75,9 @@ If your LAMMPS simulation has 3 atoms types and they are all to be
 treated with this potential, you would use the following pair_coeff
 command:
 
-pair_coeff \* \* Ti.meam.sw.spline Ti Ti Ti
+.. code-block:: LAMMPS
+
+   pair_coeff * * Ti.meam.sw.spline Ti Ti Ti
 
 The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types.
 The three Ti arguments map LAMMPS atom types 1,2,3 to the Ti element
diff --git a/doc/src/pair_nb3b_harmonic.rst b/doc/src/pair_nb3b_harmonic.rst
index c4f3f3c3e8..9d63df65d5 100644
--- a/doc/src/pair_nb3b_harmonic.rst
+++ b/doc/src/pair_nb3b_harmonic.rst
@@ -64,7 +64,9 @@ NULL values are placeholders for atom types that will be used with
 other potentials. An example of a pair_coeff command for use with the
 *hybrid* pair style is:
 
-pair_coeff \* \* nb3b/harmonic MgOH.nb3b.harmonic Mg O H
+.. code-block:: LAMMPS
+
+   pair_coeff * * nb3b/harmonic MgOH.nb3b.harmonic Mg O H
 
 Three-body non-bonded harmonic files in the *potentials* directory of
 the LAMMPS distribution have a ".nb3b.harmonic" suffix.  Lines that
diff --git a/doc/src/pair_polymorphic.rst b/doc/src/pair_polymorphic.rst
index e06e9e7855..c0db3f10a4 100644
--- a/doc/src/pair_polymorphic.rst
+++ b/doc/src/pair_polymorphic.rst
@@ -180,9 +180,9 @@ functions for Si-C tersoff potential. If your LAMMPS simulation has 4
 atoms types and you want the 1st 3 to be Si, and the 4th to be C, you
 would use the following pair_coeff command:
 
-.. parsed-literal::
+.. code-block:: LAMMPS
 
-   pair_coeff \* \* SiC_tersoff.poly Si Si Si C
+   pair_coeff * * SiC_tersoff.poly Si Si Si C
 
 The 1st 2 arguments must be \* \* so as to span all LAMMPS atom
 types. The first three Si arguments map LAMMPS atom types 1,2,3 to the
diff --git a/doc/src/pair_python.rst b/doc/src/pair_python.rst
index fa76d4c16c..e654a6025f 100644
--- a/doc/src/pair_python.rst
+++ b/doc/src/pair_python.rst
@@ -113,8 +113,8 @@ which the parameters epsilon and sigma are both 1.0:
    class LJCutMelt(LAMMPSPairPotential):
        def __init__(self):
            super(LJCutMelt,self).__init__()
-           # set coeffs: 48\*eps\*sig\*\*12, 24\*eps\*sig\*\*6,
-           #              4\*eps\*sig\*\*12,  4\*eps\*sig\*\*6
+           # set coeffs: 48*eps*sig**12, 24*eps*sig**6,
+           #              4*eps*sig**12,  4*eps*sig**6
            self.units = 'lj'
            self.coeff = {'lj'  : {'lj'  : (48.0,24.0,4.0,4.0)}}
 
@@ -137,18 +137,18 @@ the *LJCutMelt* example, here are the two functions:
       def compute_force(self,rsq,itype,jtype):
            coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]]
            r2inv  = 1.0/rsq
-           r6inv  = r2inv\*r2inv\*r2inv
+           r6inv  = r2inv*r2inv*r2inv
            lj1 = coeff[0]
            lj2 = coeff[1]
-           return (r6inv \* (lj1\*r6inv - lj2))\*r2inv
+           return (r6inv * (lj1*r6inv - lj2))*r2inv
 
        def compute_energy(self,rsq,itype,jtype):
            coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]]
            r2inv  = 1.0/rsq
-           r6inv  = r2inv\*r2inv\*r2inv
+           r6inv  = r2inv*r2inv*r2inv
            lj3 = coeff[2]
            lj4 = coeff[3]
-           return (r6inv \* (lj3\*r6inv - lj4))
+           return (r6inv * (lj3*r6inv - lj4))
 
 .. note::
 
diff --git a/doc/src/pair_spin_magelec.rst b/doc/src/pair_spin_magelec.rst
index a220299b07..e4d6b81b8a 100644
--- a/doc/src/pair_spin_magelec.rst
+++ b/doc/src/pair_spin_magelec.rst
@@ -18,7 +18,7 @@ Examples
 .. code-block:: LAMMPS
 
    pair_style spin/magelec 4.5
-   pair_coeff \* \* magelec 4.5 0.00109 1.0 1.0 1.0
+   pair_coeff * * magelec 4.5 0.00109 1.0 1.0 1.0
 
 Description
 """""""""""

From 0f35c1d009b70258c3c967e51abbd11e923c6f30 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Tue, 24 Mar 2020 18:43:20 -0400
Subject: [PATCH 11/14] Update ubuntu package name to libpng-dev

---
 doc/src/Howto_bash.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst
index 02322e5e1c..98cc6c0dff 100644
--- a/doc/src/Howto_bash.rst
+++ b/doc/src/Howto_bash.rst
@@ -103,7 +103,7 @@ needed for various LAMMPS features:
 
 .. code-block:: bash
 
-   sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng12-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools
+   sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools
 
 Files in Ubuntu on Windows
 ^^^^^^^^^^^^^^^^^^^^^^^^^^

From 431647d943d64001e2e2d6a6e01294c2903c42d7 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Tue, 24 Mar 2020 18:52:05 -0400
Subject: [PATCH 12/14] Add link to official WSL docs

---
 doc/src/Howto_bash.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst
index 98cc6c0dff..b452f579d0 100644
--- a/doc/src/Howto_bash.rst
+++ b/doc/src/Howto_bash.rst
@@ -12,6 +12,10 @@ via apt-get and all files are accessible in both the Windows Explorer and your
 Linux shell (bash). This avoids switching to a different operating system or
 installing a virtual machine. Everything runs on Windows.
 
+.. seealso::
+
+   You can find more detailed information at the `Windows Subsystem for Linux Installation Guide for Windows 10 <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`_.
+
 Installing Bash on Windows
 --------------------------
 

From 398c030925e8175f6e15177f2d9b3194fca9524a Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 25 Mar 2020 06:52:37 -0400
Subject: [PATCH 13/14] whitespace cleanup

---
 src/KOKKOS/sna_kokkos_impl.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index e6c34a245b..dcedf333e5 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -382,7 +382,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
       const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
       //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
       caconjxpy(b, u_up2, u_accum);
-      
+
       // VMK recursion relation: grab contribution which is multiplied by a*
       const double rootpq1 = rootpqarray(j - ma, j - mb);
       const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
@@ -399,12 +399,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
       // copy left side to right side with inversion symmetry VMK 4.4(2)
       // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
       // if j is even (-> physical j integer), last element maps to self, skip
-      //if (!(m == total_iters - 1 && j % 2 == 0)) { 
+      //if (!(m == total_iters - 1 && j % 2 == 0)) {
       if (m < total_iters - 1 || j % 2 == 1) {
         const int sign_factor = (((ma+mb)%2==0)?1:-1);
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
         const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
-        
+
 
         if (sign_factor == 1) {
           u_accum.im = -u_accum.im;
@@ -419,12 +419,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
       }
     });
     // In CUDA backend,
-    // ThreadVectorRange has a __syncwarp (appropriately masked for 
+    // ThreadVectorRange has a __syncwarp (appropriately masked for
     // vector lengths < 32) implict at the end
 
     // swap double buffers
     auto tmp = buf1; buf1 = buf2; buf2 = tmp;
-    
+
 
   }
 }
@@ -760,7 +760,7 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       // copy left side to right side with inversion symmetry VMK 4.4(2)
       // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
       if (j%2==1 && mb+1==n_mb) {
-        int sign_factor = (((ma+mb)%2==0)?1:-1); 
+        int sign_factor = (((ma+mb)%2==0)?1:-1);
         //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
 
@@ -787,18 +787,18 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
     auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
     tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
 
-    // Accumulate dedr. This "should" be in a single, but 
+    // Accumulate dedr. This "should" be in a single, but
     // a Kokkos::single call implies a warp sync, and we may
     // as well avoid that. This does no harm as long as the
     // final assignment is in a single block.
     //Kokkos::single(Kokkos::PerThread(team), [=]() {
-      dedr_full_sum += dedr_sum; 
+    dedr_full_sum += dedr_sum;
     //});
   }
 
   // Store the accumulated dedr.
   Kokkos::single(Kokkos::PerThread(team), [&] () {
-      dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
+    dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
   });
 }
 

From 1192845ad58327cee3c3e4608596baf8f848b1e7 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 25 Mar 2020 08:19:24 -0400
Subject: [PATCH 14/14] avoid segmentation faults in universe/uloop variable
 increment

---
 src/variable.cpp | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/variable.cpp b/src/variable.cpp
index 1093ce9066..0fc53f8074 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -661,6 +661,8 @@ int Variable::next(int narg, char **arg)
 
   } else if (istyle == UNIVERSE || istyle == ULOOP) {
 
+    uloop_again:
+
     // wait until lock file can be created and owned by proc 0 of this world
     // rename() is not atomic in practice, but no known simple fix
     //   means multiple procs can read/write file at the same time (bad!)
@@ -669,7 +671,7 @@ int Variable::next(int narg, char **arg)
     // delay for random fraction of 1 second before subsequent tries
     // when successful, read next available index and Bcast it within my world
 
-    int nextindex;
+    int nextindex = -1;
     if (me == 0) {
       int seed = 12345 + universe->me + which[find(arg[0])];
       RanMars *random = new RanMars(lmp,seed);
@@ -682,10 +684,33 @@ int Variable::next(int narg, char **arg)
       }
       delete random;
 
-      FILE *fp = fopen("tmp.lammps.variable.lock","r");
-      fscanf(fp,"%d",&nextindex);
+      // if the file cannot be found, we may have a race with some
+      // other MPI rank that has called rename at the same time
+      // and we have to start over.
+      // if the read is short (we need at least one byte) we try reading again.
+
+      FILE *fp;
+      char buf[64];
+      for (int loopmax = 0; loopmax < 100; ++loopmax) {
+        fp = fopen("tmp.lammps.variable.lock","r");
+        if (fp == NULL) goto uloop_again;
+
+        buf[0] = buf[1] = '\0';
+        fread(buf,1,64,fp);
+        fclose(fp);
+
+        if (strlen(buf) > 0) {
+           nextindex = atoi(buf);
+           break;
+        }
+        delay = (int) (1000000*random->uniform());
+        usleep(delay);
+      }
+      if (nextindex < 0)
+        error->one(FLERR,"Unexpected error while incrementing uloop "
+                   "style variable. Please contact LAMMPS developers.");
+
       //printf("READ %d %d\n",universe->me,nextindex);
-      fclose(fp);
       fp = fopen("tmp.lammps.variable.lock","w");
       fprintf(fp,"%d\n",nextindex+1);
       //printf("WRITE %d %d\n",universe->me,nextindex+1);