From 8b0e5c66ea9f77be5789227271c771a134796e13 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Fri, 20 Mar 2020 07:23:01 -0600 Subject: [PATCH 01/17] tweak the docs for the new fix numdiff command --- doc/src/fix_numdiff.rst | 14 +++++++------- examples/README | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/src/fix_numdiff.rst b/doc/src/fix_numdiff.rst index f0c5969a12..c5d84bf787 100644 --- a/doc/src/fix_numdiff.rst +++ b/doc/src/fix_numdiff.rst @@ -29,14 +29,14 @@ Description Calculate forces through finite difference calculations of energy versus position. These forces can be compared to analytic forces -computed by pair styles, bond styles, etc. E.g. for debugging -purposes. +computed by pair styles, bond styles, etc. This can be usefule for +debugging or other purposes. The group specified with the command means only atoms within the group have their averages computed. Results are set to 0.0 for atoms not in the group. -This fix performs a loop over all atoms (in the group). For each atom +This fix performs a loop over all atoms in the group. For each atom and each component of force it adds *delta* to the position, and computes the new energy of the entire system. It then subtracts *delta* from the original position and again computes the new energy @@ -66,10 +66,10 @@ by two times *delta*. .. note:: The cost of each energy evaluation is essentially the cost of an MD - timestep. This invoking this fix once has a cost of 2N timesteps, - where N is the total number of atoms in the system (assuming all atoms - are included in the group). So this fix can be very expensive to use - for large systems. + timestep. Thus invoking this fix for a 3d system once has a cost + of 6N timesteps, where N is the total number of atoms in the system + (assuming all atoms are included in the group). So this fix can be + very expensive to use for large systems. ---------- diff --git a/examples/README b/examples/README index 996176c548..dbb9623e3f 100644 --- a/examples/README +++ b/examples/README @@ -93,6 +93,7 @@ msst: MSST shock dynamics nb3b: use of nonbonded 3-body harmonic pair style neb: nudged elastic band (NEB) calculation for barrier finding nemd: non-equilibrium MD of 2d sheared system +numdiff: numerical difference computation of forces obstacle: flow around two voids in a 2d channel peptide: dynamics of a small solvated peptide chain (5-mer) peri: Peridynamic model of cylinder impacted by indenter From 9ff71e2da08fcf19d1cb834d09c4e1b59d2b0b62 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Fri, 20 Mar 2020 07:26:15 -0600 Subject: [PATCH 02/17] fix a typo --- doc/src/fix_numdiff.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/fix_numdiff.rst b/doc/src/fix_numdiff.rst index c5d84bf787..4317d36120 100644 --- a/doc/src/fix_numdiff.rst +++ b/doc/src/fix_numdiff.rst @@ -29,7 +29,7 @@ Description Calculate forces through finite difference calculations of energy versus position. These forces can be compared to analytic forces -computed by pair styles, bond styles, etc. This can be usefule for +computed by pair styles, bond styles, etc. This can be useful for debugging or other purposes. 
The group specified with the command means only atoms within the group From aabee4b328501f1894f5c465d551af89a965dbd4 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Fri, 20 Mar 2020 07:27:28 -0600 Subject: [PATCH 03/17] one more change --- doc/src/fix_numdiff.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/fix_numdiff.rst b/doc/src/fix_numdiff.rst index 4317d36120..974826d1f2 100644 --- a/doc/src/fix_numdiff.rst +++ b/doc/src/fix_numdiff.rst @@ -66,7 +66,7 @@ by two times *delta*. .. note:: The cost of each energy evaluation is essentially the cost of an MD - timestep. Thus invoking this fix for a 3d system once has a cost + timestep. Thus invoking this fix once for a 3d system has a cost of 6N timesteps, where N is the total number of atoms in the system (assuming all atoms are included in the group). So this fix can be very expensive to use for large systems. From f560cd6dd596857d14d3663e3c225fbaec2096d0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 21 Mar 2020 01:08:09 -0400 Subject: [PATCH 04/17] make certain the molecular flag is always initialized --- src/atom_vec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 7b89c2fd79..c4dd53ad18 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -36,6 +36,7 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) forceclearflag = 0; size_data_bonus = 0; maxexchange = 0; + molecular = 0; kokkosable = 0; From 79b84c0847d0f9504527ce0b5dcb99688af2943c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 15:44:28 -0400 Subject: [PATCH 05/17] more thorough checking whether BUILD_OMP may be enabled by default. We need the OpenMP runtime, too. --- cmake/CMakeLists.txt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 4dd079eaae..36bed2d649 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -227,7 +227,15 @@ pkg_depends(USER-LB MPI) pkg_depends(USER-PHONON KSPACE) pkg_depends(USER-SCAFACOS MPI) +# detect if we may enable OpenMP support by default +set(BUILD_OMP_DEFAULT OFF) find_package(OpenMP QUIET) +if(OpenMP_FOUND) + check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) + if(HAVE_OMP_H_INCLUDE) + set(BUILD_OMP_DEFAULT ON) + endif() +endif() # TODO: this is a temporary workaround until a better solution is found.
AK 2019-05-30 # GNU GCC 9.x uses settings incompatible with our use of 'default(none)' in OpenMP pragmas @@ -237,14 +245,14 @@ find_package(OpenMP QUIET) if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.99.9)) option(BUILD_OMP "Build with OpenMP support" OFF) else() - option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND}) + option(BUILD_OMP "Build with OpenMP support" ${BUILD_OMP_DEFAULT}) endif() if(BUILD_OMP) find_package(OpenMP REQUIRED) check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) if(NOT HAVE_OMP_H_INCLUDE) - message(FATAL_ERROR "Cannot find required 'omp.h' header file") + message(FATAL_ERROR "Cannot find the 'omp.h' header file required for full OpenMP support") endif() set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") From e1f01d3e6514eecb83584c1e2b14c88569db70bb Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 21:14:26 -0400 Subject: [PATCH 06/17] use consistent naming for c++11 style kspace thread --- src/USER-INTEL/verlet_lrt_intel.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index a3c05c46bc..bd143c4c94 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -185,7 +185,7 @@ void VerletLRTIntel::setup(int flag) _kspace_done = 0; pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - kspace_thread.join(); + _kspace_thread.join(); #endif if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag); @@ -298,9 +298,9 @@ void VerletLRTIntel::run(int n) pthread_cond_signal(&_kcond); pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - std::thread kspace_thread; + std::thread _kspace_thread; if (kspace_compute_flag) - kspace_thread=std::thread([=] { + _kspace_thread=std::thread([=] { _intel_kspace->compute_first(eflag, vflag); timer->stamp(Timer::KSPACE); } ); @@ -332,7 +332,7 @@ void VerletLRTIntel::run(int n) pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) if (kspace_compute_flag) - kspace_thread.join(); + _kspace_thread.join(); #endif if (kspace_compute_flag) { From 98bfbbd57630fb21421d73c2478571068e343a5c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 22:21:12 -0400 Subject: [PATCH 07/17] fix typo in CMake module --- cmake/Modules/Packages/USER-INTEL.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/USER-INTEL.cmake b/cmake/Modules/Packages/USER-INTEL.cmake index d0941a0a12..9ae4333ee2 100644 --- a/cmake/Modules/Packages/USER-INTEL.cmake +++ b/cmake/Modules/Packages/USER-INTEL.cmake @@ -31,7 +31,7 @@ if(PKG_USER-INTEL) endif() endif() if(INTEL_LRT_MODE STREQUAL "C++11") - add_definitions(-DLMP_INTEL_USERLRT -DLMP_INTEL_LRT11) + add_definitions(-DLMP_INTEL_USELRT -DLMP_INTEL_LRT11) endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") From 6e1f18961c87595058f2f8c83414ba0717eb1e13 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 23 Mar 2020 09:42:46 -0400 Subject: [PATCH 08/17] Convert characters to UTF-8 --- src/KSPACE/msm.cpp | 4 ++-- src/USER-INTEL/pair_tersoff_intel.cpp | 2 +- src/USER-SMTBQ/pair_smtbq.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 126236a328..1520c2c607 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -2920,7 +2920,7 @@ void MSM::compute_phis_and_dphis(const double &dx, 
const double &dy, /* ---------------------------------------------------------------------- compute phi using interpolating polynomial - see Eq 7 from Parallel Computing 35 (2009) 164–177 + see Eq 7 from Parallel Computing 35 (2009) 164-177 and Hardy's thesis ------------------------------------------------------------------------- */ @@ -2999,7 +2999,7 @@ inline double MSM::compute_phi(const double &xi) /* ---------------------------------------------------------------------- compute the derivative of phi phi is an interpolating polynomial - see Eq 7 from Parallel Computing 35 (2009) 164–177 + see Eq 7 from Parallel Computing 35 (2009) 164-177 and Hardy's thesis ------------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index 76d06b02dd..8784029320 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -12,7 +12,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Markus Höhnerbach (RWTH) + Contributing author: Markus Höhnerbach (RWTH) ------------------------------------------------------------------------- */ #include diff --git a/src/USER-SMTBQ/pair_smtbq.cpp b/src/USER-SMTBQ/pair_smtbq.cpp index f61fc1a72e..fcf6d141f2 100644 --- a/src/USER-SMTBQ/pair_smtbq.cpp +++ b/src/USER-SMTBQ/pair_smtbq.cpp @@ -13,7 +13,7 @@ /* ---------------------------------------------------------------------- The SMTBQ code has been developed with the financial support of CNRS and - of the Regional Council of Burgundy (Convention nˇ 2010-9201AAO037S03129) + of the Regional Council of Burgundy (Convention n° 2010-9201AAO037S03129) Copyright (2015) Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO @@ -943,7 +943,7 @@ void PairSMTBQ::compute(int eflag, int vflag) 3 -> Short int. Ox-Ox 4 -> Short int. SMTB (repulsion) 5 -> Covalent energy SMTB - 6 -> Somme des Q(i)˛ + 6 -> Somme des Q(i)² ------------------------------------------------------------------------- */ /* -------------- N-body forces Calcul --------------- */ @@ -3022,7 +3022,7 @@ void PairSMTBQ::groupQEqAllParallel_QEq() ngp = igp = 0; nelt[ngp] = 0; - // On prend un oxygčne + // On prend un oxygène // printf ("[me %d] On prend un oxygene\n",me); for (ii = 0; ii < inum; ii++) { From bcfc606efb2622e9afdfae6984f6b01a187f0997 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 23 Mar 2020 13:20:56 -0700 Subject: [PATCH 09/17] SNAP optimizations, kernel fusion, large reduction of memory usage on the GPU, misc. performance optimizations.
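The central device in the fused kernels below is a per-thread double-buffered tile in Kokkos scratch memory: level j of the Wigner U recursion is computed from level j-1 held in one buffer, written into a second buffer, and the two buffer pointers are then swapped, which eliminates all global-memory reads of ulist/dulist. A minimal standalone sketch of that ping-pong pattern, with a toy recursion standing in for the VMK relation and illustrative names only (plain C++, not the LAMMPS/Kokkos API):

    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
      // Ping-pong over recursion levels j = 1..twojmax: each level reads
      // only level j-1 from buf1 and writes level j into buf2, then the
      // pointers swap, so at most two small tiles are ever alive. The
      // real kernels do this per thread in scratch memory with
      // complex-valued entries.
      const int twojmax = 6;
      std::vector<double> t1(twojmax + 1, 0.0), t2(twojmax + 1, 0.0);
      double *buf1 = t1.data(), *buf2 = t2.data();
      buf1[0] = 1.0;                               // seed, like ulist(0) = {1,0}
      for (int j = 1; j <= twojmax; j++) {
        for (int m = 0; m <= j; m++) {
          const double up1 = (m < j) ? buf1[m] : 0.0;      // "a*" contribution
          const double up2 = (m > 0) ? buf1[m - 1] : 0.0;  // "b*" contribution
          buf2[m] = 0.5 * (up1 + up2);                     // toy stand-in for VMK
        }
        std::swap(buf1, buf2);                     // double-buffer swap
      }
      for (int m = 0; m <= twojmax; m++) printf("%g ", buf1[m]);
      printf("\n");
      return 0;
    }

Because only the previous level is ever needed, the patch can shrink the scratch tile to (twojmax+1)*(twojmax/2+1) entries in the fused dEi/dRj kernel and replace the full ulist/dulist global arrays with dummy allocations on the GPU.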
--- src/KOKKOS/pair_snap_kokkos.h | 12 +- src/KOKKOS/pair_snap_kokkos_impl.h | 111 ++--- src/KOKKOS/sna_kokkos.h | 20 +- src/KOKKOS/sna_kokkos_impl.h | 676 ++++++++++++++--------------- 4 files changed, 349 insertions(+), 470 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index e53dec4d86..b57ef2d9e5 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -37,15 +37,13 @@ struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; -struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; struct TagPairSNAPZeroYi{}; struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeDuidrj{}; +struct TagPairSNAPComputeFusedDeidrj{}; struct TagPairSNAPComputeDuidrjCPU{}; -struct TagPairSNAPComputeDeidrj{}; struct TagPairSNAPComputeDeidrjCPU{}; template @@ -83,9 +81,6 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -102,14 +97,11 @@ public: void operator() (TagPairSNAPComputeYi,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 1156d11c31..d807f149a9 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -30,7 +30,6 @@ #include "kokkos.h" #include "sna.h" - #define MAXLINE 1024 #define MAXWORD 3 @@ -255,26 +254,19 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values // 2 is for double buffer - typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + const int tile_size = (twojmax+1)*(twojmax+1); typedef Kokkos::View< SNAcomplex*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits > ScratchViewType; - int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1)); + int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size ); + + typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); Kokkos::parallel_for("ComputeUi",policy_ui,*this); - // ComputeUitot - vector_length = 1; - team_size = 128; - team_size_max = Kokkos::TeamPolicy::team_size_max(*this); - if (team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; - - typename Kokkos::TeamPolicy 
policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this); } @@ -316,7 +308,7 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeYi",policy_yi,*this); - //ComputeDuidrj + //ComputeDuidrj and Deidrj if (lmp->kokkos->ngpus == 0) { // CPU int vector_length = 1; int team_size = 1; @@ -324,53 +316,37 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); snaKK.set_dir(-1); // technically doesn't do anything Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - } else { // GPU, utilize scratch memory and splitting over dimensions - int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei + + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); int vector_length = 32; int team_size = 2; // need to cap b/c of shared memory reqs if (team_size*vector_length > team_size_max) team_size = team_size_max/vector_length; - // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values + // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry // 2 is for double buffer - typedef Kokkos::View< SNAcomplex*, - Kokkos::DefaultExecutionSpace::scratch_memory_space, - Kokkos::MemoryTraits > - ScratchViewType; + const int tile_size = (twojmax+1)*(twojmax/2+1); + + typedef Kokkos::View< SNAcomplex*, + Kokkos::DefaultExecutionSpace::scratch_memory_space, + Kokkos::MemoryTraits > + ScratchViewType; + int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size); + + typename Kokkos::TeamPolicy policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); - int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1)); - typename Kokkos::TeamPolicy policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); - // Need to call three times, once for each direction for (int k = 0; k < 3; k++) { snaKK.set_dir(k); - Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this); + Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } } - //ComputeDeidrj - if (lmp->kokkos->ngpus == 0) { // CPU - int vector_length = 1; - int team_size = 1; - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - - } else { // GPU, different loop strategy internally - - int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); - int vector_length = 32; // coalescing disaster right now, will fix later - int team_size = 8; - if (team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; - - typename Kokkos::TeamPolicy policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - 
- Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this); - } - //ComputeForce if (eflag) { if (neighflag == HALF) { @@ -642,25 +618,6 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename my_sna.compute_ui(team,ii,jj); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the quantum number - const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); - if (idx >= my_sna.idxu_max) return; - - // Extract the atomic index - const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); - if (ii >= chunk_size) return; - - // Extract the number of neighbors neighbor number - const int ninside = d_ninside(ii); - - my_sna.compute_uitot(team,idx,ii,ninside); -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -718,7 +675,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { SNAKokkos my_sna = snaKK; // Extract the atom number @@ -730,7 +687,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const type const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_duidrj(team,ii,jj); + my_sna.compute_fused_deidrj(team,ii,jj); } template @@ -750,24 +707,6 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const t my_sna.compute_duidrj_cpu(team,ii,jj); } - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_deidrj(team,ii,jj); -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 48d9114fbf..a6d9db3218 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -135,14 +135,10 @@ inline KOKKOS_INLINE_FUNCTION void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int, int, int); // ForceSNAP - 
KOKKOS_INLINE_FUNCTION void compute_zi(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void zero_yi(const int&,const int&); // ForceSNAP @@ -155,12 +151,10 @@ inline // functions for derivatives KOKKOS_INLINE_FUNCTION - void compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION double compute_sfac(double, double); // add_uarraytot, compute_duarray @@ -251,10 +245,6 @@ inline KOKKOS_INLINE_FUNCTION void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double); // compute_ui - KOKKOS_INLINE_FUNCTION - void compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, - double, double); // compute_ui KOKKOS_INLINE_FUNCTION void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double, @@ -267,12 +257,8 @@ inline inline int compute_ncoeff(); // SNAKokkos() KOKKOS_INLINE_FUNCTION - void compute_duarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, // compute_duidrj - double, double, double, double, double); - KOKKOS_INLINE_FUNCTION void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, // compute_duidrj + double, double, double, // compute_duidrj_cpu double, double, double, double, double); // Sets the style for the switching function diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index ef3312bd16..1daf8fd05c 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace LAMMPS_NS { @@ -231,11 +232,22 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom); //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); - ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same::value) { + // dummy allocation + ulist = t_sna_3c_ll("sna:ulist",1,1,1); + dulist = t_sna_4c_ll("sna:dulist",1,1,1); + } else { +#endif + ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); + dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); +#ifdef KOKKOS_ENABLE_CUDA + } +#endif + //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom); - //dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); } @@ -269,14 +281,14 @@ void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy } /* ---------------------------------------------------------------------- - compute Ui by summing over bispectrum components + compute Ui by computing Wigner U-functions for one neighbor and + accumulating to the total. GPU only. 
------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { - double rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma @@ -284,22 +296,143 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + // VMK recursion relation: grab contribution which is multiplied by a* + const double rootpq1 = rootpqarray(j - ma, j - mb); + const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); + caconjxpy(a, u_up1, u_accum); + + //ulist(jju_index,iatom,jnbor) = u_accum; + // back up into shared memory for next iter + buf2[jju_shared_idx] = u_accum; + + Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + // if j is even (-> physical j integer), last element maps to self, skip + //if (!(m == total_iters - 1 && j % 2 == 0)) { + if (m < total_iters - 1 || j % 2 == 1) { + const int sign_factor = (((ma+mb)%2==0)?1:-1); + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); + const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1); + + + if (sign_factor == 1) { + u_accum.im = -u_accum.im; + } else { + u_accum.re = -u_accum.re; + } + //ulist(jjup_flip,iatom,jnbor) = u_accum; + buf2[jju_shared_flip] = u_accum; + + Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im); + } + }); + // In CUDA backend, + // ThreadVectorRange has a __syncwarp (appropriately masked for + // vector lengths < 32) implict at the end + + // swap double buffers + auto tmp = buf1; buf1 = buf2; buf2 = tmp; + + + } } +/* ---------------------------------------------------------------------- + compute Ui by summing over bispectrum components. CPU only. +------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -327,37 +460,6 @@ void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int idx, int iatom, int ninside) -{ - // fuse initialize in, avoid this load? 
- SNAcomplex utot = ulisttot(idx, iatom); - for (int jnbor = 0; jnbor < ninside; jnbor++) { - - const auto x = rij(iatom,jnbor,0); - const auto y = rij(iatom,jnbor,1); - const auto z = rij(iatom,jnbor,2); - const auto rsq = x * x + y * y + z * z; - const auto r = sqrt(rsq); - - const double wj_local = wj(iatom, jnbor); - const double rcut = rcutij(iatom, jnbor); - const double sfac = compute_sfac(r, rcut) * wj_local; - - auto ulist_local = ulist(idx, iatom, jnbor); - utot.re += sfac * ulist_local.re; - utot.im += sfac * ulist_local.im; - } - - ulisttot(idx, iatom) = utot; - -} - /* ---------------------------------------------------------------------- compute Zi by summing over products of Ui not updated yet @@ -509,72 +611,203 @@ void SNAKokkos::compute_yi(int iter, } /* ---------------------------------------------------------------------- - compute dEidRj + Fused calculation of the derivative of Ui w.r.t. atom j + and of dEidRj. GPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { - t_scalar3 final_sum; + // get shared memory offset + const int max_m_tile = (twojmax+1)*(twojmax/2+1); + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * max_m_tile; - // Like in ComputeUi/ComputeDuidrj, regular loop over j. - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_block(j); + // double buffer for ulist + SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; - // Flatten loop over ma, mb, reduce w/in + // double buffer for dulist + SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + const double x = rij(iatom,jnbor,0); + const double y = rij(iatom,jnbor,1); + const double z = rij(iatom,jnbor,2); + const double rsq = x * x + y * y + z * z; + const double r = sqrt(rsq); + const double rcut = rcutij(iatom, jnbor); + const double rscale0 = rfac0 * MY_PI / (rcut - rmin0); + const double theta0 = (r - rmin0) * rscale0; + const double cs = cos(theta0); + const double sn = sin(theta0); + const double z0 = r * cs / sn; + const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + + const double wj_local = wj(iatom, jnbor); + const double sfac = wj_local * compute_sfac(r, rcut); + const double dsfac = wj_local * compute_dsfac(r, rcut); + + const double rinv = 1.0 / r; + + // extract a single unit vector + const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv); + + // Compute Cayley-Klein parameters for unit quaternion + const double r0inv = 1.0 / sqrt(r * r + z0 * z0); + + const SNAcomplex a = { r0inv * z0, -r0inv * z }; + const SNAcomplex b = { r0inv * y, -r0inv * x }; + + const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + const double dr0inv = dr0invdr * u; + const double dz0 = dz0dr * u; + + const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv, + - z * dr0inv + (dir == 2 ? - r0inv : 0.) 
}; + + const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.), + -x * dr0inv + (dir==0?-r0inv:0.) }; + + // Accumulate the full contribution to dedr on the fly + const double du_prod = dsfac * u; // chain rule + const SNAcomplex y_local = ylist(0, iatom); + + // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 + double dedr_full_sum = 0.5 * du_prod * y_local.re; + + // single has a warp barrier at the end + Kokkos::single(Kokkos::PerThread(team), [=]() { + //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here + ulist_buf1[0] = {1., 0.}; + dulist_buf1[0] = {0., 0.}; + }); + + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_block[j]; + int jjup = idxu_block[j-1]; + + // flatten the loop over ma,mb + + // for (int ma = 0; ma <= j; ma++) const int n_ma = j+1; // for (int mb = 0; 2*mb <= j; mb++) const int n_mb = j/2+1; const int total_iters = n_ma * n_mb; - t_scalar3 sum; + double dedr_sum = 0.; // j-local sum //for (int m = 0; m < total_iters; m++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m, t_scalar3& sum_tmp) { + [&] (const int m, double& sum_tmp) { // ma fast, mb slow int ma = m % n_ma; int mb = m / n_ma; - // get index - const int jju_index = jju+mb+mb*j+ma; - - // get ylist, rescale last element by 0.5 - SNAcomplex y_local = ylist(jju_index,iatom); - - const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0); - const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1); - const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2); + const int jju_index = jju+m; + // Load y_local, apply the symmetry scaling factor + // The "secret" of the shared memory optimization is it eliminates + // all global memory reads to duidrj in lieu of caching values in + // shared memory and otherwise always writing, making the kernel + // ultimately compute bound. We take advantage of that by adding + // some reads back in. + auto y_local = ylist(jju_index,iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } - else if (ma > mb) { y_local = { 0., 0. }; } + else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright // else the ma < mb gets "double counted", cancelling the 0.5. } - sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im; - sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im; - sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im; + // index into shared memory + const int jju_shared_idx = m; + const int jjup_shared_idx = jju_shared_idx - mb; - }, sum); // end loop over flattened ma,mb + // Need to compute and accumulate both u and du (mayhaps, we could probably + // balance some read and compute by reading u each time). + SNAcomplex u_accum = { 0., 0. }; + SNAcomplex du_accum = { 0., 0. }; - final_sum.x += sum.x; - final_sum.y += sum.y; - final_sum.z += sum.z; + const double rootpq2 = -rootpqarray(ma, j - mb); + const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + const double rootpq1 = rootpqarray(j - ma, j - mb); + const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + caconjxpy(a, u_up1, u_accum); + + // Next, spin up du_accum + const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + caconjxpy(da, u_up1, du_accum); + caconjxpy(a, du_up1, du_accum); + + const SNAcomplex du_up2 = (ma > 0) ? 
rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + caconjxpy(db, u_up2, du_accum); + caconjxpy(b, du_up2, du_accum); + + // No need to save u_accum to global memory + // Cache u_accum, du_accum to scratch memory. + ulist_buf2[jju_shared_idx] = u_accum; + dulist_buf2[jju_shared_idx] = du_accum; + + // Directly accumulate deidrj into sum_tmp + //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); + sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + if (j%2==1 && mb+1==n_mb) { + int sign_factor = (((ma+mb)%2==0)?1:-1); + //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); + + if (sign_factor == 1) { + u_accum.im = -u_accum.im; + du_accum.im = -du_accum.im; + } else { + u_accum.re = -u_accum.re; + du_accum.re = -du_accum.re; + } + + // We don't need the second half of the tile for the deidrj accumulation. + // That's taken care of by the symmetry factor above. + //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + + // We do need it for ortho polynomial generation, though + ulist_buf2[jju_shared_flip] = u_accum; + dulist_buf2[jju_shared_flip] = du_accum; + } + + }, dedr_sum); + + // swap buffers + auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; + tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; + + // Accumulate dedr. This "should" be in a single, but + // a Kokkos::single call implies a warp sync, and we may + // as well avoid that. This does no harm as long as the + // final assignment is in a single block. + //Kokkos::single(Kokkos::PerThread(team), [=]() { + dedr_full_sum += dedr_sum; + //}); } + // Store the accumulated dedr. Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; + dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; }); - } +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. 
+------------------------------------------------------------------------- */ + + template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -624,6 +857,7 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi + not updated yet ------------------------------------------------------------------------- */ template @@ -708,28 +942,6 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) -{ - double rsq, r, x, y, z, z0, theta0, cs, sn; - double dz0dr; - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - theta0 = (r - rmin0) * rscale0; - cs = cos(theta0); - sn = sin(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - - compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); -} - template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -774,119 +986,6 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r) -{ - // define size of scratch memory buffer - const int max_m_tile = (twojmax+1)*(twojmax+1); - const int team_rank = team.team_rank(); - - // get scratch memory double buffer - SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - // compute Cayley-Klein parameters for unit quaternion, - // pack into complex number - double r0inv = 1.0 / sqrt(r * r + z0 * z0); - SNAcomplex a = { r0inv * z0, -r0inv * z }; - SNAcomplex b = { r0inv * y, -r0inv * x }; - - // VMK Section 4.8.2 - - // All writes go to global memory and shared memory - // so we can avoid all global memory reads - Kokkos::single(Kokkos::PerThread(team), [=]() { - ulist(0,iatom,jnbor) = { 1.0, 0.0 }; - buf1[max_m_tile*team_rank] = {1.,0.}; - }); - - for (int j = 1; j <= twojmax; j++) { - const int jju = idxu_block[j]; - int jjup = idxu_block[j-1]; - - // fill in left side of matrix layer from previous layer - - // Flatten loop over ma, mb, need to figure out total - // number of iterations - - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; - - const int total_iters = n_ma * n_mb; - - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m) { - - // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; - - // index into global memory array - const int jju_index = jju+mb+mb*j+ma; - - // index into shared memory buffer for previous level - const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma; - - // index into shared memory buffer for next level - const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma; - - SNAcomplex u_accum = {0., 0.}; - - // VMK recursion 
relation: grab contribution which is multiplied by a* - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - caconjxpy(a, u_up1, u_accum); - - // VMK recursion relation: grab contribution which is multiplied by b* - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - caconjxpy(b, u_up2, u_accum); - - ulist(jju_index,iatom,jnbor) = u_accum; - - // We no longer accumulate into ulisttot in this kernel. - // Instead, we have a separate kernel which avoids atomics. - // Running two separate kernels is net faster. - - // back up into shared memory for next iter - if (j != twojmax) buf2[jju_shared_idx] = u_accum; - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) - // We can avoid this if we're on the last row for an integer j - if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) { - - int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); - const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); - const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); - - if (sign_factor == 1) { - u_accum.im = -u_accum.im; - } else { - u_accum.re = -u_accum.re; - } - ulist(jjup_flip,iatom,jnbor) = u_accum; - if (j != twojmax) buf2[jju_shared_flip] = u_accum; - } - }); - // In CUDA backend, - // ThreadVectorRange has a __syncwarp (appropriately masked for - // vector lengths < 32) implicit at the end - - // swap double buffers - auto tmp = buf1; buf1 = buf2; buf2 = tmp; - //std::swap(buf1, buf2); // throws warnings - - } -} - -// CPU version template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, @@ -976,152 +1075,9 @@ void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor - see comments in compute_uarray() + see comments in compute_uarray_cpu() ------------------------------------------------------------------------- */ -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r, double dz0dr, - double wj, double rcut) -{ - - // get shared memory offset - const int max_m_tile = (twojmax+1)*(twojmax+1); - const int team_rank = team.team_rank(); - - // double buffer for ulist - SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - // double buffer for dulist - SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - const double sfac = wj * compute_sfac(r, rcut); - const double dsfac = wj * compute_dsfac(r, rcut); - - const double rinv = 1.0 / r; - - // extract a single unit vector - const double u = (dir == 0 ? x * rinv : dir == 1 ? 
y * rinv : z * rinv); - - // Compute Cayley-Klein parameters for unit quaternion - - const double r0inv = 1.0 / sqrt(r * r + z0 * z0); - - const SNAcomplex a = { r0inv * z0, -r0inv * z }; - const SNAcomplex b = { r0inv * y, -r0inv * x }; - - const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); - const double dr0inv = dr0invdr * u; - const double dz0 = dz0dr * u; - - const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv, - - z * dr0inv + (dir == 2 ? - r0inv : 0.) }; - - const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.), - -x * dr0inv + (dir==0?-r0inv:0.) }; - - // single has a warp barrier at the end - Kokkos::single(Kokkos::PerThread(team), [=]() { - dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here - ulist_buf1[max_m_tile*team_rank] = {1., 0.}; - dulist_buf1[max_m_tile*team_rank] = {0., 0.}; - }); - - - for (int j = 1; j <= twojmax; j++) { - int jju = idxu_block[j]; - int jjup = idxu_block[j-1]; - - // flatten the loop over ma,mb - - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; - - const int total_iters = n_ma * n_mb; - - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m) { - - // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; - - const int jju_index = jju+mb+mb*j+ma; - - // index into shared memory - const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma; - const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma; - - // Need to compute and accumulate both u and du (mayhaps, we could probably - // balance some read and compute by reading u each time). - SNAcomplex u_accum = { 0., 0. }; - SNAcomplex du_accum = { 0., 0. }; - - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - caconjxpy(a, u_up1, u_accum); - - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - caconjxpy(b, u_up2, u_accum); - - // No need to save u_accum to global memory - if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum; - - // Next, spin up du_accum - const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); - caconjxpy(da, u_up1, du_accum); - caconjxpy(a, du_up1, du_accum); - - const SNAcomplex du_up2 = (ma > 0) ? 
rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); - caconjxpy(db, u_up2, du_accum); - caconjxpy(b, du_up2, du_accum); - - dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - - if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum; - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - - int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); - const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); - const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); - - if (sign_factor == 1) { - //ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re; - //ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im; - u_accum.im = -u_accum.im; - du_accum.im = -du_accum.im; - } else { - //ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re; - //ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im; - u_accum.re = -u_accum.re; - du_accum.re = -du_accum.re; - } - - dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - - if (j != twojmax) { - ulist_buf2[jju_shared_flip] = u_accum; - dulist_buf2[jju_shared_flip] = du_accum; - } - - }); - - // swap buffers - auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; - tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; - } -} - template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, @@ -1680,11 +1636,17 @@ double SNAKokkos::memory_usage() bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist - bytes += natom * idxu_max * sizeof(double) * 2; // ulist +#ifdef KOKKOS_ENABLE_CUDA + if (!std::is_same::value) { +#endif + bytes += natom * idxu_max * sizeof(double) * 2; // ulist + bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist +#ifdef KOKKOS_ENABLE_CUDA + } +#endif bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot if (!Kokkos::Impl::is_same::value) bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr - bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxz_max * sizeof(double) * 2; // zlist bytes += natom * idxb_max * sizeof(double); // blist From 36095bbfdf39c85b6932bd0ec03ac062209e497e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Mar 2020 21:15:00 -0600 Subject: [PATCH 10/17] Tweak comment --- src/KOKKOS/sna_kokkos_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 1daf8fd05c..182a49bfb1 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -333,7 +333,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy Date: Mon, 23 Mar 2020 23:31:12 -0400 Subject: [PATCH 11/17] Cleaned up comment. 
--- src/KOKKOS/sna_kokkos_impl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 182a49bfb1..e7b7087951 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -462,7 +462,6 @@ void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy From 5fa99cb07281675ebde5ada638d6597ba23a95f5 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Mar 2020 21:33:11 -0600 Subject: [PATCH 12/17] Comment cleanup --- src/KOKKOS/sna_kokkos_impl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index e7b7087951..e6c34a245b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -856,7 +856,6 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi - not updated yet ------------------------------------------------------------------------- */ template From 0060473cee2824527c44a1839ae7a870f42f9b08 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Mar 2020 11:35:21 -0400 Subject: [PATCH 13/17] fix up some escaped '*' characters in "code-block" sections that do not need to be escaped --- doc/src/bond_oxdna.rst | 2 +- doc/src/pair_bop.rst | 4 ++-- doc/src/pair_class2.rst | 8 ++++---- doc/src/pair_coeff.rst | 12 ++++++------ doc/src/pair_cosine_squared.rst | 2 +- doc/src/pair_meam_sw_spline.rst | 4 +++- doc/src/pair_nb3b_harmonic.rst | 4 +++- doc/src/pair_polymorphic.rst | 4 ++-- doc/src/pair_python.rst | 12 ++++++------ doc/src/pair_spin_magelec.rst | 2 +- 10 files changed, 29 insertions(+), 25 deletions(-) diff --git a/doc/src/bond_oxdna.rst b/doc/src/bond_oxdna.rst index 8e69b298bf..71e5105436 100644 --- a/doc/src/bond_oxdna.rst +++ b/doc/src/bond_oxdna.rst @@ -32,7 +32,7 @@ Examples bond_coeff * 2.0 0.25 0.7564 bond_style oxrna2/fene - bond_coeff \* 2.0 0.25 0.76107 + bond_coeff * 2.0 0.25 0.76107 Description """"""""""" diff --git a/doc/src/pair_bop.rst b/doc/src/pair_bop.rst index ce7b69f41f..5cd045931a 100644 --- a/doc/src/pair_bop.rst +++ b/doc/src/pair_bop.rst @@ -132,9 +132,9 @@ and Te. If your LAMMPS simulation has 4 atoms types and you want the 1st 3 to be Cd, and the 4th to be Te, you would use the following pair_coeff command: -.. parsed-literal:: +.. code-block:: LAMMPS - pair_coeff \* \* CdTe Cd Cd Cd Te + pair_coeff * * CdTe Cd Cd Cd Te The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The first three Cd arguments map LAMMPS atom types 1,2,3 to the Cd diff --git a/doc/src/pair_class2.rst b/doc/src/pair_class2.rst index d1c673ab97..8131799181 100644 --- a/doc/src/pair_class2.rst +++ b/doc/src/pair_class2.rst @@ -60,18 +60,18 @@ Examples .. code-block:: LAMMPS pair_style lj/class2 10.0 - pair_coeff \* \* 100.0 2.5 - pair_coeff 1 2\* 100.0 2.5 9.0 + pair_coeff * * 100.0 2.5 + pair_coeff 1 2* 100.0 2.5 9.0 pair_style lj/class2/coul/cut 10.0 pair_style lj/class2/coul/cut 10.0 8.0 - pair_coeff \* \* 100.0 3.0 + pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0 9.0 pair_style lj/class2/coul/long 10.0 pair_style lj/class2/coul/long 10.0 8.0 - pair_coeff \* \* 100.0 3.0 + pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 Description diff --git a/doc/src/pair_coeff.rst b/doc/src/pair_coeff.rst index 26910c1746..1886ce1118 100644 --- a/doc/src/pair_coeff.rst +++ b/doc/src/pair_coeff.rst @@ -19,11 +19,11 @@ Examples .. 
code-block:: LAMMPS pair_coeff 1 2 1.0 1.0 2.5 - pair_coeff 2 \* 1.0 1.0 - pair_coeff 3\* 1\*2 1.0 1.0 2.5 - pair_coeff \* \* 1.0 1.0 - pair_coeff \* \* nialhjea 1 1 2 - pair_coeff \* 3 morse.table ENTRY1 + pair_coeff 2 * 1.0 1.0 + pair_coeff 3* 1*2 1.0 1.0 2.5 + pair_coeff * * 1.0 1.0 + pair_coeff * * nialhjea 1 1 2 + pair_coeff * 3 morse.table ENTRY1 pair_coeff 1 2 lj/cut 1.0 1.0 2.5 (for pair_style hybrid) Description @@ -55,7 +55,7 @@ pairs, then overwrite the coeffs for just the I,J = 2,3 pair: .. code-block:: LAMMPS - pair_coeff \* \* 1.0 1.0 2.5 + pair_coeff * * 1.0 1.0 2.5 pair_coeff 2 3 2.0 1.0 1.12 A line in a data file that specifies pair coefficients uses the exact diff --git a/doc/src/pair_cosine_squared.rst b/doc/src/pair_cosine_squared.rst index 4f19ddd1e4..b7fa29bbd5 100644 --- a/doc/src/pair_cosine_squared.rst +++ b/doc/src/pair_cosine_squared.rst @@ -31,7 +31,7 @@ Examples .. code-block:: LAMMPS pair_style cosine/squared 3.0 - pair_coeff \* \* 1.0 1.3 + pair_coeff * * 1.0 1.3 pair_coeff 1 3 1.0 1.3 2.0 pair_coeff 1 3 1.0 1.3 wca pair_coeff 1 3 1.0 1.3 2.0 wca diff --git a/doc/src/pair_meam_sw_spline.rst b/doc/src/pair_meam_sw_spline.rst index ebb51795c2..0cf4c57975 100644 --- a/doc/src/pair_meam_sw_spline.rst +++ b/doc/src/pair_meam_sw_spline.rst @@ -75,7 +75,9 @@ If your LAMMPS simulation has 3 atoms types and they are all to be treated with this potential, you would use the following pair_coeff command: -pair_coeff \* \* Ti.meam.sw.spline Ti Ti Ti +.. code-block:: LAMMPS + + pair_coeff * * Ti.meam.sw.spline Ti Ti Ti The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The three Ti arguments map LAMMPS atom types 1,2,3 to the Ti element diff --git a/doc/src/pair_nb3b_harmonic.rst b/doc/src/pair_nb3b_harmonic.rst index c4f3f3c3e8..9d63df65d5 100644 --- a/doc/src/pair_nb3b_harmonic.rst +++ b/doc/src/pair_nb3b_harmonic.rst @@ -64,7 +64,9 @@ NULL values are placeholders for atom types that will be used with other potentials. An example of a pair_coeff command for use with the *hybrid* pair style is: -pair_coeff \* \* nb3b/harmonic MgOH.nb3b.harmonic Mg O H +.. code-block:: LAMMPS + + pair_coeff * * nb3b/harmonic MgOH.nb3b.harmonic Mg O H Three-body non-bonded harmonic files in the *potentials* directory of the LAMMPS distribution have a ".nb3b.harmonic" suffix. Lines that diff --git a/doc/src/pair_polymorphic.rst b/doc/src/pair_polymorphic.rst index e06e9e7855..c0db3f10a4 100644 --- a/doc/src/pair_polymorphic.rst +++ b/doc/src/pair_polymorphic.rst @@ -180,9 +180,9 @@ functions for Si-C tersoff potential. If your LAMMPS simulation has 4 atoms types and you want the 1st 3 to be Si, and the 4th to be C, you would use the following pair_coeff command: -.. parsed-literal:: +.. code-block:: LAMMPS - pair_coeff \* \* SiC_tersoff.poly Si Si Si C + pair_coeff * * SiC_tersoff.poly Si Si Si C The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. 
The first three Si arguments map LAMMPS atom types 1,2,3 to the diff --git a/doc/src/pair_python.rst b/doc/src/pair_python.rst index fa76d4c16c..e654a6025f 100644 --- a/doc/src/pair_python.rst +++ b/doc/src/pair_python.rst @@ -113,8 +113,8 @@ which the parameters epsilon and sigma are both 1.0: class LJCutMelt(LAMMPSPairPotential): def __init__(self): super(LJCutMelt,self).__init__() - # set coeffs: 48\*eps\*sig\*\*12, 24\*eps\*sig\*\*6, - # 4\*eps\*sig\*\*12, 4\*eps\*sig\*\*6 + # set coeffs: 48*eps*sig**12, 24*eps*sig**6, + # 4*eps*sig**12, 4*eps*sig**6 self.units = 'lj' self.coeff = {'lj' : {'lj' : (48.0,24.0,4.0,4.0)}} @@ -137,18 +137,18 @@ the *LJCutMelt* example, here are the two functions: def compute_force(self,rsq,itype,jtype): coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] r2inv = 1.0/rsq - r6inv = r2inv\*r2inv\*r2inv + r6inv = r2inv*r2inv*r2inv lj1 = coeff[0] lj2 = coeff[1] - return (r6inv \* (lj1\*r6inv - lj2))\*r2inv + return (r6inv * (lj1*r6inv - lj2))*r2inv def compute_energy(self,rsq,itype,jtype): coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] r2inv = 1.0/rsq - r6inv = r2inv\*r2inv\*r2inv + r6inv = r2inv*r2inv*r2inv lj3 = coeff[2] lj4 = coeff[3] - return (r6inv \* (lj3\*r6inv - lj4)) + return (r6inv * (lj3*r6inv - lj4)) .. note:: diff --git a/doc/src/pair_spin_magelec.rst b/doc/src/pair_spin_magelec.rst index a220299b07..e4d6b81b8a 100644 --- a/doc/src/pair_spin_magelec.rst +++ b/doc/src/pair_spin_magelec.rst @@ -18,7 +18,7 @@ Examples .. code-block:: LAMMPS pair_style spin/magelec 4.5 - pair_coeff \* \* magelec 4.5 0.00109 1.0 1.0 1.0 + pair_coeff * * magelec 4.5 0.00109 1.0 1.0 1.0 Description """"""""""" From 0f35c1d009b70258c3c967e51abbd11e923c6f30 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 24 Mar 2020 18:43:20 -0400 Subject: [PATCH 14/17] Update ubuntu package name to libpng-dev --- doc/src/Howto_bash.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst index 02322e5e1c..98cc6c0dff 100644 --- a/doc/src/Howto_bash.rst +++ b/doc/src/Howto_bash.rst @@ -103,7 +103,7 @@ needed for various LAMMPS features: .. code-block:: bash - sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng12-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools + sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools Files in Ubuntu on Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^ From 431647d943d64001e2e2d6a6e01294c2903c42d7 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 24 Mar 2020 18:52:05 -0400 Subject: [PATCH 15/17] Add link to official WSL docs --- doc/src/Howto_bash.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst index 98cc6c0dff..b452f579d0 100644 --- a/doc/src/Howto_bash.rst +++ b/doc/src/Howto_bash.rst @@ -12,6 +12,10 @@ via apt-get and all files are accessible in both the Windows Explorer and your Linux shell (bash). This avoids switching to a different operating system or installing a virtual machine. Everything runs on Windows. +.. seealso:: + + You can find more detailed information at the `Windows Subsystem for Linux Installation Guide for Windows 10 `_. 
+
 Installing Bash on Windows
 --------------------------

From 398c030925e8175f6e15177f2d9b3194fca9524a Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Wed, 25 Mar 2020 06:52:37 -0400
Subject: [PATCH 16/17] whitespace cleanup

---
 src/KOKKOS/sna_kokkos_impl.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index e6c34a245b..dcedf333e5 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -382,7 +382,7 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceTy
       const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
       //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
       caconjxpy(b, u_up2, u_accum);
-      
+
       // VMK recursion relation: grab contribution which is multiplied by a*
       const double rootpq1 = rootpqarray(j - ma, j - mb);
       const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
@@ -399,12 +399,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceTy
       // if j is even (-> physical j integer), last element maps to self, skip
-      //if (!(m == total_iters - 1 && j % 2 == 0)) { 
+      //if (!(m == total_iters - 1 && j % 2 == 0)) {
       if (m < total_iters - 1 || j % 2 == 1) {
         const int sign_factor = (((ma+mb)%2==0)?1:-1);
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
         const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
-        
+
         if (sign_factor == 1) {
           u_accum.im = -u_accum.im;
@@ -419,12 +419,12 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy::compute_fused_deidrj(const typename Kokkos::TeamPoli
       // copy left side to right side with inversion symmetry VMK 4.4(2)
       // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
       if (j%2==1 && mb+1==n_mb) {
-        int sign_factor = (((ma+mb)%2==0)?1:-1); 
+        int sign_factor = (((ma+mb)%2==0)?1:-1);
         //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
         const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
@@ -787,18 +787,18 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
       auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
       tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;

-      // Accumulate dedr. This "should" be in a single, but 
+      // Accumulate dedr. This "should" be in a single, but
       // a Kokkos::single call implies a warp sync, and we may
       // as well avoid that. This does no harm as long as the
       // final assignment is in a single block.
       //Kokkos::single(Kokkos::PerThread(team), [=]() {
-      dedr_full_sum += dedr_sum; 
+      dedr_full_sum += dedr_sum;
       //});
     }

     // Store the accumulated dedr.
     Kokkos::single(Kokkos::PerThread(team), [&] () {
-      dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; 
+      dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
     });
 }

From 1192845ad58327cee3c3e4608596baf8f848b1e7 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer
Date: Wed, 25 Mar 2020 08:19:24 -0400
Subject: [PATCH 17/17] avoid segmentation faults in universe/uloop variable increment

---
 src/variable.cpp | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/src/variable.cpp b/src/variable.cpp
index 1093ce9066..0fc53f8074 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -661,6 +661,8 @@ int Variable::next(int narg, char **arg)

   } else if (istyle == UNIVERSE || istyle == ULOOP) {

+    uloop_again:
+
     // wait until lock file can be created and owned by proc 0 of this world
     // rename() is not atomic in practice, but no known simple fix
     // means multiple procs can read/write file at the same time (bad!)
@@ -669,7 +671,7 @@ int Variable::next(int narg, char **arg)
     // delay for random fraction of 1 second before subsequent tries
     // when successful, read next available index and Bcast it within my world

-    int nextindex;
+    int nextindex = -1;
     if (me == 0) {
       int seed = 12345 + universe->me + which[find(arg[0])];
       RanMars *random = new RanMars(lmp,seed);
@@ -682,10 +684,33 @@ int Variable::next(int narg, char **arg)
       }
       delete random;

-      FILE *fp = fopen("tmp.lammps.variable.lock","r");
-      fscanf(fp,"%d",&nextindex);
+      // if the file cannot be found, we may have a race with some
+      // other MPI rank that has called rename at the same time
+      // and we have to start over.
+      // if the read is short (we need at least one byte) we try reading again.
+
+      FILE *fp;
+      char buf[64];
+      for (int loopmax = 0; loopmax < 100; ++loopmax) {
+        fp = fopen("tmp.lammps.variable.lock","r");
+        if (fp == NULL) goto uloop_again;
+
+        buf[0] = buf[1] = '\0';
+        fread(buf,1,64,fp);
+        fclose(fp);
+
+        if (strlen(buf) > 0) {
+          nextindex = atoi(buf);
+          break;
+        }
+        delay = (int) (1000000*random->uniform());
+        usleep(delay);
+      }
+      if (nextindex < 0)
+        error->one(FLERR,"Unexpected error while incrementing uloop "
+                   "style variable. Please contact LAMMPS developers.");
+
       //printf("READ %d %d\n",universe->me,nextindex);
-      fclose(fp);
+
       fp = fopen("tmp.lammps.variable.lock","w");
       fprintf(fp,"%d\n",nextindex+1);
       //printf("WRITE %d %d\n",universe->me,nextindex+1);
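
Taken together, patch 17 hardens a small shared-counter protocol: rename() doubles
as a lock (it fails while another rank holds the lock file), the lock file holds the
next free index, and each rank reads, increments, and rewrites it, backing off for a
random fraction of a second between attempts. Below is a minimal standalone C++ sketch
of that same pattern, not LAMMPS code: the file names tmp.counter/tmp.counter.lock,
the 100-try limit, and the use of <random> and sleep_for() in place of RanMars and
usleep() are assumptions made for the example.

// Illustrative sketch of the lock-file counter protocol (not LAMMPS code).
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <random>
#include <thread>

int main()
{
  std::mt19937 rng(12345);
  std::uniform_int_distribution<int> backoff(0, 999999);  // microseconds

  // acquire the lock: rename() fails while another process holds it,
  // so retry with a randomized delay until it succeeds
  while (std::rename("tmp.counter", "tmp.counter.lock") != 0)
    std::this_thread::sleep_for(std::chrono::microseconds(backoff(rng)));

  // read the current index, tolerating short reads instead of crashing
  int nextindex = -1;
  for (int attempt = 0; attempt < 100 && nextindex < 0; ++attempt) {
    std::FILE *fp = std::fopen("tmp.counter.lock", "r");
    if (fp == nullptr) return 1;           // lost a race; a caller would restart
    char buf[64];
    std::size_t n = std::fread(buf, 1, sizeof(buf) - 1, fp);
    std::fclose(fp);
    buf[n] = '\0';                         // always NUL-terminate before parsing
    if (n > 0) nextindex = std::atoi(buf);
    else std::this_thread::sleep_for(std::chrono::microseconds(backoff(rng)));
  }
  if (nextindex < 0) return 1;             // give up and report an error

  // write back the incremented index, then release the lock
  std::FILE *fp = std::fopen("tmp.counter.lock", "w");
  if (fp == nullptr) return 1;
  std::fprintf(fp, "%d\n", nextindex + 1);
  std::fclose(fp);
  std::rename("tmp.counter.lock", "tmp.counter");
  std::printf("acquired index %d\n", nextindex);
  return 0;
}

Two details of the sketch are deliberate design choices rather than copies of the
patch: the buffer is NUL-terminated from the fread() return value before it is
parsed, so no byte past the read data is ever examined, and the random-backoff
generator stays alive until the retry loop is finished with it.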