diff --git a/doc/src/compute_orientorder_atom.rst b/doc/src/compute_orientorder_atom.rst
index 2a36fb4cc0..af4e6a21e8 100644
--- a/doc/src/compute_orientorder_atom.rst
+++ b/doc/src/compute_orientorder_atom.rst
@@ -115,8 +115,8 @@ The optional keyword *chunksize* is only applicable when using the
 the KOKKOS package and is ignored otherwise. This keyword controls
 the number of atoms in each pass used to compute the bond-orientational
 order parameters and is used to avoid running out of memory. For example
-if there are 4000 atoms in the simulation and the *chunksize*
-is set to 2000, the parameter calculation will be broken up
+if there are 32768 atoms in the simulation and the *chunksize*
+is set to 16384, the parameter calculation will be broken up
 into two passes.
 
 The value of :math:`Q_l` is set to zero for atoms not in the
@@ -193,7 +193,7 @@ Default
 
 The option defaults are *cutoff* = pair style cutoff, *nnn* = 12,
 *degrees* = 5 4 6 8 10 12 i.e. :math:`Q_4`, :math:`Q_6`, :math:`Q_8`, :math:`Q_{10}`, and :math:`Q_{12}`,
-*wl* = no, *wl/hat* = no, *components* off, and *chunksize* = 2000
+*wl* = no, *wl/hat* = no, *components* off, and *chunksize* = 16384
 
 ----------
 
diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst
index 8aa10f5986..a99331345e 100644
--- a/doc/src/pair_snap.rst
+++ b/doc/src/pair_snap.rst
@@ -152,7 +152,7 @@ The default values for these keywords are
 * *chemflag* = 0
 * *bnormflag* = 0
 * *wselfallflag* = 0
-* *chunksize* = 2000
+* *chunksize* = 4096
 
 If *quadraticflag* is set to 1, then the SNAP energy expression includes additional quadratic terms
 that have been shown to increase the overall accuracy of the potential without much increase
@@ -189,8 +189,8 @@ pair style *snap* with the KOKKOS package and is ignored otherwise.
 This keyword controls
 the number of atoms in each pass used to compute the bispectrum
 components and is used to avoid running out of memory. For example
-if there are 4000 atoms in the simulation and the *chunksize*
-is set to 2000, the bispectrum calculation will be broken up
+if there are 8192 atoms in the simulation and the *chunksize*
+is set to 4096, the bispectrum calculation will be broken up
 into two passes.
 
 Detailed definitions for all the other keywords
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index 72513d2b17..fbe9799bee 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -1076,20 +1076,34 @@ struct params_lj_coul {
 
 // Pair SNAP
 
+#define SNAP_KOKKOS_REAL double
+#define SNAP_KOKKOS_HOST_VECLEN 1
+
+#ifdef LMP_KOKKOS_GPU
+#define SNAP_KOKKOS_DEVICE_VECLEN 32
+#else
+#define SNAP_KOKKOS_DEVICE_VECLEN 1
+#endif
+
+
+// intentional: SNAreal/complex gets reused beyond SNAP
 typedef double SNAreal;
 
 //typedef struct { SNAreal re, im; } SNAcomplex;
-template <typename real>
-struct alignas(2*sizeof(real)) SNAComplex
+template <typename real_type_>
+struct alignas(2*sizeof(real_type_)) SNAComplex
 {
-  real re,im;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+  real_type re,im;
 
-  SNAComplex() = default;
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex()
+   : re(static_cast<real_type>(0.)), im(static_cast<real_type>(0.)) { ; }
 
-  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re)
-   : re(re), im(static_cast<real>(0.)) { ; }
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re)
+   : re(re), im(static_cast<real_type>(0.)) { ; }
 
-  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re, real im)
+  KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re, real_type im)
    : re(re), im(im) { ; }
 
   KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other)
@@ -1117,27 +1131,24 @@ struct alignas(2*sizeof(real)) SNAComplex
     return *this;
   }
 
+  KOKKOS_INLINE_FUNCTION
+  static constexpr complex zero() { return complex(static_cast<real_type>(0.), static_cast<real_type>(0.)); }
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr complex one() { return complex(static_cast<real_type>(1.), static_cast<real_type>(0.)); }
+
+  KOKKOS_INLINE_FUNCTION
+  const complex conj() { return complex(re, -im); }
+
 };
 
-template <typename real>
-KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real> operator*(const real& r, const SNAComplex<real>& self) {
-  return SNAComplex<real>(r*self.re, r*self.im);
+template <typename real_type>
+KOKKOS_FORCEINLINE_FUNCTION SNAComplex<real_type> operator*(const real_type& r, const SNAComplex<real_type>& self) {
+  return SNAComplex<real_type>(r*self.re, r*self.im);
 }
 
 typedef SNAComplex<SNAreal> SNAcomplex;
 
-// Cayley-Klein pack
-// Can guarantee it's aligned to 2 complex
-struct alignas(32) CayleyKleinPack {
-
-  SNAcomplex a, b;
-  SNAcomplex da[3], db[3];
-  SNAreal sfac;
-  SNAreal dsfacu[3];
-
-};
-
-
 #if defined(KOKKOS_ENABLE_CXX11)
 #undef ISFINITE
 #define ISFINITE(x) std::isfinite(x)
diff --git a/src/KOKKOS/pair_snap_kokkos.cpp b/src/KOKKOS/pair_snap_kokkos.cpp
index 6f324a97be..4677d178e0 100644
--- a/src/KOKKOS/pair_snap_kokkos.cpp
+++ b/src/KOKKOS/pair_snap_kokkos.cpp
@@ -15,9 +15,11 @@
 #include "pair_snap_kokkos_impl.h"
 
 namespace LAMMPS_NS {
-template class PairSNAPKokkos<LMPDeviceType>;
+
+template class PairSNAPKokkosDevice<LMPDeviceType>;
 #ifdef LMP_KOKKOS_GPU
-template class PairSNAPKokkos<LMPHostType>;
+template class PairSNAPKokkosHost<LMPHostType>;
 #endif
+
 }
 
diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h
index e7a1f3173d..416cc1b888 100644
--- a/src/KOKKOS/pair_snap_kokkos.h
+++ b/src/KOKKOS/pair_snap_kokkos.h
@@ -13,9 +13,13 @@
 
 #ifdef PAIR_CLASS
 
-PairStyle(snap/kk,PairSNAPKokkos<LMPDeviceType>)
-PairStyle(snap/kk/device,PairSNAPKokkos<LMPDeviceType>)
-PairStyle(snap/kk/host,PairSNAPKokkos<LMPHostType>)
+PairStyle(snap/kk,PairSNAPKokkosDevice<LMPDeviceType>)
+PairStyle(snap/kk/device,PairSNAPKokkosDevice<LMPDeviceType>)
+#ifdef LMP_KOKKOS_GPU
+PairStyle(snap/kk/host,PairSNAPKokkosHost<LMPHostType>)
+#else
+PairStyle(snap/kk/host,PairSNAPKokkosDevice<LMPHostType>)
+#endif
 
 #else
 
@@ -33,9 +37,11 @@ namespace LAMMPS_NS {
 // Routines for both the CPU and GPU backend
 template<int NEIGHFLAG, int EVFLAG>
 struct TagPairSNAPComputeForce{};
-struct TagPairSNAPComputeNeigh{};
+
 
 // GPU backend only
+struct TagPairSNAPComputeNeigh{};
+struct TagPairSNAPComputeCayleyKlein{};
 struct TagPairSNAPPreUi{};
 struct TagPairSNAPComputeUi{};
 struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist
@@ -44,10 +50,10 @@ struct TagPairSNAPBeta{};
 struct TagPairSNAPComputeBi{};
 struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS
 struct TagPairSNAPComputeYi{};
-struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS
 struct TagPairSNAPComputeFusedDeidrj{};
 
 // CPU backend only
+struct TagPairSNAPComputeNeighCPU{};
 struct TagPairSNAPPreUiCPU{};
 struct TagPairSNAPComputeUiCPU{};
 struct TagPairSNAPTransformUiCPU{};
@@ -59,7 +65,7 @@ struct TagPairSNAPComputeYiCPU{};
 struct TagPairSNAPComputeDuidrjCPU{};
 struct TagPairSNAPComputeDeidrjCPU{};
 
-template<class DeviceType>
+template<class DeviceType, typename real_type_, int vector_length_>
 class PairSNAPKokkos : public PairSNAP {
 public:
   enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD};
@@ -68,6 +74,14 @@ public:
   typedef ArrayTypes<DeviceType> AT;
   typedef EV_FLOAT value_type;
 
+  static constexpr int vector_length = vector_length_;
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+
+  // type-dependent team sizes
+  static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4;
+  static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2;
+
   PairSNAPKokkos(class LAMMPS *);
   ~PairSNAPKokkos();
 
@@ -78,10 +92,10 @@ public:
   double memory_usage();
 
   template<class TagStyle>
-  void check_team_size_for(int, int&, int);
+  void check_team_size_for(int, int&);
 
   template<class TagStyle>
-  void check_team_size_reduce(int, int&, int);
+  void check_team_size_reduce(int, int&);
 
   template<int NEIGHFLAG, int EVFLAG>
   KOKKOS_INLINE_FUNCTION
@@ -91,15 +105,18 @@ public:
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT&) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPBetaCPU,const int& ii) const;
 
   // GPU backend only
   KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUi>::member_type& team) const;
+  void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeigh>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
@@ -122,13 +139,13 @@ public:
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const;
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const;
-
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
 
   // CPU backend only
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeNeighCPU>::member_type& team) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPPreUiCPU>::member_type& team) const;
 
@@ -173,7 +190,7 @@ protected:
   t_bvec bvec;
   typedef Kokkos::View<F_FLOAT***> t_dbvec;
   t_dbvec dbvec;
-  SNAKokkos<DeviceType> snaKK;
+  SNAKokkos<DeviceType, real_type, vector_length> snaKK;
 
   int inum,max_neighs,chunk_size,chunk_offset;
   int host_flag;
@@ -208,14 +225,14 @@ inline double dist2(double* x,double* y);
   Kokkos::View<F_FLOAT****, Kokkos::LayoutRight, DeviceType> i_uarraytot_r, i_uarraytot_i;
   Kokkos::View<F_FLOAT******, Kokkos::LayoutRight, DeviceType> i_zarray_r, i_zarray_i;
 
-  Kokkos::View<F_FLOAT*, DeviceType> d_radelem;              // element radii
-  Kokkos::View<F_FLOAT*, DeviceType> d_wjelem;               // elements weights
-  Kokkos::View<F_FLOAT**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
+  Kokkos::View<real_type*, DeviceType> d_radelem;              // element radii
+  Kokkos::View<real_type*, DeviceType> d_wjelem;               // elements weights
+  Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType> d_coeffelem;           // element bispectrum coefficients
   Kokkos::View<T_INT*, DeviceType> d_map;                    // mapping from atom types to elements
   Kokkos::View<T_INT*, DeviceType> d_ninside;                // ninside for all atoms in list
-  Kokkos::View<F_FLOAT**, DeviceType> d_beta;                // betas for all atoms in list
-  Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> d_beta_pack;          // betas for all atoms in list, GPU
-  Kokkos::View<F_FLOAT**, DeviceType> d_bispectrum;          // bispectrum components for all atoms in list
+  Kokkos::View<real_type**, DeviceType> d_beta;                // betas for all atoms in list
+  Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> d_beta_pack;          // betas for all atoms in list, GPU
+  Kokkos::View<real_type**, DeviceType> d_bispectrum;          // bispectrum components for all atoms in list
 
   typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
   tdual_fparams k_cutsq;
@@ -237,6 +254,49 @@ inline double dist2(double* x,double* y);
 
 };
 
+
+// These wrapper classes exist to make the pair style factory happy/avoid having
+// to extend the pair style factory to support Pair classes w/an arbitrary number
+// of extra template parameters
+
+template <class DeviceType>
+class PairSNAPKokkosDevice : public PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN> {
+
+private:
+  using Base = PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>;
+
+public:
+
+  PairSNAPKokkosDevice(class LAMMPS *);
+
+  void coeff(int, char**);
+  void init_style();
+  double init_one(int, int);
+  void compute(int, int);
+  double memory_usage();
+
+};
+
+#ifdef LMP_KOKKOS_GPU
+template <class DeviceType>
+class PairSNAPKokkosHost : public PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN> {
+
+private:
+  using Base = PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>;
+
+public:
+
+  PairSNAPKokkosHost(class LAMMPS *);
+
+  void coeff(int, char**);
+  void init_style();
+  double init_one(int, int);
+  void compute(int, int);
+  double memory_usage();
+
+};
+#endif
+
 }
 
 #endif
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index bacb5f75d6..1349232136 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -48,8 +48,8 @@ namespace LAMMPS_NS {
 //static double t7 = 0.0;
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-PairSNAPKokkos<DeviceType>::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp)
+template<class DeviceType, typename real_type, int vector_length>
+PairSNAPKokkos<DeviceType, real_type, vector_length>::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp)
 {
   respa_enable = 0;
 
@@ -67,8 +67,8 @@ PairSNAPKokkos<DeviceType>::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-PairSNAPKokkos<DeviceType>::~PairSNAPKokkos()
+template<class DeviceType, typename real_type, int vector_length>
+PairSNAPKokkos<DeviceType, real_type, vector_length>::~PairSNAPKokkos()
 {
   if (copymode) return;
 
@@ -81,8 +81,8 @@ PairSNAPKokkos<DeviceType>::~PairSNAPKokkos()
    init specific to this pair style
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-void PairSNAPKokkos<DeviceType>::init_style()
+template<class DeviceType, typename real_type, int vector_length>
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::init_style()
 {
   if (force->newton_pair == 0)
     error->all(FLERR,"Pair style SNAP requires newton pair on");
@@ -128,8 +128,8 @@ struct FindMaxNumNeighs {
    This version is a straightforward implementation
    ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+template<class DeviceType, typename real_type, int vector_length>
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in, int vflag_in)
 {
   eflag = eflag_in;
   vflag = vflag_in;
@@ -186,16 +186,15 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   max_neighs = 0;
   Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(max_neighs));
 
-  int vector_length_default = 1;
   int team_size_default = 1;
   if (!host_flag)
     team_size_default = 32;//max_neighs;
 
   if (beta_max < inum) {
     beta_max = inum;
-    d_beta = Kokkos::View<F_FLOAT**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
+    d_beta = Kokkos::View<real_type**, DeviceType>("PairSNAPKokkos:beta",ncoeff,inum);
     if (!host_flag)
-      d_beta_pack = Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType>("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32);
+      d_beta_pack = Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType>("PairSNAPKokkos:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length);
     d_ninside = Kokkos::View<int*, DeviceType>("PairSNAPKokkos:ninside",inum);
   }
 
@@ -213,31 +212,28 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     if (chunk_size > inum - chunk_offset)
       chunk_size = inum - chunk_offset;
 
-    //ComputeNeigh
-    {
-      int vector_length = vector_length_default;
-      int team_size = team_size_default;
-      check_team_size_for<TagPairSNAPComputeNeigh>(chunk_size,team_size,vector_length);
-      typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
-      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
-    }
-
     if (host_flag)
     {
       // Host codepath
 
+      //ComputeNeigh
+      {
+        int team_size = team_size_default;
+        check_team_size_for<TagPairSNAPComputeNeighCPU>(chunk_size,team_size);
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU> policy_neigh(chunk_size,team_size,vector_length);
+        Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this);
+      }
+
       //PreUi
       {
-        int vector_length = vector_length_default;
         int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size,vector_length);
+        check_team_size_for<TagPairSNAPPreUiCPU>(chunk_size,team_size);
         typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU> policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length);
         Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this);
       }
 
       // ComputeUi
       {
-        int vector_length = vector_length_default;
         int team_size = team_size_default;
         // Fused calculation of ulist and accumulation into ulisttot using atomics
         typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU> policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
@@ -259,9 +255,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
         Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this);
 
         //ComputeBi
-        int vector_length = vector_length_default;
         int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size,vector_length);
+        check_team_size_for<TagPairSNAPComputeBiCPU>(chunk_size,team_size);
         typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU> policy_bi_cpu(chunk_size,team_size,vector_length);
         Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this);
       }
@@ -281,7 +276,6 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       //ComputeDuidrj and Deidrj
       {
         int team_size = team_size_default;
-        int vector_length = vector_length_default;
 
         typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
         snaKK.set_dir(-1); // technically doesn't do anything
@@ -295,39 +289,67 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     } else { // GPU
 
 #ifdef LMP_KOKKOS_GPU
+
+      //ComputeNeigh
+      {
+        constexpr int team_size = 4;
+
+        // scratch size: max_neighs * sizeof(int) * number of threads per team
+        typedef Kokkos::View< int*,
+                              Kokkos::DefaultExecutionSpace::scratch_memory_space,
+                              Kokkos::MemoryTraits<Kokkos::Unmanaged> >
+                ScratchViewType;
+        int scratch_size = ScratchViewType::shmem_size(team_size * max_neighs);
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
+        policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+        Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+      }
+
+      //ComputeCayleyKlein
+      {
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeCayleyKlein>
+            policy_compute_ck({0,0,0},{vector_length,max_neighs,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
+        Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this);
+      }
+
       //PreUi
       {
-        int vector_length = vector_length_default;
-        int team_size = team_size_default;
-        check_team_size_for<TagPairSNAPPreUi>(chunk_size,team_size,vector_length);
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi> policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length);
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPPreUi>
+            policy_preui({0,0,0},{vector_length,twojmax+1,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("PreUi",policy_preui,*this);
+
       }
 
       // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot
       {
+        // new AoSoA form
 
-        int vector_length = 32;
-        int team_size = 4; // need to cap b/c of shared memory reqs
-        check_team_size_for<TagPairSNAPComputeUi>(chunk_size,team_size,vector_length);
+        // team_size_compute_ui is defined in `pair_snap_kokkos.h`
+        constexpr int team_size = team_size_compute_ui;
 
-        // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry
-        //   2 is for double buffer
-
-        const int tile_size = (twojmax+1)*(twojmax/2+1);
-        typedef Kokkos::View< SNAcomplex*,
+        // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+        const int tile_size = vector_length * (twojmax + 1);
+        typedef Kokkos::View< complex*,
                               Kokkos::DefaultExecutionSpace::scratch_memory_space,
                               Kokkos::MemoryTraits<Kokkos::Unmanaged> >
                 ScratchViewType;
-        int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size );
+        int scratch_size = ScratchViewType::shmem_size(team_size * tile_size);
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
+        // total number of teams needed
+        int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+        // (natoms / 32) * (max_neighs) * ("bend" locations)
+        int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        int n_teams_div = (n_teams + team_size - 1) / team_size;
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi> policy_ui(n_teams_div, team_size, vector_length);
+        policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
 
         Kokkos::parallel_for("ComputeUi",policy_ui,*this);
 
-        //Transform data layout of ulisttot to AoSoA, zero ylist
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1});
+        // un-"fold" ulisttot, zero ylist
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("TransformUi",policy_transform_ui,*this);
 
       }
@@ -336,19 +358,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       if (quadraticflag || eflag) {
         //ComputeZi
         int idxz_max = snaKK.idxz_max;
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{vector_length,idxz_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this);
 
         //ComputeBi
         int idxb_max = snaKK.idxb_max;
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{vector_length,idxb_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this);
 
         //Transform data layout of blist out of AoSoA
         //We need this b/c `blist` gets used in ComputeForce which doesn't
         //take advantage of AoSoA (which at best would only be beneficial
         //on the margins)
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{vector_length,idxb_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("TransformBi",policy_transform_bi,*this);
       }
 
@@ -361,39 +383,41 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
         //ComputeYi
         const int idxz_max = snaKK.idxz_max;
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1});
+        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{vector_length,idxz_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1});
         Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this);
 
-        //Transform data layout of ylist out of AoSoA
-        const int idxu_half_max = snaKK.idxu_half_max;
-        typename Kokkos::MDRangePolicy<DeviceType, Kokkos::IndexType<int>, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1});
-        Kokkos::parallel_for("TransformYi",policy_transform_yi,*this);
-
       }
 
       // Fused ComputeDuidrj, ComputeDeidrj
       {
-        int vector_length = 32;
-        int team_size = 2; // need to cap b/c of shared memory reqs
-        check_team_size_for<TagPairSNAPComputeFusedDeidrj>(chunk_size,team_size,vector_length);
+        // new AoSoA form
 
-        // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
-        // 2 is for double buffer
-        const int tile_size = (twojmax+1)*(twojmax/2+1);
+        // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h`
+        constexpr int team_size = team_size_compute_fused_deidrj;
 
-        typedef Kokkos::View< SNAcomplex*,
+        // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer
+        const int tile_size = vector_length * (twojmax + 1);
+        typedef Kokkos::View< complex*,
                               Kokkos::DefaultExecutionSpace::scratch_memory_space,
                               Kokkos::MemoryTraits<Kokkos::Unmanaged> >
                 ScratchViewType;
-        int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size);
+        int scratch_size = ScratchViewType::shmem_size(2 * team_size * tile_size);
 
-        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
-        policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
+        // total number of teams needed
+        int chunk_size_div = (chunk_size + vector_length - 1) / vector_length;
+
+        // (natoms / 32) * (max_neighs) * ("bend" locations)
+        int n_teams = chunk_size_div * max_neighs * (twojmax + 1);
+        int n_teams_div = (n_teams + team_size - 1) / team_size;
+
+        typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(n_teams_div,team_size,vector_length);
+        policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
 
         for (int k = 0; k < 3; k++) {
           snaKK.set_dir(k);
           Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
         }
+
       }
 
 #endif // LMP_KOKKOS_GPU
@@ -403,27 +427,26 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     //ComputeForce
     {
       int team_size = team_size_default;
-      int vector_length = vector_length_default;
       if (evflag) {
         if (neighflag == HALF) {
-          check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size,vector_length);
+          check_team_size_reduce<TagPairSNAPComputeForce<HALF,1> >(chunk_size,team_size);
           typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,1> > policy_force(chunk_size,team_size,vector_length);
           Kokkos::parallel_reduce(policy_force
             ,*this,ev_tmp);
         } else if (neighflag == HALFTHREAD) {
-          check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size,vector_length);
+          check_team_size_reduce<TagPairSNAPComputeForce<HALFTHREAD,1> >(chunk_size,team_size);
           typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,1> > policy_force(chunk_size,team_size,vector_length);
           Kokkos::parallel_reduce(policy_force
             ,*this,ev_tmp);
         }
       } else {
         if (neighflag == HALF) {
-          check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size,vector_length);
+          check_team_size_for<TagPairSNAPComputeForce<HALF,0> >(chunk_size,team_size);
           typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALF,0> > policy_force(chunk_size,team_size,vector_length);
           Kokkos::parallel_for(policy_force
             ,*this);
         } else if (neighflag == HALFTHREAD) {
-          check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size,vector_length);
+          check_team_size_for<TagPairSNAPComputeForce<HALFTHREAD,0> >(chunk_size,team_size);
           typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<HALFTHREAD,0> > policy_force(chunk_size,team_size,vector_length);
           Kokkos::parallel_for(policy_force
             ,*this);
@@ -478,8 +501,8 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
    allocate all arrays
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-void PairSNAPKokkos<DeviceType>::allocate()
+template<class DeviceType, typename real_type, int vector_length>
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::allocate()
 {
   PairSNAP::allocate();
 
@@ -492,8 +515,8 @@ void PairSNAPKokkos<DeviceType>::allocate()
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-double PairSNAPKokkos<DeviceType>::init_one(int i, int j)
+template<class DeviceType, typename real_type, int vector_length>
+double PairSNAPKokkos<DeviceType, real_type, vector_length>::init_one(int i, int j)
 {
   double cutone = PairSNAP::init_one(i,j);
   k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
@@ -506,16 +529,16 @@ double PairSNAPKokkos<DeviceType>::init_one(int i, int j)
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
+template<class DeviceType, typename real_type, int vector_length>
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::coeff(int narg, char **arg)
 {
   PairSNAP::coeff(narg,arg);
 
   // Set up element lists
 
-  d_radelem = Kokkos::View<F_FLOAT*, DeviceType>("pair:radelem",nelements);
-  d_wjelem = Kokkos::View<F_FLOAT*, DeviceType>("pair:wjelem",nelements);
-  d_coeffelem = Kokkos::View<F_FLOAT**, Kokkos::LayoutRight, DeviceType>("pair:coeffelem",nelements,ncoeffall);
+  d_radelem = Kokkos::View<real_type*, DeviceType>("pair:radelem",nelements);
+  d_wjelem = Kokkos::View<real_type*, DeviceType>("pair:wjelem",nelements);
+  d_coeffelem = Kokkos::View<real_type**, Kokkos::LayoutRight, DeviceType>("pair:coeffelem",nelements,ncoeffall);
 
   auto h_radelem = Kokkos::create_mirror_view(d_radelem);
   auto h_wjelem = Kokkos::create_mirror_view(d_wjelem);
@@ -539,25 +562,374 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
   Kokkos::deep_copy(d_coeffelem,h_coeffelem);
   Kokkos::deep_copy(d_map,h_map);
 
-  snaKK = SNAKokkos<DeviceType>(rfac0,twojmax,
+  snaKK = SNAKokkos<DeviceType, real_type, vector_length>(rfac0,twojmax,
                   rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements);
   snaKK.grow_rij(0,0);
   snaKK.init();
 }
 
 /* ----------------------------------------------------------------------
-   Begin routines that are called on both CPU and GPU codepaths
+   Begin routines that are unique to the GPU codepath. These take advantage
+   of AoSoA data layouts and scratch memory for recursive polynomials
 ------------------------------------------------------------------------- */
 
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBeta,const int& ii) const {
+
+  if (ii >= chunk_size) return;
+
+  const int iatom_mod = ii % vector_length;
+  const int iatom_div = ii / vector_length;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+    d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1];
+  }
+
+  if (quadraticflag) {
+    const auto idxb_max = my_sna.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      auto bveci = my_sna.blist(idxb, idx_chem, ii);
+      d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        real_type bvecj = my_sna.blist(jdxb, jdx_chem, ii);
+        d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
+        d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
+        k++;
+      }
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeigh>::member_type& team) const {
+
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract atom number
+  int ii = team.team_rank() + team.league_rank() * team.team_size();
+  if (ii >= chunk_size) return;
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff.
+  // If it is, type_cache is assigned to the atom type.
+  // If it's not, it's assigned to -1.
+  const int tile_size = max_neighs; // number of elements per thread
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team
+  int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift;
+
+  // Load various info about myself up front
+  const int i = d_ilist[ii + chunk_offset];
+  const F_FLOAT xtmp = x(i,0);
+  const F_FLOAT ytmp = x(i,1);
+  const F_FLOAT ztmp = x(i,2);
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  const double radi = d_radelem[ielem];
+
+  const int num_neighs = d_numneigh[i];
+
+  // rij[][3] = displacements between atom I and those neighbors
+  // inside = indices of neighbors of I within cutoff
+  // wj = weights for neighbors of I within cutoff
+  // rcutij = cutoffs for neighbors of I within cutoff
+  // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+  // Compute the number of neighbors, store rsq
+  int ninside = 0;
+  Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs),
+    [&] (const int jj, int& count) {
+    T_INT j = d_neighbors(i,jj);
+    const F_FLOAT dx = x(j,0) - xtmp;
+    const F_FLOAT dy = x(j,1) - ytmp;
+    const F_FLOAT dz = x(j,2) - ztmp;
+
+    int jtype = type(j);
+    const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
+
+    if (rsq >= rnd_cutsq(itype,jtype)) {
+      jtype = -1; // use -1 to signal it's outside the radius
+    }
+
+    type_cache[jj] = jtype;
+
+    if (jtype >= 0)
+     count++;
+  }, ninside);
+
+  d_ninside(ii) = ninside;
+
+  Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs),
+    [&] (const int jj, int& offset, bool final) {
+
+    const int jtype = type_cache[jj];
+
+    if (jtype >= 0) {
+      if (final) {
+        T_INT j = d_neighbors(i,jj);
+        const F_FLOAT dx = x(j,0) - xtmp;
+        const F_FLOAT dy = x(j,1) - ytmp;
+        const F_FLOAT dz = x(j,2) - ztmp;
+        const int elem_j = d_map[jtype];
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[elem_j]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[elem_j])*rcutfac);
+        my_sna.inside(ii,offset) = j;
+        if (chemflag)
+          my_sna.element(ii,offset) = elem_j;
+        else
+          my_sna.element(ii,offset) = 0;
+      }
+      offset++;
+    }
+  });
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  const int ninside = d_ninside(ii);
+  if (jnbor >= ninside) return;
+
+  my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int ii = iatom_mod + iatom_div * vector_length;
+  if (ii >= chunk_size) return;
+
+  int itype = type(ii);
+  int ielem = d_map[itype];
+
+  my_sna.pre_ui(iatom_mod, j, ielem, iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui;
+
+  // extract neighbor index, iatom_div
+  const int iatom_div = flattened_idx / (max_neighs * (twojmax + 1));
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  const int jj = jj_jbend - jbend * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.compute_ui(team,iatom_mod, jbend, jj, iatom_div);
+  });
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (idxu > my_sna.idxu_max) return;
+
+  int elem_count = chemflag ? nelements : 1;
+
+  for (int ielem = 0; ielem < elem_count; ielem++) {
+
+    const FullHalfMapper mapper = my_sna.idxu_full_half[idxu];
+
+    auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+    auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div);
+
+    if (mapper.flip_sign == 1) {
+      utot_im = -utot_im;
+    } else if (mapper.flip_sign == -1) {
+      utot_re = -utot_re;
+    }
+
+    my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
+
+    if (mapper.flip_sign == 0) {
+      my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+      my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.;
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjz >= my_sna.idxz_max) return;
+
+  my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjz >= my_sna.idxz_max) return;
+
+  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (jjb >= my_sna.idxb_max) return;
+
+  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  const int iatom = iatom_mod + iatom_div * vector_length;
+  if (iatom >= chunk_size) return;
+
+  if (idxb >= my_sna.idxb_max) return;
+
+  const int ntriples = my_sna.ntriples;
+
+  for (int itriple = 0; itriple < ntriples; itriple++) {
+
+    const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
+
+    my_sna.blist(idxb, itriple, iatom) = blocal;
+  }
+
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  // extract flattened atom_div / neighbor number / bend location
+  int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj;
+
+  // extract neighbor index, iatom_div
+  const int iatom_div = flattened_idx / (max_neighs * (twojmax + 1));
+  const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1));
+  const int jbend = jj_jbend / max_neighs;
+  const int jj = jj_jbend - jbend * max_neighs;
+
+  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length),
+    [&] (const int iatom_mod) {
+    const int ii = iatom_mod + vector_length * iatom_div;
+    if (ii >= chunk_size) return;
+
+    const int ninside = d_ninside(ii);
+    if (jj >= ninside) return;
+
+    my_sna.compute_fused_deidrj(team, iatom_mod, jbend, jj, iatom_div);
+
+  });
+
+}
+
+/* ----------------------------------------------------------------------
+   Begin routines that are unique to the CPU codepath. These do not take
+   advantage of AoSoA data layouts, but that could be a good point of
+   future optimization and unification with the above kernels. It's unlikely
+   that scratch memory optimizations will ever be useful for the CPU due to
+   different arithmetic intensity requirements for the CPU vs GPU.
+------------------------------------------------------------------------- */
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPBetaCPU,const int& ii) const {
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const int ielem = d_map[itype];
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
+
+  auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
+
+  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+    d_beta(icoeff,ii) = d_coeffi[icoeff+1];
+
+  if (quadraticflag) {
+    const auto idxb_max = my_sna.idxb_max;
+    int k = ncoeff+1;
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+      const auto idxb = icoeff % idxb_max;
+      const auto idx_chem = icoeff / idxb_max;
+      auto bveci = my_sna.blist(idxb,idx_chem,ii);
+      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
+      k++;
+      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+        const auto jdxb = jcoeff % idxb_max;
+        const auto jdx_chem = jcoeff / idxb_max;
+        auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
+        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
+        k++;
+      }
+    }
+  }
+}
+
+template<class DeviceType, typename real_type, int vector_length>
+KOKKOS_INLINE_FUNCTION
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeNeighCPU>::member_type& team) const {
+
 
   int ii = team.league_rank();
   const int i = d_ilist[ii + chunk_offset];
-  SNAKokkos<DeviceType> my_sna = snaKK;
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
   const double xtmp = x(i,0);
   const double ytmp = x(i,1);
   const double ztmp = x(i,2);
@@ -585,7 +957,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
       const int jtype = type(j);
       const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
 
-      if ( rsq < rnd_cutsq(itype,jtype) )
+      if (rsq < rnd_cutsq(itype,jtype))
        count++;
     });
   },ninside);
@@ -605,22 +977,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
     const F_FLOAT rsq = dx*dx + dy*dy + dz*dz;
     const int elem_j = d_map[jtype];
 
-    if ( rsq < rnd_cutsq(itype,jtype) ) {
+    if (rsq < rnd_cutsq(itype,jtype)) {
       if (final) {
-#ifdef LMP_KOKKOS_GPU
-        if (!host_flag) {
-          my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac,
-                                      d_wjelem[elem_j]);
-        } else {
-#endif
-          my_sna.rij(ii,offset,0) = dx;
-          my_sna.rij(ii,offset,1) = dy;
-          my_sna.rij(ii,offset,2) = dz;
-          my_sna.wj(ii,offset) = d_wjelem[elem_j];
-          my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac;
-#ifdef LMP_KOKKOS_GPU
-        }
-#endif
+        my_sna.rij(ii,offset,0) = static_cast<real_type>(dx);
+        my_sna.rij(ii,offset,1) = static_cast<real_type>(dy);
+        my_sna.rij(ii,offset,2) = static_cast<real_type>(dz);
+        my_sna.wj(ii,offset) = static_cast<real_type>(d_wjelem[elem_j]);
+        my_sna.rcutij(ii,offset) = static_cast<real_type>((radi + d_radelem[elem_j])*rcutfac);
         my_sna.inside(ii,offset) = j;
         if (chemflag)
           my_sna.element(ii,offset) = elem_j;
@@ -632,282 +995,11 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeNeigh,const typen
   });
 }
 
-/* ----------------------------------------------------------------------
-   Begin routines that are unique to the GPU codepath. These take advantage
-   of AoSoA data layouts and scratch memory for recursive polynomials
-------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBeta,const int& ii) const {
-
-  if (ii >= chunk_size) return;
-
-  const int iatom_mod = ii % 32;
-  const int iatom_div = ii / 32;
-
-  const int i = d_ilist[ii + chunk_offset];
-  const int itype = type[i];
-  const int ielem = d_map[itype];
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-    d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1];
-  }
-
-  if (quadraticflag) {
-    const auto idxb_max = my_sna.idxb_max;
-    int k = ncoeff+1;
-    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-      const auto idxb = icoeff % idxb_max;
-      const auto idx_chem = icoeff / idxb_max;
-      double bveci = my_sna.blist(idxb, idx_chem, ii);
-      d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci;
-      k++;
-      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-        const auto jdxb = jcoeff % idxb_max;
-        const auto jdx_chem = jcoeff / idxb_max;
-        double bvecj = my_sna.blist(jdxb, jdx_chem, ii);
-        d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj;
-        d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci;
-        k++;
-      }
-    }
-  }
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUi>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  // Extract the atom number
-  const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-  int itype = type(ii);
-  int ielem = d_map[itype];
-
-  my_sna.pre_ui(team,ii,ielem);
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUi>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
-
-  my_sna.compute_ui(team,ii,jj);
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (j > twojmax) return;
-
-  int elem_count = chemflag ? nelements : 1;
-
-  for (int ielem = 0; ielem < elem_count; ielem++) {
-    const int jju_half = my_sna.idxu_half_block(j);
-    const int jju = my_sna.idxu_block(j);
-
-    for (int mb = 0; 2*mb <= j; mb++) {
-      for (int ma = 0; ma <= j; ma++) {
-        // Extract top half
-
-        const int idxu_shift = mb * (j + 1) + ma;
-        const int idxu_half = jju_half + idxu_shift;
-        const int idxu = jju + idxu_shift;
-
-        auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom);
-        auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom);
-
-        // Store
-        my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im };
-
-        // Also zero yi
-        my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
-        my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.;
-
-        // Symmetric term
-        const int sign_factor = (((ma+mb)%2==0)?1:-1);
-        const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1);
-
-        if (sign_factor == 1) {
-          utot_im = -utot_im;
-        } else {
-          utot_re = -utot_re;
-        }
-
-        my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im };
-
-        // No need to zero symmetrized ylist
-      }
-    }
-  }
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= my_sna.idxz_max) return;
-
-  my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack);
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (idxu_half >= my_sna.idxu_half_max) return;
-
-  int elem_count = chemflag ? nelements : 1;
-  for (int ielem = 0; ielem < elem_count; ielem++) {
-    const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div);
-    const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div);
-
-    my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im };
-  }
-
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (jjz >= my_sna.idxz_max) return;
-
-  my_sna.compute_zi(iatom_mod,jjz,iatom_div);
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (jjb >= my_sna.idxb_max) return;
-
-  my_sna.compute_bi(iatom_mod,jjb,iatom_div);
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  const int iatom = iatom_mod + iatom_div * 32;
-  if (iatom >= chunk_size) return;
-
-  if (idxb >= my_sna.idxb_max) return;
-
-  const int ntriples = my_sna.ntriples;
-
-  for (int itriple = 0; itriple < ntriples; itriple++) {
-
-    const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div);
-
-    my_sna.blist(idxb, itriple, iatom) = blocal;
-  }
-
-}
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  // Extract the atom number
-  int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
-  if (ii >= chunk_size) return;
-
-  // Extract the neighbor number
-  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
-  const int ninside = d_ninside(ii);
-  if (jj >= ninside) return;
-
-  my_sna.compute_fused_deidrj(team,ii,jj);
-}
-
-/* ----------------------------------------------------------------------
-   Begin routines that are unique to the CPU codepath. These do not take
-   advantage of AoSoA data layouts, but that could be a good point of
-   future optimization and unification with the above kernels. It's unlikely
-   that scratch memory optimizations will ever be useful for the CPU due to
-   different arithmetic intensity requirements for the CPU vs GPU.
-------------------------------------------------------------------------- */
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPBetaCPU,const int& ii) const {
-
-  const int i = d_ilist[ii + chunk_offset];
-  const int itype = type[i];
-  const int ielem = d_map[itype];
-  SNAKokkos<DeviceType> my_sna = snaKK;
-
-  Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-    d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
-
-  for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-    d_beta(icoeff,ii) = d_coeffi[icoeff+1];
-
-  if (quadraticflag) {
-    const auto idxb_max = my_sna.idxb_max;
-    int k = ncoeff+1;
-    for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-      const auto idxb = icoeff % idxb_max;
-      const auto idx_chem = icoeff / idxb_max;
-      double bveci = my_sna.blist(idxb,idx_chem,ii);
-      d_beta(icoeff,ii) += d_coeffi[k]*bveci;
-      k++;
-      for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-        const auto jdxb = jcoeff % idxb_max;
-        const auto jdx_chem = jcoeff / idxb_max;
-        double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
-        d_beta(icoeff,ii) += d_coeffi[k]*bvecj;
-        d_beta(jcoeff,ii) += d_coeffi[k]*bveci;
-        k++;
-      }
-    }
-  }
-}
-
-
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPPreUiCPU>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // Extract the atom number
   const int ii = team.team_rank() + team.team_size() * team.league_rank();
@@ -920,10 +1012,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPPreUiCPU,const typename
 
 
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeUiCPU>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // Extract the atom number
   int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
@@ -937,10 +1029,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typen
   my_sna.compute_ui_cpu(team,ii,jj);
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   if (iatom >= chunk_size) return;
 
@@ -987,32 +1079,32 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPTransformUiCPU, const in
   }
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeYiCPU,const int& ii) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
   my_sna.compute_yi_cpu(ii,d_beta);
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeZiCPU,const int& ii) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeZiCPU,const int& ii) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
   my_sna.compute_zi_cpu(ii);
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeBiCPU>::member_type& team) const {
   int ii = team.league_rank();
-  SNAKokkos<DeviceType> my_sna = snaKK;
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
   my_sna.compute_bi_cpu(team,ii);
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDuidrjCPU>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // Extract the atom number
   int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
@@ -1026,10 +1118,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const t
   my_sna.compute_duidrj_cpu(team,ii,jj);
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
-  SNAKokkos<DeviceType> my_sna = snaKK;
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeDeidrjCPU>::member_type& team) const {
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
 
   // Extract the atom number
   int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
@@ -1049,10 +1141,10 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const t
    likely not worth it.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team, EV_FLOAT& ev) const {
 
   // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
 
@@ -1061,7 +1153,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
 
   int ii = team.league_rank();
   const int i = d_ilist[ii + chunk_offset];
-  SNAKokkos<DeviceType> my_sna = snaKK;
+  SNAKokkos<DeviceType, real_type, vector_length> my_sna = snaKK;
   const int ninside = d_ninside(ii);
 
   Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside),
@@ -1102,14 +1194,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
 
       const int itype = type(i);
       const int ielem = d_map[itype];
-      Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-        d_coeffi(d_coeffelem,ielem,Kokkos::ALL);
+      auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL);
 
       Kokkos::single(Kokkos::PerTeam(team), [&] () {
 
         // evdwl = energy of atom I, sum over coeffs_k * Bi_k
 
-        double evdwl = d_coeffi[0];
+        auto evdwl = d_coeffi[0];
 
         // E = beta.B + 0.5*B^t.alpha.B
 
@@ -1130,13 +1221,13 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
             const auto idxb = icoeff % idxb_max;
             const auto idx_chem = icoeff / idxb_max;
-            double bveci = my_sna.blist(idxb,idx_chem,ii);
+            auto bveci = my_sna.blist(idxb,idx_chem,ii);
 
             evdwl += 0.5*d_coeffi[k++]*bveci*bveci;
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
               auto jdxb = jcoeff % idxb_max;
               auto jdx_chem = jcoeff / idxb_max;
-              double bvecj = my_sna.blist(jdxb,jdx_chem,ii);
+              auto bvecj = my_sna.blist(jdxb,jdx_chem,ii);
 
               evdwl += d_coeffi[k++]*bveci*bvecj;
             }
@@ -1151,20 +1242,20 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,E
   }
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 template<int NEIGHFLAG, int EVFLAG>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::operator() (TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>,const typename Kokkos::TeamPolicy<DeviceType,TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG> >::member_type& team) const {
   EV_FLOAT ev;
   this->template operator()<NEIGHFLAG,EVFLAG>(TagPairSNAPComputeForce<NEIGHFLAG,EVFLAG>(), team, ev);
 }
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
-void PairSNAPKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
       const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
       const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
 {
@@ -1209,24 +1300,24 @@ void PairSNAPKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const i
    memory usage
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-double PairSNAPKokkos<DeviceType>::memory_usage()
+template<class DeviceType, typename real_type, int vector_length>
+double PairSNAPKokkos<DeviceType, real_type, vector_length>::memory_usage()
 {
   double bytes = Pair::memory_usage();
   int n = atom->ntypes+1;
   bytes += n*n*sizeof(int);
-  bytes += n*n*sizeof(double);
-  bytes += (2*ncoeffall)*sizeof(double);
-  bytes += (ncoeff*3)*sizeof(double);
+  bytes += n*n*sizeof(real_type);
+  bytes += (2*ncoeffall)*sizeof(real_type);
+  bytes += (ncoeff*3)*sizeof(real_type);
   bytes += snaKK.memory_usage();
   return bytes;
 }
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
-void PairSNAPKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::check_team_size_for(int inum, int &team_size) {
   int team_size_max;
 
   team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
@@ -1235,9 +1326,9 @@ void PairSNAPKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, i
     team_size = team_size_max/vector_length;
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 template<class TagStyle>
-void PairSNAPKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size, int vector_length) {
+void PairSNAPKokkos<DeviceType, real_type, vector_length>::check_team_size_reduce(int inum, int &team_size) {
   int team_size_max;
 
   team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
@@ -1246,4 +1337,79 @@ void PairSNAPKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size
     team_size = team_size_max/vector_length;
 }
 
+
+/* ----------------------------------------------------------------------
+   routines used by template reference classes
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairSNAPKokkosDevice<DeviceType>::PairSNAPKokkosDevice(class LAMMPS *lmp)
+   : PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_DEVICE_VECLEN>(lmp) { ; }
+
+template<class DeviceType>
+void PairSNAPKokkosDevice<DeviceType>::coeff(int narg, char **arg)
+{
+  Base::coeff(narg, arg);
+}
+
+template<class DeviceType>
+void PairSNAPKokkosDevice<DeviceType>::init_style()
+{
+  Base::init_style();
+}
+
+template<class DeviceType>
+double PairSNAPKokkosDevice<DeviceType>::init_one(int i, int j)
+{
+  return Base::init_one(i, j);
+}
+
+template<class DeviceType>
+void PairSNAPKokkosDevice<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  Base::compute(eflag_in, vflag_in);
+}
+
+template<class DeviceType>
+double PairSNAPKokkosDevice<DeviceType>::memory_usage()
+{
+  return Base::memory_usage();
+}
+
+#ifdef LMP_KOKKOS_GPU
+template<class DeviceType>
+PairSNAPKokkosHost<DeviceType>::PairSNAPKokkosHost(class LAMMPS *lmp)
+   : PairSNAPKokkos<DeviceType, SNAP_KOKKOS_REAL, SNAP_KOKKOS_HOST_VECLEN>(lmp) { ; }
+
+template<class DeviceType>
+void PairSNAPKokkosHost<DeviceType>::coeff(int narg, char **arg)
+{
+  Base::coeff(narg, arg);
+}
+
+template<class DeviceType>
+void PairSNAPKokkosHost<DeviceType>::init_style()
+{
+  Base::init_style();
+}
+
+template<class DeviceType>
+double PairSNAPKokkosHost<DeviceType>::init_one(int i, int j)
+{
+  return Base::init_one(i, j);
+}
+
+template<class DeviceType>
+void PairSNAPKokkosHost<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  Base::compute(eflag_in, vflag_in);
+}
+
+template<class DeviceType>
+double PairSNAPKokkosHost<DeviceType>::memory_usage()
+{
+  return Base::memory_usage();
+}
+#endif
+
 }
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 7804aaef18..f183acdb57 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -25,45 +25,78 @@
 
 namespace LAMMPS_NS {
 
-template<class DeviceType>
+template<typename real_type_, int vector_length_>
+struct WignerWrapper {
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+  static constexpr int vector_length = vector_length_;
+
+  const int offset; // my offset into the vector (0, ..., vector_length - 1)
+  real_type* buffer; // buffer of real numbers
+
+  KOKKOS_INLINE_FUNCTION
+  WignerWrapper(complex* buffer_, const int offset_)
+   : offset(offset_), buffer(reinterpret_cast<real_type*>(buffer_))
+  { ; }
+
+  KOKKOS_INLINE_FUNCTION
+  complex get(const int& ma) {
+    return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void set(const int& ma, const complex& store) {
+    buffer[offset + 2 * vector_length * ma] = store.re;
+    buffer[offset + vector_length + 2 * vector_length * ma] = store.im;
+  }
+};
+
+struct alignas(8) FullHalfMapper {
+  int idxu_half;
+  int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj
+};
+
+template<class DeviceType, typename real_type_, int vector_length_>
 class SNAKokkos {
 
 public:
+  using real_type = real_type_;
+  using complex = SNAComplex<real_type>;
+  static constexpr int vector_length = vector_length_;
+
   typedef Kokkos::View<int*, DeviceType> t_sna_1i;
-  typedef Kokkos::View<double*, DeviceType> t_sna_1d;
-  typedef Kokkos::View<double*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic;
+  typedef Kokkos::View<real_type*, DeviceType> t_sna_1d;
+  typedef Kokkos::View<real_type*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1d_atomic;
   typedef Kokkos::View<int**, DeviceType> t_sna_2i;
-  typedef Kokkos::View<double**, DeviceType> t_sna_2d;
-  typedef Kokkos::View<double**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
-  typedef Kokkos::View<double***, DeviceType> t_sna_3d;
-  typedef Kokkos::View<double***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
-  typedef Kokkos::View<double***[3], DeviceType> t_sna_4d;
-  typedef Kokkos::View<double****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
-  typedef Kokkos::View<double**[3], DeviceType> t_sna_3d3;
-  typedef Kokkos::View<double*****, DeviceType> t_sna_5d;
+  typedef Kokkos::View<real_type**, DeviceType> t_sna_2d;
+  typedef Kokkos::View<real_type**, Kokkos::LayoutLeft, DeviceType> t_sna_2d_ll;
+  typedef Kokkos::View<real_type***, DeviceType> t_sna_3d;
+  typedef Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> t_sna_3d_ll;
+  typedef Kokkos::View<real_type***[3], DeviceType> t_sna_4d;
+  typedef Kokkos::View<real_type****, Kokkos::LayoutLeft, DeviceType> t_sna_4d_ll;
+  typedef Kokkos::View<real_type**[3], DeviceType> t_sna_3d3;
+  typedef Kokkos::View<real_type*****, DeviceType> t_sna_5d;
 
-  typedef Kokkos::View<SNAcomplex*, DeviceType> t_sna_1c;
-  typedef Kokkos::View<SNAcomplex*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
-  typedef Kokkos::View<SNAcomplex**, DeviceType> t_sna_2c;
-  typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
-  typedef Kokkos::View<SNAcomplex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
-  typedef Kokkos::View<SNAcomplex***, DeviceType> t_sna_3c;
-  typedef Kokkos::View<SNAcomplex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
-  typedef Kokkos::View<SNAcomplex***[3], DeviceType> t_sna_4c;
-  typedef Kokkos::View<SNAcomplex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
-  typedef Kokkos::View<SNAcomplex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
-  typedef Kokkos::View<SNAcomplex**[3], DeviceType> t_sna_3c3;
-  typedef Kokkos::View<SNAcomplex*****, DeviceType> t_sna_5c;
-
-  typedef Kokkos::View<CayleyKleinPack**, DeviceType> t_sna_2ckp;
+  typedef Kokkos::View<complex*, DeviceType> t_sna_1c;
+  typedef Kokkos::View<complex*, typename KKDevice<DeviceType>::value, Kokkos::MemoryTraits<Kokkos::Atomic> > t_sna_1c_atomic;
+  typedef Kokkos::View<complex**, DeviceType> t_sna_2c;
+  typedef Kokkos::View<complex**, Kokkos::LayoutLeft, DeviceType> t_sna_2c_ll;
+  typedef Kokkos::View<complex**, Kokkos::LayoutRight, DeviceType> t_sna_2c_lr;
+  typedef Kokkos::View<complex***, DeviceType> t_sna_3c;
+  typedef Kokkos::View<complex***, Kokkos::LayoutLeft, DeviceType> t_sna_3c_ll;
+  typedef Kokkos::View<complex***[3], DeviceType> t_sna_4c;
+  typedef Kokkos::View<complex***[3], Kokkos::LayoutLeft, DeviceType> t_sna_4c3_ll;
+  typedef Kokkos::View<complex****, Kokkos::LayoutLeft, DeviceType> t_sna_4c_ll;
+  typedef Kokkos::View<complex**[3], DeviceType> t_sna_3c3;
+  typedef Kokkos::View<complex*****, DeviceType> t_sna_5c;
 
 inline
   SNAKokkos() {};
   KOKKOS_INLINE_FUNCTION
-  SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
+  SNAKokkos(const SNAKokkos<DeviceType,real_type,vector_length>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
 
 inline
-  SNAKokkos(double, int, double, int, int, int, int, int, int);
+  SNAKokkos(real_type, int, real_type, int, int, int, int, int, int);
 
   KOKKOS_INLINE_FUNCTION
   ~SNAKokkos();
@@ -81,17 +114,16 @@ inline
 
   // functions for bispectrum coefficients, GPU only
   KOKKOS_INLINE_FUNCTION
-  void compute_cayley_klein(const int&, const int&, const double&, const double&,
-                            const double&, const double&, const double&);
+  void compute_cayley_klein(const int&, const int&, const int&);
   KOKKOS_INLINE_FUNCTION
-  void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&,const int&); // ForceSNAP
+  void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
-  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
+  void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_zi(const int&, const int&, const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_yi(int,int,int,
-   const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
+   const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack); // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_bi(const int&, const int&, const int&);    // ForceSNAP
 
@@ -104,34 +136,33 @@ inline
   void compute_zi_cpu(const int&);    // ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_yi_cpu(int,
-   const Kokkos::View<F_FLOAT**, DeviceType> &beta); // ForceSNAP
+   const Kokkos::View<real_type**, DeviceType> &beta); // ForceSNAP
     KOKKOS_INLINE_FUNCTION
   void compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int);    // ForceSNAP
 
   // functions for derivatives, GPU only
   KOKKOS_INLINE_FUNCTION
-  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
+  void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int, const int, const int); //ForceSNAP
 
   // functions for derivatives, CPU only
   KOKKOS_INLINE_FUNCTION
   void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
   KOKKOS_INLINE_FUNCTION
   void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
-  KOKKOS_INLINE_FUNCTION
-  double compute_sfac(double, double); // add_uarraytot, compute_duarray
-  KOKKOS_INLINE_FUNCTION
-  double compute_dsfac(double, double); // compute_duarray
-  KOKKOS_INLINE_FUNCTION
-  void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein
 
-  // efficient complex FMA
-  // efficient caxpy (i.e., y += a x)
-  static KOKKOS_FORCEINLINE_FUNCTION
-  void caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y);
+  KOKKOS_INLINE_FUNCTION
+  real_type compute_sfac(real_type, real_type); // add_uarraytot, compute_duarray
+
+  KOKKOS_INLINE_FUNCTION
+  real_type compute_dsfac(real_type, real_type); // compute_duarray
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein
 
-  // efficient complex FMA, conjugate of scalar
   static KOKKOS_FORCEINLINE_FUNCTION
-  void caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y);
+  void sincos_wrapper(double x, double* sin_, double *cos_) { sincos(x, sin_, cos_); }
+  static KOKKOS_FORCEINLINE_FUNCTION
+  void sincos_wrapper(float x, float* sin_, float *cos_) { sincosf(x, sin_, cos_); }
 
   // Set the direction for split ComputeDuidrj
   KOKKOS_INLINE_FUNCTION
@@ -146,10 +177,6 @@ inline
 
   //per sna class instance for OMP use
 
-  // Alternative to rij, wj, rcutij...
-  // just calculate everything up front
-  t_sna_2ckp cayleyklein;
-
   // Per InFlight Particle
   t_sna_3d rij;
   t_sna_2i inside;
@@ -175,8 +202,14 @@ inline
   t_sna_4c3_ll dulist;
 
   // Modified structures for GPU backend
-  t_sna_3d_ll ulisttot_re; // split real,
-  t_sna_3d_ll ulisttot_im; // imag
+  t_sna_3c_ll a_pack; // Cayley-Klein `a`
+  t_sna_3c_ll b_pack; // `b`
+  t_sna_4c_ll da_pack; // `da`
+  t_sna_4c_ll db_pack; // `db`
+  t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z}
+
+  t_sna_4d_ll ulisttot_re_pack; // split real,
+  t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened
   t_sna_4c_ll ulisttot_pack; // AoSoA layout
   t_sna_4c_ll zlist_pack; // AoSoA layout
   t_sna_4d_ll blist_pack;
@@ -191,7 +224,7 @@ inline
   int ntriples;
 
 private:
-  double rmin0, rfac0;
+  real_type rmin0, rfac0;
 
   //use indexlist instead of loops, constructor generates these
   // Same across all SNAKokkos
@@ -203,6 +236,7 @@ public:
   Kokkos::View<int*, DeviceType> idxu_block;
   Kokkos::View<int*, DeviceType> idxu_half_block;
   Kokkos::View<int*, DeviceType> idxu_cache_block;
+  Kokkos::View<FullHalfMapper*, DeviceType> idxu_full_half;
 
 private:
   Kokkos::View<int***, DeviceType> idxz_block;
@@ -231,12 +265,12 @@ inline
   void init_rootpqarray();    // init()
 
   KOKKOS_INLINE_FUNCTION
-  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double, int); // compute_ui
+  void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, const real_type&, const real_type&, const real_type&, int); // compute_ui
 
   KOKKOS_INLINE_FUNCTION
   void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                      double, double, double,
-                      double, double); // compute_ui_cpu
+                      const real_type&, const real_type&, const real_type&,
+                      const real_type&, const real_type&); // compute_ui_cpu
 
 
   inline
@@ -246,8 +280,8 @@ inline
   int compute_ncoeff();           // SNAKokkos()
   KOKKOS_INLINE_FUNCTION
   void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
-                       double, double, double, // compute_duidrj_cpu
-                       double, double, double, double, double);
+                       const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu
+                       const real_type&, const real_type&, const real_type&, const real_type&, const real_type&);
 
   // Sets the style for the switching function
   // 0 = none
@@ -259,11 +293,11 @@ inline
   int bnorm_flag;
 
   // Self-weight
-  double wself;
+  real_type wself;
   int wselfall_flag;
 
   int bzero_flag; // 1 if bzero subtracted from barray
-  Kokkos::View<double*, DeviceType> bzero; // array of B values for isolated atoms
+  Kokkos::View<real_type*, DeviceType> bzero; // array of B values for isolated atoms
 
   // for per-direction dulist calculation, specify the direction.
   int dir;
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 3060aa9527..23c1670bd8 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -25,16 +25,16 @@ namespace LAMMPS_NS {
 
 static const double MY_PI  = 3.14159265358979323846; // pi
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
-         int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in,
+SNAKokkos<DeviceType, real_type, vector_length>::SNAKokkos(real_type rfac0_in,
+         int twojmax_in, real_type rmin0_in, int switch_flag_in, int bzero_flag_in,
          int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in)
 {
   LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   host_flag = (execution_space == LAMMPS_NS::Host);
 
-  wself = 1.0;
+  wself = static_cast<real_type>(1.0);
 
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
@@ -63,7 +63,7 @@ SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
   cglist = t_sna_1d("SNAKokkos::cglist",idxcg_max);
 
   if (bzero_flag) {
-    bzero = Kokkos::View<double*, Kokkos::LayoutRight, DeviceType>("sna:bzero",twojmax+1);
+    bzero = Kokkos::View<real_type*, Kokkos::LayoutRight, DeviceType>("sna:bzero",twojmax+1);
     auto h_bzero = Kokkos::create_mirror_view(bzero);
 
     double www = wself*wself*wself;
@@ -78,20 +78,20 @@ SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-SNAKokkos<DeviceType>::~SNAKokkos()
+SNAKokkos<DeviceType, real_type, vector_length>::~SNAKokkos()
 {
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-void SNAKokkos<DeviceType>::build_indexlist()
+void SNAKokkos<DeviceType, real_type, vector_length>::build_indexlist()
 {
   // index list for cglist
 
   int jdim = twojmax + 1;
-  idxcg_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxcg_block",jdim,jdim,jdim);
+  idxcg_block = Kokkos::View<int***, DeviceType>(Kokkos::NoInit("SNAKokkos::idxcg_block"),jdim,jdim,jdim);
   auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block);
 
   int idxcg_count = 0;
@@ -109,7 +109,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
   // index list for uarray
   // need to include both halves
 
-  idxu_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_block",jdim);
+  idxu_block = Kokkos::View<int*, DeviceType>(Kokkos::NoInit("SNAKokkos::idxu_block"),jdim);
   auto h_idxu_block = Kokkos::create_mirror_view(idxu_block);
 
   int idxu_count = 0;
@@ -124,7 +124,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
   Kokkos::deep_copy(idxu_block,h_idxu_block);
 
   // index list for half uarray
-  idxu_half_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_half_block",jdim);
+  idxu_half_block = Kokkos::View<int*, DeviceType>(Kokkos::NoInit("SNAKokkos::idxu_half_block"),jdim);
   auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block);
 
   int idxu_half_count = 0;
@@ -137,10 +137,35 @@ void SNAKokkos<DeviceType>::build_indexlist()
   idxu_half_max = idxu_half_count;
   Kokkos::deep_copy(idxu_half_block, h_idxu_half_block);
 
+  // mapping between full and half indexing, encoding flipping
+  idxu_full_half = Kokkos::View<FullHalfMapper*, DeviceType>(Kokkos::NoInit("SNAKokkos::idxu_full_half"),idxu_max);
+  auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half);
+
+  idxu_count = 0;
+  for(int j = 0; j <= twojmax; j++) {
+    int jju_half = h_idxu_half_block[j];
+    for(int mb = 0; mb <= j; mb++) {
+      for(int ma = 0; ma <= j; ma++) {
+        FullHalfMapper mapper;
+        if (2*mb <= j) {
+          mapper.idxu_half = jju_half + mb * (j + 1) + ma;
+          mapper.flip_sign = 0;
+        } else {
+          mapper.idxu_half = jju_half + (j + 1 - mb) * (j + 1) - (ma + 1);
+          mapper.flip_sign = (((ma+mb)%2==0)?1:-1);
+        }
+        h_idxu_full_half[idxu_count] = mapper;
+        idxu_count++;
+      }
+    }
+  }
+
+  Kokkos::deep_copy(idxu_full_half, h_idxu_full_half);
+
   // index list for "cache" uarray
   // this is the GPU scratch memory requirements
   // applied the CPU structures
-  idxu_cache_block = Kokkos::View<int*, DeviceType>("SNAKokkos::idxu_cache_block",jdim);
+  idxu_cache_block = Kokkos::View<int*, DeviceType>(Kokkos::NoInit("SNAKokkos::idxu_cache_block"),jdim);
   auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block);
 
   int idxu_cache_count = 0;
@@ -162,7 +187,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
         if (j >= j1) idxb_count++;
 
   idxb_max = idxb_count;
-  idxb = Kokkos::View<int*[3], DeviceType>("SNAKokkos::idxb",idxb_max);
+  idxb = Kokkos::View<int*[3], DeviceType>(Kokkos::NoInit("SNAKokkos::idxb"),idxb_max);
   auto h_idxb = Kokkos::create_mirror_view(idxb);
 
   idxb_count = 0;
@@ -179,7 +204,7 @@ void SNAKokkos<DeviceType>::build_indexlist()
 
   // reverse index list for beta and b
 
-  idxb_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxb_block",jdim,jdim,jdim);
+  idxb_block = Kokkos::View<int***, DeviceType>(Kokkos::NoInit("SNAKokkos::idxb_block"),jdim,jdim,jdim);
   auto h_idxb_block = Kokkos::create_mirror_view(idxb_block);
 
   idxb_count = 0;
@@ -205,10 +230,10 @@ void SNAKokkos<DeviceType>::build_indexlist()
             idxz_count++;
 
   idxz_max = idxz_count;
-  idxz = Kokkos::View<int*[10], DeviceType>("SNAKokkos::idxz",idxz_max);
+  idxz = Kokkos::View<int*[10], DeviceType>(Kokkos::NoInit("SNAKokkos::idxz"),idxz_max);
   auto h_idxz = Kokkos::create_mirror_view(idxz);
 
-  idxz_block = Kokkos::View<int***, DeviceType>("SNAKokkos::idxz_block", jdim,jdim,jdim);
+  idxz_block = Kokkos::View<int***, DeviceType>(Kokkos::NoInit("SNAKokkos::idxz_block"), jdim,jdim,jdim);
   auto h_idxz_block = Kokkos::create_mirror_view(idxz_block);
 
   idxz_count = 0;
@@ -249,53 +274,63 @@ void SNAKokkos<DeviceType>::build_indexlist()
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-void SNAKokkos<DeviceType>::init()
+void SNAKokkos<DeviceType, real_type, vector_length>::init()
 {
   init_clebsch_gordan();
   init_rootpqarray();
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
+void SNAKokkos<DeviceType, real_type, vector_length>::grow_rij(int newnatom, int newnmax)
 {
   if(newnatom <= natom && newnmax <= nmax) return;
   natom = newnatom;
   nmax = newnmax;
 
+  int natom_div = (natom + vector_length - 1) / vector_length;
+
+  rij = t_sna_3d(Kokkos::NoInit("sna:rij"),natom,nmax,3);
+  wj = t_sna_2d(Kokkos::NoInit("sna:wj"),natom,nmax);
+  rcutij = t_sna_2d(Kokkos::NoInit("sna:rcutij"),natom,nmax);
   inside = t_sna_2i(Kokkos::NoInit("sna:inside"),natom,nmax);
-  element = t_sna_2i(Kokkos::NoInit("sna:rcutij"),natom,nmax);
+  element = t_sna_2i(Kokkos::NoInit("sna:element"),natom,nmax);
   dedr = t_sna_3d(Kokkos::NoInit("sna:dedr"),natom,nmax,3);
 
 #ifdef LMP_KOKKOS_GPU
   if (!host_flag) {
-
-    cayleyklein = t_sna_2ckp(Kokkos::NoInit("sna:cayleyklein"), natom, nmax);
+    a_pack = t_sna_3c_ll(Kokkos::NoInit("sna:a_pack"),vector_length,nmax,natom_div);
+    b_pack = t_sna_3c_ll(Kokkos::NoInit("sna:b_pack"),vector_length,nmax,natom_div);
+    da_pack = t_sna_4c_ll(Kokkos::NoInit("sna:da_pack"),vector_length,nmax,natom_div,3);
+    db_pack = t_sna_4c_ll(Kokkos::NoInit("sna:db_pack"),vector_length,nmax,natom_div,3);
+    sfac_pack = t_sna_4d_ll(Kokkos::NoInit("sna:sfac_pack"),vector_length,nmax,natom_div,4);
     ulisttot = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),1,1,1); // dummy allocation
     ulisttot_full = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),1,1,1);
-    ulisttot_re = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_re"),idxu_half_max,nelements,natom);
-    ulisttot_im = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_im"),idxu_half_max,nelements,natom);
-    ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),32,idxu_max,nelements,(natom+32-1)/32);
+    ulisttot_re_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_re_pack"),vector_length,idxu_half_max,nelements,natom_div);
+    ulisttot_im_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_im_pack"),vector_length,idxu_half_max,nelements,natom_div);
+    ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),vector_length,idxu_max,nelements,natom_div);
     ulist = t_sna_3c_ll(Kokkos::NoInit("sna:ulist"),1,1,1);
     zlist = t_sna_3c_ll(Kokkos::NoInit("sna:zlist"),1,1,1);
-    zlist_pack = t_sna_4c_ll(Kokkos::NoInit("sna:zlist_pack"),32,idxz_max,ndoubles,(natom+32-1)/32);
+    zlist_pack = t_sna_4c_ll(Kokkos::NoInit("sna:zlist_pack"),vector_length,idxz_max,ndoubles,natom_div);
     blist = t_sna_3d_ll(Kokkos::NoInit("sna:blist"),idxb_max,ntriples,natom);
-    blist_pack = t_sna_4d_ll(Kokkos::NoInit("sna:blist_pack"),32,idxb_max,ntriples,(natom+32-1)/32);
-    ylist = t_sna_3c_ll(Kokkos::NoInit("sna:ylist"),idxu_half_max,nelements,natom);
-    ylist_pack_re = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_re"),32,idxu_half_max,nelements,(natom+32-1)/32);
-    ylist_pack_im = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_im"),32,idxu_half_max,nelements,(natom+32-1)/32);
+    blist_pack = t_sna_4d_ll(Kokkos::NoInit("sna:blist_pack"),vector_length,idxb_max,ntriples,natom_div);
+    ylist = t_sna_3c_ll(Kokkos::NoInit("sna:ylist"),1,1,1);
+    ylist_pack_re = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_re"),vector_length,idxu_half_max,nelements,natom_div);
+    ylist_pack_im = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_im"),vector_length,idxu_half_max,nelements,natom_div);
     dulist = t_sna_4c3_ll(Kokkos::NoInit("sna:dulist"),1,1,1);
   } else {
 #endif
-    rij = t_sna_3d(Kokkos::NoInit("sna:rij"),natom,nmax,3);
-    wj = t_sna_2d(Kokkos::NoInit("sna:wj"),natom,nmax);
-    rcutij = t_sna_2d(Kokkos::NoInit("sna:rcutij"),natom,nmax);
+    a_pack = t_sna_3c_ll(Kokkos::NoInit("sna:a_pack"),1,1,1);
+    b_pack = t_sna_3c_ll(Kokkos::NoInit("sna:b_pack"),1,1,1);
+    da_pack = t_sna_4c_ll(Kokkos::NoInit("sna:da_pack"),1,1,1,1);
+    db_pack = t_sna_4c_ll(Kokkos::NoInit("sna:db_pack"),1,1,1,1);
+    sfac_pack = t_sna_4d_ll(Kokkos::NoInit("sna:sfac_pack"),1,1,1,1);
     ulisttot = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),idxu_half_max,nelements,natom);
     ulisttot_full = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot_full"),idxu_max,nelements,natom);
-    ulisttot_re = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_re"),1,1,1);
-    ulisttot_im = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_im"),1,1,1);
+    ulisttot_re_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_re"),1,1,1,1);
+    ulisttot_im_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_im"),1,1,1,1);
     ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),1,1,1,1);
     ulist = t_sna_3c_ll(Kokkos::NoInit("sna:ulist"),idxu_cache_max,natom,nmax);
     zlist = t_sna_3c_ll(Kokkos::NoInit("sna:zlist"),idxz_max,ndoubles,natom);
@@ -323,81 +358,85 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
    ComputeFusedDeidrj, which are one warp per atom-neighbor pair.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_cayley_klein(const int& iatom, const int& jnbor, const double& x, const double& y,
-                                         const double& z, const double& rcut, const double& wj_local)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div)
 {
-  //const double x = rij(iatom,jnbor,0);
-  //const double y = rij(iatom,jnbor,1);
-  //const double z = rij(iatom,jnbor,2);
-  const double rsq = x * x + y * y + z * z;
-  const double r = sqrt(rsq);
-  //const double rcut = rcutij(iatom, jnbor);
-  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
-  const double theta0 = (r - rmin0) * rscale0;
-  double sn, cs;
-  sincos(theta0, &sn, &cs);
-  const double z0 = r * cs / sn;
-  const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
+  const int iatom = iatom_mod + vector_length * iatom_div;
+  const auto x = rij(iatom,jnbor,0);
+  const auto y = rij(iatom,jnbor,1);
+  const auto z = rij(iatom,jnbor,2);
+  const auto rsq = x * x + y * y + z * z;
+  const auto r = sqrt(rsq);
+  const auto rcut = rcutij(iatom, jnbor);
+  const auto rscale0 = rfac0 * static_cast<real_type>(MY_PI) / (rcut - rmin0);
+  const auto theta0 = (r - rmin0) * rscale0;
+  real_type sn, cs;
+  sincos_wrapper(theta0, &sn, &cs);
+  const real_type z0 = r * cs / sn;
+  const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  //const double wj_local = wj(iatom, jnbor);
-  double sfac, dsfac;
+  const auto wj_local = wj(iatom, jnbor);
+  real_type sfac, dsfac;
   compute_s_dsfac(r, rcut, sfac, dsfac);
   sfac *= wj_local;
   dsfac *= wj_local;
 
-  const double rinv = 1.0 / r;
-  const double ux = x * rinv;
-  const double uy = y * rinv;
-  const double uz = z * rinv;
+  const auto rinv = static_cast<real_type>(1.0) / r;
+  const auto ux = x * rinv;
+  const auto uy = y * rinv;
+  const auto uz = z * rinv;
 
-  const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  const auto r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
 
-  const SNAcomplex a = { z0 * r0inv, -z * r0inv };
-  const SNAcomplex b = { r0inv * y, -r0inv * x };
+  const complex a = { z0 * r0inv, -z * r0inv };
+  const complex b = { r0inv * y, -r0inv * x };
 
-  const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
+  const auto dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
 
-  const double dr0invx = dr0invdr * ux;
-  const double dr0invy = dr0invdr * uy;
-  const double dr0invz = dr0invdr * uz;
+  const auto dr0invx = dr0invdr * ux;
+  const auto dr0invy = dr0invdr * uy;
+  const auto dr0invz = dr0invdr * uz;
 
-  const double dz0x = dz0dr * ux;
-  const double dz0y = dz0dr * uy;
-  const double dz0z = dz0dr * uz;
+  const auto dz0x = dz0dr * ux;
+  const auto dz0y = dz0dr * uy;
+  const auto dz0z = dz0dr * uz;
 
-  const SNAcomplex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
-  const SNAcomplex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
-  const SNAcomplex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
+  const complex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx };
+  const complex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy };
+  const complex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv };
 
-  const SNAcomplex dbx = { y * dr0invx, -x * dr0invx - r0inv };
-  const SNAcomplex dby = { y * dr0invy + r0inv, -x * dr0invy };
-  const SNAcomplex dbz = { y * dr0invz, -x * dr0invz };
+  const complex dbx = { y * dr0invx, -x * dr0invx - r0inv };
+  const complex dby = { y * dr0invy + r0inv, -x * dr0invy };
+  const complex dbz = { y * dr0invz, -x * dr0invz };
 
-  const double dsfacux = dsfac * ux;
-  const double dsfacuy = dsfac * uy;
-  const double dsfacuz = dsfac * uz;
+  const auto dsfacux = dsfac * ux;
+  const auto dsfacuy = dsfac * uy;
+  const auto dsfacuz = dsfac * uz;
 
-  CayleyKleinPack ckp{};
-  ckp.a = a;
-  ckp.b = b;
-  ckp.da[0] = dax;
-  ckp.db[0] = dbx;
-  ckp.da[1] = day;
-  ckp.db[1] = dby;
-  ckp.da[2] = daz;
-  ckp.db[2] = dbz;
-  ckp.sfac = sfac;
-  ckp.dsfacu[0] = dsfacux;
-  ckp.dsfacu[1] = dsfacuy;
-  ckp.dsfacu[2] = dsfacuz;
+  a_pack(iatom_mod,jnbor,iatom_div) = a;
+  b_pack(iatom_mod,jnbor,iatom_div) = b;
+
+  da_pack(iatom_mod,jnbor,iatom_div,0) = dax;
+  db_pack(iatom_mod,jnbor,iatom_div,0) = dbx;
+
+  da_pack(iatom_mod,jnbor,iatom_div,1) = day;
+  db_pack(iatom_mod,jnbor,iatom_div,1) = dby;
+
+  da_pack(iatom_mod,jnbor,iatom_div,2) = daz;
+  db_pack(iatom_mod,jnbor,iatom_div,2) = dbz;
+
+  sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac;
+  sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux;
+  sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy;
+  sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz;
+
+  // we need to explicitly zero `dedr` somewhere before hitting
+  // ComputeFusedDeidrj --- this is just a convenient place to do it.
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast<real_type>(0.);
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast<real_type>(0.);
+  dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast<real_type>(0.);
 
-  // Yes, this breaks the standard mantra of using SoA
-  // instead of AoS, but it's net fine because of the
-  // one warp per atom/neighbor pair for the recursive
-  // polynomials. There's good L1 reuse, anyway.
-  cayleyklein(iatom, jnbor) = ckp;
 }
 
 /* ----------------------------------------------------------------------
@@ -406,30 +445,28 @@ void SNAKokkos<DeviceType>::compute_cayley_klein(const int& iatom, const int& jn
    advantage of the symmetry of the Wigner U matrices.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
+void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div)
 {
+
   for (int jelem = 0; jelem < nelements; jelem++) {
-    for (int j = 0; j <= twojmax; j++) {
-      const int jju_half = idxu_half_block(j);
+    int jju_half = idxu_half_block(j);
 
-      // Only diagonal elements get initialized
-      // Top half only: gets symmetrized by TransformUi
-      // for (int m = 0; m < (j+1)*(j/2+1); m++)
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)),
-        [&] (const int m) {
+    // Only diagonal elements get initialized
+    // Top half only: gets symmetrized by TransformUi
 
-        const int jjup = jju_half + m;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int ma = 0; ma <= j; ma++) {
 
-        // if m is on the "diagonal", initialize it with the self energy.
-        // Otherwise zero it out
-        double re_part = 0.;
-        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; }
+        real_type re_part = static_cast<real_type>(0.);
+        if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; }
 
-        ulisttot_re(jjup, jelem, iatom) = re_part;
-        ulisttot_im(jjup, jelem, iatom) = 0.;
-      });
+        ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part;
+        ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast<real_type>(0.);
+
+        jju_half++;
+      }
     }
   }
 
@@ -440,9 +477,9 @@ void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>
    accumulating to the total. GPU only.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
 {
 
   // utot(j,ma,mb) = 0 for all j,ma,ma
@@ -452,119 +489,122 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
   //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
 
   // get shared memory offset
-  const int max_m_tile = (twojmax+1)*(twojmax/2+1);
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
   const int team_rank = team.team_rank();
-  const int scratch_shift = team_rank * max_m_tile;
+  const int scratch_shift = team_rank * tile_size;
 
-  // shared memory double buffer
-  // Each level stores a (j+1) x ((j+3)/2) tile in preparation
-  // for the computation on the next level. This is the "cached"
-  // data layout, which also gets used on the CPU.
-  SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
-  SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  // extract and wrap
+  WignerWrapper<real_type, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
-  // Each warp is accessing the same pack: take advantage of
-  // L1 reuse
-  const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
 
-  const SNAcomplex a = ckp.a;
-  const SNAcomplex b = ckp.b;
-  const double sfac = ckp.sfac;
+  // load parameters
+  const auto a = a_pack(iatom_mod, jnbor, iatom_div);
+  const auto b = b_pack(iatom_mod, jnbor, iatom_div);
+  const auto sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
 
-  const int jelem = element(iatom, jnbor);
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
 
-  // VMK Section 4.8.2
+  // we need to "choose" when to bend
+  // this for loop is here for context --- we expose additional
+  // parallelism over this loop instead
+  //for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
 
-  // All writes go to global memory and shared memory
-  // so we can avoid all global memory reads
-  Kokkos::single(Kokkos::PerThread(team), [=]() {
-    buf1[0] = {1.,0.};
-    Kokkos::atomic_add(&(ulisttot_re(0,jelem,iatom)), sfac);
-  });
+    // level 0 is just 1.
+    ulist_wrapper.set(0, complex::one());
 
-  for (int j = 1; j <= twojmax; j++) {
+    // j from before the bend, don't store, mb == 0
+    // this is "creeping up the side"
+    for (int j = 1; j <= j_bend; j++) {
 
-    const int jju = idxu_half_block[j];
+      int jjup = idxu_half_block[j-1];
 
-    // fill in left side of matrix layer from previous layer
+      constexpr int mb = 0; // intentional for readability, compiler should optimize this out
 
-    // Flatten loop over ma, mb
-    // for (int ma = 0; ma <= j; ma++)
-    const int n_ma = j+1;
-    // for (int mb = 0; 2*mb <= j; mb++)
-    const int n_mb = j/2+1;
+      complex ulist_accum = complex::zero();
 
-    const int total_iters = n_ma * n_mb;
+      int ma;
+      for (ma = 0; ma < j; ma++) {
+        const int jjup_index = idxu_half_block[j - 1] + mb * j + ma;
 
-    //for (int m = 0; m < total_iters; m++) {
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m) {
+        // grab the cached value
+        const complex ulist_prev = ulist_wrapper.get(ma);
 
-      // ma fast, mb slow
-      // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive.
-      const int mb = m / n_ma;
-      const int ma = m - mb * n_ma;
+        // ulist_accum += rootpq * a.conj() * ulist_prev;
+        real_type rootpq = rootpqarray(j - ma, j - mb);
+        ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im);
+        ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re);
 
-      // index into global memory array
-      const int jju_index = jju+m;
+        // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here
+        ulist_wrapper.set(ma, ulist_accum);
 
-      // index into shared memory buffer for this level
-      const int jju_shared_idx = m;
+        // next value
+        // ulist_accum = -rootpq * b.conj() * ulist_prev;
+        rootpq = rootpqarray(ma + 1, j - mb);
+        ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im);
+        ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
 
-      // index into shared memory buffer for next level
-      const int jjup_shared_idx = jju_shared_idx - mb;
-
-      SNAcomplex u_accum = {0., 0.};
-
-      // VMK recursion relation: grab contribution which is multiplied by b*
-      const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.));
-
-      // u_accum += conj(b) * u_up2
-      caconjxpy(b, u_up2, u_accum);
-
-      // VMK recursion relation: grab contribution which is multiplied by a*
-      const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.));
-
-      // u_accum += conj(a) * u_up1
-      caconjxpy(a, u_up1, u_accum);
-
-      // back up into shared memory for next iter
-      buf2[jju_shared_idx] = u_accum;
-
-      Kokkos::atomic_add(&(ulisttot_re(jju_index,jelem,iatom)), sfac * u_accum.re);
-      Kokkos::atomic_add(&(ulisttot_im(jju_index,jelem,iatom)), sfac * u_accum.im);
-
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
-      // if j is even (-> physical j integer), last element maps to self, skip
-
-      const int sign_factor = (((ma+mb)%2==0)?1:-1);
-      const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
-
-      if (sign_factor == 1) {
-        u_accum.im = -u_accum.im;
-      } else {
-        u_accum.re = -u_accum.re;
       }
 
-      if (j%2==1 && mb+1==n_mb) {
-        buf2[jju_shared_flip] = u_accum;
+      ulist_wrapper.set(ma, ulist_accum);
+    }
+
+    // now we're after the bend, start storing but only up to the "half way point"
+    const int j_half_way = MIN(2 * j_bend, twojmax);
+
+    int mb = 1;
+    int j; //= j_bend + 1; // need this value below
+    for (j = j_bend + 1; j <= j_half_way; j++) {
+
+      const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+      complex ulist_accum = complex::zero();
+
+      int ma;
+      for (ma = 0; ma < j; ma++) {
+
+        // grab the cached value
+        const complex ulist_prev = ulist_wrapper.get(ma);
+
+        // atomic add the previous level here
+        Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
+        Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
+
+        // ulist_accum += rootpq * b * ulist_prev;
+        real_type rootpq = rootpqarray(j - ma, mb);
+        ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im);
+        ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re);
+
+        // store ulist_accum
+        ulist_wrapper.set(ma, ulist_accum);
+
+        // next value
+        // ulist_accum = rootpq * a * ulist_prev;
+        rootpq = rootpqarray(ma + 1, mb);
+        ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im);
+        ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re);
       }
 
-      // symmetric part of ulisttot is generated in TransformUi
+      ulist_wrapper.set(ma, ulist_accum);
 
-    });
-    // In CUDA backend,
-    // ThreadVectorRange has a __syncwarp (appropriately masked for
-    // vector lengths < 32) implict at the end
+      mb++;
+    }
 
-    // swap double buffers
-    auto tmp = buf1; buf1 = buf2; buf2 = tmp;
+    // atomic add the last level
+    const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+    for (int ma = 0; ma < j; ma++) {
+      const complex ulist_prev = ulist_wrapper.get(ma);
+
+      // atomic add the previous level here
+      Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac);
+      Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac);
+    }
+
+  //} // end of "reference" loop over j_bend
 
 
-  }
 }
 
 
@@ -574,9 +614,9 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
    divergence. GPU version
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div)
 {
 
   const int j1 = idxz(jjz, 0);
@@ -589,14 +629,13 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
   const int na = idxz(jjz, 7);
   const int nb = idxz(jjz, 8);
 
-  const double* cgblock = cglist.data() + idxcg_block(j1, j2, j);
+  const real_type* cgblock = cglist.data() + idxcg_block(j1, j2, j);
 
   int idouble = 0;
 
   for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
-      double ztmp_r = 0.;
-      double ztmp_i = 0.;
+      complex ztmp = complex::zero();
 
       int jju1 = idxu_block[j1] + (j1+1)*mb1min;
       int jju2 = idxu_block[j2] + (j2+1)*mb2max;
@@ -615,12 +654,12 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
         #pragma unroll
         #endif
         for(int ia = 0; ia < na; ia++) {
-          const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div);
-          const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
+          const auto utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div);
+          const auto utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div);
           const auto cgcoeff_a = cgblock[icga];
           const auto cgcoeff_b = cgblock[icgb];
-          ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
-          ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
+          ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
+          ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re);
           ma1++;
           ma2--;
           icga += j2;
@@ -632,11 +671,11 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
       } // end loop over ib
 
       if (bnorm_flag) {
-        ztmp_r /= (j + 1);
-        ztmp_i /= (j + 1);
+        ztmp.re /= (j + 1);
+        ztmp.im /= (j + 1);
       }
 
-      zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i };
+      zlist_pack(iatom_mod,jjz,idouble,iatom_div) = ztmp;
 
       idouble++;
     }
@@ -649,9 +688,9 @@ void SNAKokkos<DeviceType>::compute_zi(const int& iatom_mod, const int& jjz, con
    divergence.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div)
 {
   // for j1 = 0,...,twojmax
   //   for j2 = 0,twojmax
@@ -712,10 +751,10 @@ void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, con
 
           const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div);
           const auto zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div);
-          sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im);
+          sumzu += static_cast<real_type>(0.5) * (utot.re * zloc.re + utot.im * zloc.im);
         } // end if jeven
 
-        sumzu *= 2.0;
+        sumzu *= static_cast<real_type>(2.0);
         if (bzero_flag) {
           if (!wselfall_flag) {
             if (elem1 == elem2 && elem1 == elem3) {
@@ -742,12 +781,12 @@ void SNAKokkos<DeviceType>::compute_bi(const int& iatom_mod, const int& jjb, con
    divergence. GPU version.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
- const Kokkos::View<F_FLOAT***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi(int iatom_mod, int jjz, int iatom_div,
+ const Kokkos::View<real_type***, Kokkos::LayoutLeft, DeviceType> &beta_pack)
 {
-  double betaj;
+  real_type betaj;
 
   const int j1 = idxz(jjz, 0);
   const int j2 = idxz(jjz, 1);
@@ -760,15 +799,15 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
   const int nb = idxz(jjz, 8);
   const int jju_half = idxz(jjz, 9);
 
-  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j);
   //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
   //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
 
   for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
 
-      double ztmp_r = 0.0;
-      double ztmp_i = 0.0;
+      real_type ztmp_r = 0.0;
+      real_type ztmp_i = 0.0;
 
       int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
       int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
@@ -787,8 +826,8 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
         #pragma unroll
         #endif
         for (int ia = 0; ia < na; ia++) {
-          const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div);
-          const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div);
+          const auto utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div);
+          const auto utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div);
           const auto cgcoeff_a = cgblock[icga];
           const auto cgcoeff_b = cgblock[icgb];
           ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im);
@@ -849,169 +888,173 @@ void SNAKokkos<DeviceType>::compute_yi(int iatom_mod, int jjz, int iatom_div,
    and accumulation into dEidRj. GPU only.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div)
 {
   // get shared memory offset
-  const int max_m_tile = (twojmax+1)*(twojmax/2+1);
+  // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer
+  const int tile_size = vector_length * (twojmax + 1);
+
   const int team_rank = team.team_rank();
-  const int scratch_shift = team_rank * max_m_tile;
-  const int jelem = element(iatom, jnbor);
+  const int scratch_shift = team_rank * tile_size;
 
-  // See notes on data layouts for shared memory caching
-  // in `compute_ui`.
+  // extract, wrap shared memory buffer
+  WignerWrapper<real_type, vector_length> ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
+  WignerWrapper<real_type, vector_length> dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod);
 
-  // double buffer for ulist
-  SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
-  SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  // load parameters
+  const auto a = a_pack(iatom_mod, jnbor, iatom_div);
+  const auto b = b_pack(iatom_mod, jnbor, iatom_div);
+  const auto da = da_pack(iatom_mod, jnbor, iatom_div, dir);
+  const auto db = db_pack(iatom_mod, jnbor, iatom_div, dir);
+  const auto sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0);
+  const auto dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u
 
-  // double buffer for dulist
-  SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
-  SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
+  const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor);
 
-  const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor);
+  auto dedr_full_sum = static_cast<real_type>(0.);
 
-  const SNAcomplex a = ckp.a;
-  const SNAcomplex b = ckp.b;
-  const SNAcomplex da = ckp.da[dir];
-  const SNAcomplex db = ckp.db[dir];
-  const double sfac = ckp.sfac;
-  const double dsfacu = ckp.dsfacu[dir]; // dsfac * u
+  // we need to "choose" when to bend
+  // this for loop is here for context --- we expose additional
+  // parallelism over this loop instead
+  //for (int j_bend = 0; j_bend <= twojmax; j_bend++) {
 
-  // Accumulate the full contribution to dedr on the fly
-  const SNAcomplex y_local = ylist(0, jelem, iatom);
+    // level 0 is just 1, 0
+    ulist_wrapper.set(0, complex::one());
+    dulist_wrapper.set(0, complex::zero());
 
-  // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
-  double dedr_full_sum = 0.5 * dsfacu * y_local.re;
+    // j from before the bend, don't store, mb == 0
+    // this is "creeping up the side"
+    for (int j = 1; j <= j_bend; j++) {
 
-  // single has a warp barrier at the end
-  Kokkos::single(Kokkos::PerThread(team), [=]() {
+      int jjup = idxu_half_block[j-1];
 
-    ulist_buf1[0] = {1., 0.};
-    dulist_buf1[0] = {0., 0.};
-  });
+      constexpr int mb = 0; // intentional for readability, compiler should optimize this out
 
-  for (int j = 1; j <= twojmax; j++) {
-    int jju_half = idxu_half_block[j];
+      complex ulist_accum = complex::zero();
+      complex dulist_accum = complex::zero();
 
-    // flatten the loop over ma,mb
+      int ma;
+      for (ma = 0; ma < j; ma++) {
+        const int jjup_index = idxu_half_block[j - 1] + mb * j + ma;
 
-    // for (int ma = 0; ma <= j; ma++)
-    const int n_ma = j+1;
-    // for (int mb = 0; 2*mb <= j; mb++)
-    const int n_mb = j/2+1;
+        // grab the cached value
+        const complex ulist_prev = ulist_wrapper.get(ma);
+        const complex dulist_prev = dulist_wrapper.get(ma);
 
-    const int total_iters = n_ma * n_mb;
+        // ulist_accum += rootpq * a.conj() * ulist_prev;
+        real_type rootpq = rootpqarray(j - ma, j - mb);
+        ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im);
+        ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re);
 
-    double dedr_sum = 0.; // j-local sum
+        // product rule of above
+        dulist_accum.re += rootpq * (da.re * ulist_prev.re + da.im * ulist_prev.im + a.re * dulist_prev.re + a.im * dulist_prev.im);
+        dulist_accum.im += rootpq * (da.re * ulist_prev.im - da.im * ulist_prev.re + a.re * dulist_prev.im - a.im * dulist_prev.re);
 
-    //for (int m = 0; m < total_iters; m++) {
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
-      [&] (const int m, double& sum_tmp) {
+        // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here
+        ulist_wrapper.set(ma, ulist_accum);
+        dulist_wrapper.set(ma, dulist_accum);
 
-      // ma fast, mb slow
-      const int mb = m / n_ma;
-      const int ma = m - mb * n_ma;
+        // next value
+        // ulist_accum = -rootpq * b.conj() * ulist_prev;
+        rootpq = rootpqarray(ma + 1, j - mb);
+        ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im);
+        ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re);
 
-      const int jju_half_index = jju_half+m;
+        // product rule of above
+        dulist_accum.re = -rootpq * (db.re * ulist_prev.re + db.im * ulist_prev.im + b.re * dulist_prev.re + b.im * dulist_prev.im);
+        dulist_accum.im = -rootpq * (db.re * ulist_prev.im - db.im * ulist_prev.re + b.re * dulist_prev.im - b.im * dulist_prev.re);
 
-      // Load y_local, apply the symmetry scaling factor
-      // The "secret" of the shared memory optimization is it eliminates
-      // all global memory reads to duidrj in lieu of caching values in
-      // shared memory and otherwise always writing, making the kernel
-      // ultimately compute bound. We take advantage of that by adding
-      // some reads back in.
-      auto y_local = ylist(jju_half_index, jelem, iatom);
-      if (j % 2 == 0 && 2*mb == j) {
-        if (ma == mb) { y_local = 0.5*y_local; }
-        else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
+      }
+
+      ulist_wrapper.set(ma, ulist_accum);
+      dulist_wrapper.set(ma, dulist_accum);
+    }
+
+    // now we're after the bend, start storing but only up to the "half way point"
+    const int j_half_way = MIN(2 * j_bend, twojmax);
+
+    int mb = 1;
+    int j; //= j_bend + 1; // need this value below
+    for (j = j_bend + 1; j <= j_half_way; j++) {
+
+      const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+      complex ulist_accum = complex::zero();
+      complex dulist_accum = complex::zero();
+
+      int ma;
+      for (ma = 0; ma < j; ma++) {
+
+        // grab y_local early
+        // this will never be the last element of a row, no need to rescale.
+        auto y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div));
+
+        // grab the cached value
+        const complex ulist_prev = ulist_wrapper.get(ma);
+        const complex dulist_prev = dulist_wrapper.get(ma);
+
+        // ulist_accum += rootpq * b * ulist_prev;
+        real_type rootpq = rootpqarray(j - ma, mb);
+        ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im);
+        ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re);
+
+        // product rule of above
+        dulist_accum.re += rootpq * (db.re * ulist_prev.re - db.im * ulist_prev.im + b.re * dulist_prev.re - b.im * dulist_prev.im);
+        dulist_accum.im += rootpq * (db.re * ulist_prev.im + db.im * ulist_prev.re + b.re * dulist_prev.im + b.im * dulist_prev.re);
+
+        // store ulist_accum
+        ulist_wrapper.set(ma, ulist_accum);
+        dulist_wrapper.set(ma, dulist_accum);
+
+        // Directly accumulate deidrj into sum_tmp
+        const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev);
+        dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im;
+
+        // next value
+        // ulist_accum = rootpq * a * ulist_prev;
+        rootpq = rootpqarray(ma + 1, mb);
+        ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im);
+        ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re);
+
+        // product rule of above
+        dulist_accum.re = rootpq * (da.re * ulist_prev.re - da.im * ulist_prev.im + a.re * dulist_prev.re - a.im * dulist_prev.im);
+        dulist_accum.im = rootpq * (da.re * ulist_prev.im + da.im * ulist_prev.re + a.re * dulist_prev.im + a.im * dulist_prev.re);
+
+      }
+
+      ulist_wrapper.set(ma, ulist_accum);
+      dulist_wrapper.set(ma, dulist_accum);
+
+      mb++;
+    }
+
+    // accumulate the last level
+    const int jjup = idxu_half_block[j-1] + (mb - 1) * j;
+
+    for (int ma = 0; ma < j; ma++) {
+      // grab y_local early
+      auto y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div));
+      if (j % 2 == 1 && 2*(mb-1) == j-1) { // double check me...
+        if (ma == (mb-1)) { y_local = static_cast<real_type>(0.5)*y_local; }
+        else if (ma > (mb-1)) { y_local.re = static_cast<real_type>(0.); y_local.im = static_cast<real_type>(0.); } // can probably avoid this outright
         // else the ma < mb gets "double counted", cancelling the 0.5.
       }
 
-      // index into shared memory
-      const int jju_shared_idx = m;
-      const int jjup_shared_idx = jju_shared_idx - mb;
-
-      // Need to compute and accumulate both u and du (mayhaps, we could probably
-      // balance some read and compute by reading u each time).
-      SNAcomplex u_accum = { 0., 0. };
-      SNAcomplex du_accum = { 0., 0. };
-
-      const double rootpq2 = -rootpqarray(ma, j - mb);
-      const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.));
-
-      // u_accum += conj(b) * u_up2
-      caconjxpy(b, u_up2, u_accum);
-
-      const double rootpq1 = rootpqarray(j - ma, j - mb);
-      const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.));
-
-      // u_accum += conj(a) * u_up1
-      caconjxpy(a, u_up1, u_accum);
-
-      // Next, spin up du_accum
-      const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.));
-
-      // du_accum += conj(da) * u_up1 + conj(a) * du_up1
-      caconjxpy(da, u_up1, du_accum);
-      caconjxpy(a, du_up1, du_accum);
-
-      const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.));
-
-      // du_accum += conj(db) * u_up2 + conj(b) * du_up2
-      caconjxpy(db, u_up2, du_accum);
-      caconjxpy(b, du_up2, du_accum);
-
-      // Cache u_accum, du_accum to scratch memory.
-      ulist_buf2[jju_shared_idx] = u_accum;
-      dulist_buf2[jju_shared_idx] = du_accum;
+      const complex ulist_prev = ulist_wrapper.get(ma);
+      const complex dulist_prev = dulist_wrapper.get(ma);
 
       // Directly accumulate deidrj into sum_tmp
-      const SNAcomplex du_prod = (dsfacu * u_accum) + (sfac * du_accum);
-      sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
+      const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev);
+      dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im;
 
-      // copy left side to right side with inversion symmetry VMK 4.4(2)
-      // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
-      if (j%2==1 && mb+1==n_mb) {
-        int sign_factor = (((ma+mb)%2==0)?1:-1);
+    }
+  //} // end reference loop over j_bend
 
-        const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
+  // dedr gets zeroed out at the start of each iteration in compute_cayley_klein
+  Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast<real_type>(2.0) * dedr_full_sum);
 
-        if (sign_factor == 1) {
-          u_accum.im = -u_accum.im;
-          du_accum.im = -du_accum.im;
-        } else {
-          u_accum.re = -u_accum.re;
-          du_accum.re = -du_accum.re;
-        }
-
-        // We don't need the second half of the tile for the deidrj accumulation.
-        // That's taken care of by the symmetry factor above.
-        // We do need it for ortho polynomial generation, though
-        ulist_buf2[jju_shared_flip] = u_accum;
-        dulist_buf2[jju_shared_flip] = du_accum;
-      }
-
-    }, dedr_sum);
-
-    // swap buffers
-    auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
-    tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
-
-    // Accumulate dedr. This "should" be in a single, but
-    // a Kokkos::single call implies a warp sync, and we may
-    // as well avoid that. This does no harm as long as the
-    // final assignment is in a single block.
-    //Kokkos::single(Kokkos::PerThread(team), [=]() {
-    dedr_full_sum += dedr_sum;
-    //});
-  }
-
-  // Store the accumulated dedr.
-  Kokkos::single(Kokkos::PerThread(team), [&] () {
-    dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
-  });
 }
 
 
@@ -1025,9 +1068,9 @@ void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPoli
    advantage of the symmetry of the Wigner U matrices.
  * ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
+void SNAKokkos<DeviceType, real_type, vector_length>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int& iatom, const int& ielem)
 {
   for (int jelem = 0; jelem < nelements; jelem++) {
     for (int j = 0; j <= twojmax; j++) {
@@ -1042,8 +1085,8 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
 
         // if m is on the "diagonal", initialize it with the self energy.
         // Otherwise zero it out
-        SNAcomplex init = {0., 0.};
-        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init = {wself, 0.0}; } //need to map iatom to element
+        complex init(static_cast<real_type>(0.),static_cast<real_type>(0.));
+        if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element
 
         ulisttot(jjup, jelem, iatom) = init;
       });
@@ -1059,11 +1102,11 @@ void SNAKokkos<DeviceType>::pre_ui_cpu(const typename Kokkos::TeamPolicy<DeviceT
    data layout comments.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
-  double rsq, r, x, y, z, z0, theta0;
+  real_type rsq, r, x, y, z, z0, theta0;
 
   // utot(j,ma,mb) = 0 for all j,ma,ma
   // utot(j,ma,ma) = 1 for all j,ma
@@ -1089,9 +1132,9 @@ void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<Dev
    compute Zi by summing over products of Ui, CPU version
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_zi_cpu(const int& iter)
 {
   const int iatom = iter / idxz_max;
   const int jjz = iter % idxz_max;
@@ -1106,22 +1149,22 @@ void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
   const int na = idxz(jjz, 7);
   const int nb = idxz(jjz, 8);
 
-  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j);
 
   int idouble = 0;
 
   for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
-      zlist(jjz, idouble, iatom).re = 0.0;
-      zlist(jjz, idouble, iatom).im = 0.0;
+      zlist(jjz, idouble, iatom).re = static_cast<real_type>(0.0);
+      zlist(jjz, idouble, iatom).im = static_cast<real_type>(0.0);
 
       int jju1 = idxu_block[j1] + (j1+1)*mb1min;
       int jju2 = idxu_block[j2] + (j2+1)*mb2max;
       int icgb = mb1min*(j2+1) + mb2max;
       for(int ib = 0; ib < nb; ib++) {
 
-        double suma1_r = 0.0;
-        double suma1_i = 0.0;
+        real_type suma1_r = static_cast<real_type>(0.0);
+        real_type suma1_i = static_cast<real_type>(0.0);
 
         int ma1 = ma1min;
         int ma2 = ma2max;
@@ -1158,9 +1201,9 @@ void SNAKokkos<DeviceType>::compute_zi_cpu(const int& iter)
    compute Bi by summing conj(Ui)*Zi, CPU version
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_bi_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom)
 {
   // for j1 = 0,...,twojmax
   //   for j2 = 0,twojmax
@@ -1186,11 +1229,11 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
 
           int jjz = idxz_block(j1, j2, j);
           int jju = idxu_block[j];
-          double sumzu = 0.0;
-          double sumzu_temp = 0.0;
+          real_type sumzu = static_cast<real_type>(0.0);
+          real_type sumzu_temp = static_cast<real_type>(0.0);
           const int bound = (j+2)/2;
           Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound),
-              [&] (const int mbma, double& sum) {
+              [&] (const int mbma, real_type& sum) {
               //for(int mb = 0; 2*mb < j; mb++)
                 //for(int ma = 0; ma <= j; ma++) {
               const int ma = mbma % (j + 1);
@@ -1209,7 +1252,7 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
           if (j%2 == 0) {
             const int mb = j/2;
             Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb),
-                [&] (const int ma, double& sum) {
+                [&] (const int ma, real_type& sum) {
             //for(int ma = 0; ma < mb; ma++) {
               const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
               const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
@@ -1222,13 +1265,13 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
             const int ma = mb;
             const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma;
             const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma;
-            sumzu += 0.5*
+            sumzu += static_cast<real_type>(0.5)*
               (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re +
                ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im);
           } // end if jeven
 
           Kokkos::single(Kokkos::PerThread(team), [&] () {
-            sumzu *= 2.0;
+            sumzu *= static_cast<real_type>(2.0);
 
             // apply bzero shift
 
@@ -1260,12 +1303,12 @@ void SNAKokkos<DeviceType>::compute_bi_cpu(const typename Kokkos::TeamPolicy<Dev
    CPU version
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
- const Kokkos::View<F_FLOAT**, DeviceType> &beta)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_yi_cpu(int iter,
+ const Kokkos::View<real_type**, DeviceType> &beta)
 {
-  double betaj;
+  real_type betaj;
   const int iatom = iter / idxz_max;
   const int jjz = iter % idxz_max;
 
@@ -1280,24 +1323,24 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
   const int nb = idxz(jjz, 8);
   const int jju_half = idxz(jjz, 9);
 
-  const double *cgblock = cglist.data() + idxcg_block(j1,j2,j);
+  const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j);
   //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
   //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
 
   for (int elem1 = 0; elem1 < nelements; elem1++) {
     for (int elem2 = 0; elem2 < nelements; elem2++) {
 
-      double ztmp_r = 0.0;
-      double ztmp_i = 0.0;
+      real_type ztmp_r = 0.0;
+      real_type ztmp_i = 0.0;
 
       int jju1 = idxu_block[j1] + (j1 + 1) * mb1min;
       int jju2 = idxu_block[j2] + (j2 + 1) * mb2max;
-      int icgb = mb1min * (j2 +1 ) + mb2max;
+      int icgb = mb1min * (j2 +1) + mb2max;
 
       for (int ib = 0; ib < nb; ib++) {
 
-        double suma1_r = 0.0;
-        double suma1_i = 0.0;
+        real_type suma1_r = 0.0;
+        real_type suma1_i = 0.0;
 
         int ma1 = ma1min;
         int ma2 = ma2max;
@@ -1368,22 +1411,21 @@ void SNAKokkos<DeviceType>::compute_yi_cpu(int iter,
    data layout
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
-  double rsq, r, x, y, z, z0, theta0, cs, sn;
-  double dz0dr;
+  real_type rsq, r, x, y, z, z0, theta0, cs, sn;
+  real_type dz0dr;
 
   x = rij(iatom,jnbor,0);
   y = rij(iatom,jnbor,1);
   z = rij(iatom,jnbor,2);
   rsq = x * x + y * y + z * z;
   r = sqrt(rsq);
-  double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
+  auto rscale0 = rfac0 * static_cast<real_type>(MY_PI) / (rcutij(iatom,jnbor) - rmin0);
   theta0 = (r - rmin0) * rscale0;
-  cs = cos(theta0);
-  sn = sin(theta0);
+  sincos_wrapper(theta0, &sn, &cs);
   z0 = r * cs / sn;
   dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
@@ -1400,16 +1442,16 @@ void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy
 ------------------------------------------------------------------------- */
 
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
 {
-  t_scalar3<double> final_sum;
+  t_scalar3<real_type> final_sum;
   const int jelem = element(iatom, jnbor);
 
   //for(int j = 0; j <= twojmax; j++) {
   Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1),
-      [&] (const int& j, t_scalar3<double>& sum_tmp) {
+      [&] (const int& j, t_scalar3<real_type>& sum_tmp) {
     int jju_half = idxu_half_block[j];
     int jju_cache = idxu_cache_block[j];
 
@@ -1467,12 +1509,12 @@ void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy
    of the symmetry of the Wigner U matrices.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
-                                          double r, double wj, double rcut, int jelem)
+void SNAKokkos<DeviceType, real_type, vector_length>::add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+                                          const real_type& r, const real_type& wj, const real_type& rcut, int jelem)
 {
-  const double sfac = compute_sfac(r, rcut) * wj;
+  const auto sfac = compute_sfac(r, rcut) * wj;
 
   Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1),
       [&] (const int& j) {
@@ -1497,19 +1539,18 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
    information stored between layers via scratch memory on the GPU path
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
-                         double x, double y, double z,
-                         double z0, double r)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+                         const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r)
 {
-  double r0inv;
-  double a_r, b_r, a_i, b_i;
-  double rootpq;
+  real_type r0inv;
+  real_type a_r, b_r, a_i, b_i;
+  real_type rootpq;
 
   // compute Cayley-Klein parameters for unit quaternion
 
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
+  r0inv = static_cast<real_type>(1.0) / sqrt(r * r + z0 * z0);
   a_r = r0inv * z0;
   a_i = -r0inv * z;
   b_r = r0inv * y;
@@ -1589,23 +1630,23 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
    Uses same cached data layout of ulist
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
-                          double x, double y, double z,
-                          double z0, double r, double dz0dr,
-                          double wj, double rcut)
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
+                          const real_type& x, const real_type& y, const real_type& z,
+                          const real_type& z0, const real_type& r, const real_type& dz0dr,
+                          const real_type& wj, const real_type& rcut)
 {
-double r0inv;
-  double a_r, a_i, b_r, b_i;
-  double da_r[3], da_i[3], db_r[3], db_i[3];
-  double dz0[3], dr0inv[3], dr0invdr;
-  double rootpq;
+  real_type r0inv;
+  real_type a_r, a_i, b_r, b_i;
+  real_type da_r[3], da_i[3], db_r[3], db_i[3];
+  real_type dz0[3], dr0inv[3], dr0invdr;
+  real_type rootpq;
 
-  double rinv = 1.0 / r;
-  double ux = x * rinv;
-  double uy = y * rinv;
-  double uz = z * rinv;
+  real_type rinv = 1.0 / r;
+  real_type ux = x * rinv;
+  real_type uy = y * rinv;
+  real_type uz = z * rinv;
 
   r0inv = 1.0 / sqrt(r * r + z0 * z0);
   a_r = z0 * r0inv;
@@ -1720,8 +1761,8 @@ double r0inv;
     });
   }
 
-  double sfac = compute_sfac(r, rcut);
-  double dsfac = compute_dsfac(r, rcut);
+  real_type sfac = compute_sfac(r, rcut);
+  real_type dsfac = compute_dsfac(r, rcut);
 
   sfac *= wj;
   dsfac *= wj;
@@ -1755,9 +1796,9 @@ double r0inv;
    factorial n, wrapper for precomputed table
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-double SNAKokkos<DeviceType>::factorial(int n)
+double SNAKokkos<DeviceType, real_type, vector_length>::factorial(int n)
 {
   //if (n < 0 || n > nmaxfactorial) {
   //  char str[128];
@@ -1772,8 +1813,8 @@ double SNAKokkos<DeviceType>::factorial(int n)
    factorial n table, size SNA::nmaxfactorial+1
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-const double SNAKokkos<DeviceType>::nfac_table[] = {
+template<class DeviceType, typename real_type, int vector_length>
+const double SNAKokkos<DeviceType, real_type, vector_length>::nfac_table[] = {
   1,
   1,
   2,
@@ -1948,9 +1989,9 @@ const double SNAKokkos<DeviceType>::nfac_table[] = {
    the function delta given by VMK Eq. 8.2(1)
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-double SNAKokkos<DeviceType>::deltacg(int j1, int j2, int j)
+double SNAKokkos<DeviceType, real_type, vector_length>::deltacg(int j1, int j2, int j)
 {
   double sfaccg = factorial((j1 + j2 + j) / 2 + 1);
   return sqrt(factorial((j1 + j2 - j) / 2) *
@@ -1963,9 +2004,9 @@ double SNAKokkos<DeviceType>::deltacg(int j1, int j2, int j)
    the quasi-binomial formula VMK 8.2.1(3)
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-void SNAKokkos<DeviceType>::init_clebsch_gordan()
+void SNAKokkos<DeviceType, real_type, vector_length>::init_clebsch_gordan()
 {
   auto h_cglist = Kokkos::create_mirror_view(cglist);
 
@@ -2033,23 +2074,23 @@ void SNAKokkos<DeviceType>::init_clebsch_gordan()
    the p = 0, q = 0 entries are allocated and skipped for convenience.
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-void SNAKokkos<DeviceType>::init_rootpqarray()
+void SNAKokkos<DeviceType, real_type, vector_length>::init_rootpqarray()
 {
   auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray);
   for (int p = 1; p <= twojmax; p++)
     for (int q = 1; q <= twojmax; q++)
-      h_rootpqarray(p,q) = sqrt(static_cast<double>(p)/q);
+      h_rootpqarray(p,q) = static_cast<real_type>(sqrt(static_cast<double>(p)/q));
   Kokkos::deep_copy(rootpqarray,h_rootpqarray);
 }
 
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 inline
-int SNAKokkos<DeviceType>::compute_ncoeff()
+int SNAKokkos<DeviceType, real_type, vector_length>::compute_ncoeff()
 {
   int ncount;
 
@@ -2070,88 +2111,72 @@ int SNAKokkos<DeviceType>::compute_ncoeff()
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-double SNAKokkos<DeviceType>::compute_sfac(double r, double rcut)
+real_type SNAKokkos<DeviceType, real_type, vector_length>::compute_sfac(real_type r, real_type rcut)
 {
-  if (switch_flag == 0) return 1.0;
+  constexpr real_type one = static_cast<real_type>(1.0);
+  constexpr real_type zero = static_cast<real_type>(0.0);
+  constexpr real_type onehalf = static_cast<real_type>(0.5);
+  if (switch_flag == 0) return one;
   if (switch_flag == 1) {
-    if(r <= rmin0) return 1.0;
-    else if(r > rcut) return 0.0;
+    if(r <= rmin0) return one;
+    else if(r > rcut) return zero;
     else {
-      double rcutfac = MY_PI / (rcut - rmin0);
-      return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
+      auto rcutfac = static_cast<real_type>(MY_PI) / (rcut - rmin0);
+      return onehalf * (cos((r - rmin0) * rcutfac) + one);
     }
   }
-  return 0.0;
+  return zero;
 }
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-double SNAKokkos<DeviceType>::compute_dsfac(double r, double rcut)
+real_type SNAKokkos<DeviceType, real_type, vector_length>::compute_dsfac(real_type r, real_type rcut)
 {
-  if (switch_flag == 0) return 0.0;
+  constexpr real_type zero = static_cast<real_type>(0.0);
+  constexpr real_type onehalf = static_cast<real_type>(0.5);
+  if (switch_flag == 0) return zero;
   if (switch_flag == 1) {
-    if(r <= rmin0) return 0.0;
-    else if(r > rcut) return 0.0;
+    if(r <= rmin0) return zero;
+    else if(r > rcut) return zero;
     else {
-      double rcutfac = MY_PI / (rcut - rmin0);
-      return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac;
+      auto rcutfac = static_cast<real_type>(MY_PI) / (rcut - rmin0);
+      return -onehalf * sin((r - rmin0) * rcutfac) * rcutfac;
     }
   }
-  return 0.0;
+  return zero;
 }
 
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_INLINE_FUNCTION
-void SNAKokkos<DeviceType>::compute_s_dsfac(const double r, const double rcut, double& sfac, double& dsfac) {
-  if (switch_flag == 0) { sfac = 0.; dsfac = 0.; }
+void SNAKokkos<DeviceType, real_type, vector_length>::compute_s_dsfac(const real_type r, const real_type rcut, real_type& sfac, real_type& dsfac) {
+  constexpr real_type one = static_cast<real_type>(1.0);
+  constexpr real_type zero = static_cast<real_type>(0.0);
+  constexpr real_type onehalf = static_cast<real_type>(0.5);
+  if (switch_flag == 0) { sfac = zero; dsfac = zero; }
   else if (switch_flag == 1) {
-    if (r <= rmin0) { sfac = 1.0; dsfac = 0.0; }
-    else if (r > rcut) { sfac = 0.; dsfac = 0.; }
+    if (r <= rmin0) { sfac = one; dsfac = zero; }
+    else if (r > rcut) { sfac = zero; dsfac = zero; }
     else {
-      const double rcutfac = MY_PI / (rcut - rmin0);
-      double sn, cs;
-      sincos((r - rmin0) * rcutfac, &sn, &cs);
-      sfac = 0.5 * (cs + 1.0);
-      dsfac = -0.5 * sn * rcutfac;
+      const auto rcutfac = static_cast<real_type>(MY_PI) / (rcut - rmin0);
+      real_type sn, cs;
+      sincos_wrapper((r - rmin0) * rcutfac, &sn, &cs); // need to create a wrapper
+      sfac = onehalf * (cs + one);
+      dsfac = -onehalf * sn * rcutfac;
 
     }
-  } else { sfac = 0.; dsfac = 0.; }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// efficient complex FMA (i.e., y += a x)
-template<class DeviceType>
-KOKKOS_FORCEINLINE_FUNCTION
-void SNAKokkos<DeviceType>::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
-  y.re += a.re * x.re;
-  y.re -= a.im * x.im;
-  y.im += a.im * x.re;
-  y.im += a.re * x.im;
-}
-
-/* ---------------------------------------------------------------------- */
-
-// efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x)
-template<class DeviceType>
-KOKKOS_FORCEINLINE_FUNCTION
-void SNAKokkos<DeviceType>::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) {
-  y.re += a.re * x.re;
-  y.re += a.im * x.im;
-  y.im -= a.im * x.re;
-  y.im += a.re * x.im;
+  } else { sfac = zero; dsfac = zero; }
 }
 
 /* ---------------------------------------------------------------------- */
 
 // set direction of batched Duidrj
-template<class DeviceType>
+template<class DeviceType, typename real_type, int vector_length>
 KOKKOS_FORCEINLINE_FUNCTION
-void SNAKokkos<DeviceType>::set_dir(int dir_) {
+void SNAKokkos<DeviceType, real_type, vector_length>::set_dir(int dir_) {
   dir = dir_;
 }
 
@@ -2159,8 +2184,8 @@ void SNAKokkos<DeviceType>::set_dir(int dir_) {
    memory usage of arrays
 ------------------------------------------------------------------------- */
 
-template<class DeviceType>
-double SNAKokkos<DeviceType>::memory_usage()
+template<class DeviceType, typename real_type, int vector_length>
+double SNAKokkos<DeviceType, real_type, vector_length>::memory_usage()
 {
   int jdimpq = twojmax + 2;
   int jdim = twojmax + 1;
@@ -2168,45 +2193,53 @@ double SNAKokkos<DeviceType>::memory_usage()
 
   bytes = 0;
 
-  bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
-  bytes += idxcg_max * sizeof(double);                   // cglist
+  bytes += jdimpq*jdimpq * sizeof(real_type);               // pqarray
+  bytes += idxcg_max * sizeof(real_type);                   // cglist
 
 #ifdef LMP_KOKKOS_GPU
   if (!host_flag) {
 
-    auto natom_pad = (natom+32-1)/32;
+    auto natom_pad = (natom+vector_length-1)/vector_length;
 
-    bytes += natom * idxu_half_max * nelements * sizeof(double);     // ulisttot_re
-    bytes += natom * idxu_half_max * nelements * sizeof(double);     // ulisttot_im
-    bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2;  // ulisttot_pack
+    bytes += natom_pad * nmax * sizeof(real_type) * 2;     // a_pack
+    bytes += natom_pad * nmax * sizeof(real_type) * 2;     // b_pack
+    bytes += natom_pad * nmax * 3 * sizeof(real_type) * 2; // da_pack
+    bytes += natom_pad * nmax * 3 * sizeof(real_type) * 2; // db_pack
+    bytes += natom_pad * nmax * 4 * sizeof(real_type);     // sfac_pack
 
-    bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2;   // zlist_pack
-    bytes += natom_pad * idxb_max * ntriples * sizeof(double);       // blist_pack
 
-    bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_re
-    bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_im
-    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type);     // ulisttot_re_pack
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type);     // ulisttot_im_pack
+    bytes += natom_pad * idxu_max * nelements * sizeof(real_type) * 2;      // ulisttot_pack
+
+    bytes += natom_pad * idxz_max * ndoubles * sizeof(real_type) * 2;   // zlist_pack
+    bytes += natom_pad * idxb_max * ntriples * sizeof(real_type);       // blist_pack
+
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ylist_pack_re
+    bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ylist_pack_im
   } else {
 #endif
 
-    bytes += natom * nmax * idxu_cache_max * sizeof(double) * 2;     // ulist
-    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ulisttot
+    bytes += natom * nmax * idxu_cache_max * sizeof(real_type) * 2;     // ulist
+    bytes += natom * idxu_half_max * nelements * sizeof(real_type) * 2; // ulisttot
+    bytes += natom * idxu_max * nelements * sizeof(real_type) * 2;      // ulisttot_full
 
-    bytes += natom * idxz_max * ndoubles * sizeof(double) * 2;       // zlist
-    bytes += natom * idxb_max * ntriples * sizeof(double);           // blist
+    bytes += natom * idxz_max * ndoubles * sizeof(real_type) * 2;       // zlist
+    bytes += natom * idxb_max * ntriples * sizeof(real_type);           // blist
 
-    bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist
+    bytes += natom * idxu_half_max * nelements * sizeof(real_type) * 2; // ylist
 
-    bytes += natom * nmax * idxu_cache_max * 3 * sizeof(double) * 2; // dulist
+    bytes += natom * nmax * idxu_cache_max * 3 * sizeof(real_type) * 2; // dulist
 #ifdef LMP_KOKKOS_GPU
   }
 #endif
 
-  bytes += natom * nmax * 3 * sizeof(double);            // dedr
+  bytes += natom * nmax * 3 * sizeof(real_type);            // dedr
 
   bytes += jdim * jdim * jdim * sizeof(int);             // idxcg_block
   bytes += jdim * sizeof(int);                           // idxu_block
   bytes += jdim * sizeof(int);                           // idxu_half_block
+  bytes += idxu_max * sizeof(FullHalfMapper);            // idxu_full_half
   bytes += jdim * sizeof(int);                           // idxu_cache_block
   bytes += jdim * jdim * jdim * sizeof(int);             // idxz_block
   bytes += jdim * jdim * jdim * sizeof(int);             // idxb_block
@@ -2214,12 +2247,12 @@ double SNAKokkos<DeviceType>::memory_usage()
   bytes += idxz_max * 10 * sizeof(int);                  // idxz
   bytes += idxb_max * 3 * sizeof(int);                   // idxb
 
-  bytes += jdim * sizeof(double);                        // bzero
+  bytes += jdim * sizeof(real_type);                        // bzero
 
-  bytes += natom * nmax * 3 * sizeof(double);            // rij
-  bytes += natom * nmax * sizeof(int);                   // inside
-  bytes += natom * nmax * sizeof(double);                // wj
-  bytes += natom * nmax * sizeof(double);                // rcutij
+  bytes += natom * nmax * 3 * sizeof(real_type);            // rij
+  bytes += natom * nmax * sizeof(real_type);                // inside
+  bytes += natom * nmax * sizeof(real_type);                // wj
+  bytes += natom * nmax * sizeof(real_type);                // rcutij
 
   return bytes;
 }
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index a7ea6de591..741c237ca8 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -657,7 +657,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
   chemflag = 0;
   bnormflag = 0;
   wselfallflag = 0;
-  chunksize = 2000;
+  chunksize = 4096;
 
   // open SNAP parameter file on proc 0