diff --git a/doc/src/compute_sna_atom.txt b/doc/src/compute_sna_atom.txt
index efbf2e9ea3..518d28aec9 100644
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@@ -24,12 +24,7 @@ twojmax = band limit for bispectrum components (non-negative integer) :l
 R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
 w_1, w_2,... = list of neighbor weights, one for each type  :l
 zero or more keyword/value pairs may be appended :l
-keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
-  {diagonal} value = {0} or {1} or {2} or {3}
-     {0} = all j1, j2, j <= twojmax, j2 <= j1
-     {1} = subset satisfying j1 == j2
-     {2} = subset satisfying j1 == j2 == j3
-     {3} = subset satisfying j2 <= j1 <= j
+keyword = {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
   {rmin0} value = parameter in distance to angle conversion (distance units)
   {switchflag} value = {0} or {1}
      {0} = do not use switching function
@@ -44,7 +39,7 @@ keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag
 
 [Examples:]
 
-compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 diagonal 3 rmin0 0.0
+compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 rmin0 0.0
 compute db all sna/atom 1.4 0.95 6 2.0 1.0
 compute vb all sna/atom 1.4 0.95 6 2.0 1.0 :pre
 
@@ -151,7 +146,7 @@ The argument {rfac0} and the optional keyword {rmin0} define the
 linear mapping from radial distance to polar angle {theta0} on the
 3-sphere.
 
-The argument {twojmax} and the keyword {diagonal} define which
+The argument {twojmax} defines which
 bispectrum components are generated. See section below on output for a
 detailed explanation of the number of bispectrum components and the
 ordered in which they are listed.
@@ -192,23 +187,18 @@ command that includes all pairs in the neighbor list.
 Compute {sna/atom} calculates a per-atom array, each column
 corresponding to a particular bispectrum component.  The total number
 of columns and the identity of the bispectrum component contained in
-each column depend on the values of {twojmax} and {diagonal}, as
+each column depend of the value of {twojmax}, as
 described by the following piece of python code:
 
 for j1 in range(0,twojmax+1):
-    if(diagonal==2):
-        print j1/2.,j1/2.,j1/2.
-    elif(diagonal==1):
-        for j in range(0,min(twojmax,2*j1)+1,2):
-            print j1/2.,j1/2.,j/2.
-    elif(diagonal==0):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                print j1/2.,j2/2.,j/2.
-    elif(diagonal==3):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                if (j>=j1): print j1/2.,j2/2.,j/2. :pre
+    for j2 in range(0,j1+1):
+        for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
+            if (j>=j1): print j1/2.,j2/2.,j/2. :pre
+
+NOTE: the {diagonal} keyword allowing other possible choices
+for the number of bispectrum components was removed in 2019, 
+since all potentials use the value of 3, corresponding to the
+above set of bispectrum components.
 
 Compute {snad/atom} evaluates a per-atom array. The columns are
 arranged into {ntypes} blocks, listed in order of atom type {I}.  Each
@@ -259,7 +249,7 @@ package"_Build_package.html doc page for more info.
 
 [Default:]
 
-The optional keyword defaults are {diagonal} = 0, {rmin0} = 0,
+The optional keyword defaults are {rmin0} = 0,
 {switchflag} = 1, {bzeroflag} = 1, {quadraticflag} = 0,
 
 :line
diff --git a/doc/src/pair_snap.txt b/doc/src/pair_snap.txt
index a796cfdeba..1fba74a188 100644
--- a/doc/src/pair_snap.txt
+++ b/doc/src/pair_snap.txt
@@ -38,7 +38,7 @@ where {B_k^i} is the {k}-th bispectrum component of atom {i},
 and {beta_k^alpha_i} is the corresponding linear coefficient
 that depends on {alpha_i}, the SNAP element of atom {i}. The
 number of bispectrum components used and their definitions
-depend on the values of {twojmax} and {diagonalstyle}
+depend on the value of {twojmax}
 defined in the SNAP parameter file described below.
 The bispectrum calculation is described in more detail
 in "compute sna/atom"_compute_sna_atom.html.
@@ -125,14 +125,13 @@ This line is followed by {ncoeff} coefficients, one per line.
 The SNAP parameter file can contain blank and comment lines (start
 with #) anywhere. Each non-blank non-comment line must contain one
 keyword/value pair. The required keywords are {rcutfac} and
-{twojmax}. Optional keywords are {rfac0}, {rmin0}, {diagonalstyle},
+{twojmax}. Optional keywords are {rfac0}, {rmin0},
 {switchflag}, and {bzeroflag}.
 
 The default values for these keywords are
 
 {rfac0} = 0.99363
 {rmin0} = 0.0
-{diagonalstyle} = 3
 {switchflag} = 0
 {bzeroflag} = 1
 {quadraticflag} = 1 :ul
@@ -144,6 +143,9 @@ If {quadraticflag} is set to 1, then the SNAP energy expression includes the qua
 The SNAP element file should contain {K}({K}+1)/2 additional coefficients
 for each element, the upper-triangular elements of alpha.
 
+NOTE: The previously used {diagonalstyle} keyword was removed in 2019,
+since all known SNAP potentials use the default value of 3.
+
 :line
 
 [Mixing, shift, table, tail correction, restart, rRESPA info]:
diff --git a/potentials/Ta06A.snapparam b/potentials/Ta06A.snapparam
index 283629d658..629d96d708 100644
--- a/potentials/Ta06A.snapparam
+++ b/potentials/Ta06A.snapparam
@@ -10,6 +10,5 @@ twojmax 6
 
 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
diff --git a/potentials/W_2940_2017_2.snapparam b/potentials/W_2940_2017_2.snapparam
index 27ab61a266..49f3094d08 100644
--- a/potentials/W_2940_2017_2.snapparam
+++ b/potentials/W_2940_2017_2.snapparam
@@ -8,6 +8,5 @@ twojmax 8
 
 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h
index bb2a5e9171..0ec4ed0995 100644
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@@ -85,9 +85,6 @@ void PairSNAPKokkos<DeviceType>::init_style()
   if (force->newton_pair == 0)
     error->all(FLERR,"Pair style SNAP requires newton pair on");
 
-  if (diagonalstyle != 3)
-    error->all(FLERR,"Must use diagonal style = 3 with pair snap/kk");
-
   // irequest = neigh request made by parent class
 
   neighflag = lmp->kokkos->neighflag;
@@ -343,23 +340,12 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
   Kokkos::deep_copy(d_coeffelem,h_coeffelem);
   Kokkos::deep_copy(d_map,h_map);
 
-  // deallocate non-kokkos sna
-
-  if (sna) {
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-    sna = NULL;
-  }
-
   // allocate memory for per OpenMP thread data which
   // is wrapped into the sna class
 
   snaKK = SNAKokkos<DeviceType>(rfac0,twojmax,
-                  diagonalstyle,use_shared_arrays,
                   rmin0,switchflag,bzeroflag);
-    //if (!use_shared_arrays)
-  snaKK.grow_rij(nmax);
+  snaKK.grow_rij(0);
   snaKK.init();
 }
 
@@ -667,8 +653,6 @@ double PairSNAPKokkos<DeviceType>::memory_usage()
   int n = atom->ntypes+1;
   bytes += n*n*sizeof(int);
   bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
   bytes += (2*ncoeffall)*sizeof(double);
   bytes += (ncoeff*3)*sizeof(double);
   bytes += snaKK.memory_usage();
diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h
index 7a80b262b7..40e5fe0ad4 100644
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@@ -48,7 +48,7 @@ inline
   SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);
 
 inline
-  SNAKokkos(double, int, int, int, double, int, int);
+  SNAKokkos(double, int, double, int, int);
 
   KOKKOS_INLINE_FUNCTION
   ~SNAKokkos();
@@ -178,12 +178,6 @@ inline
                        double, double, double, // compute_duidrj
                        double, double, double, double, double);
 
-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
   // Sets the style for the switching function
   // 0 = none
   // 1 = cosine
diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h
index 0f2a450a3d..c43003af97 100644
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@@ -27,19 +27,17 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 template<class DeviceType>
 inline
 SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in,
          double rmin0_in, int switch_flag_in, int bzero_flag_in)
 {
   wself = 1.0;
 
-  use_shared_arrays = use_shared_arrays_in;
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
   switch_flag = switch_flag_in;
   bzero_flag = bzero_flag_in;
 
   twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;
 
   ncoeff = compute_ncoeff();
 
@@ -70,14 +68,12 @@ KOKKOS_INLINE_FUNCTION
 SNAKokkos<DeviceType>::SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
   wself = sna.wself;
 
-  use_shared_arrays = sna.use_shared_arrays;
   rfac0 = sna.rfac0;
   rmin0 = sna.rmin0;
   switch_flag = sna.switch_flag;
   bzero_flag = sna.bzero_flag;
 
   twojmax = sna.twojmax;
-  diagonalstyle = sna.diagonalstyle;
 
   ncoeff = sna.ncoeff;
   nmax = sna.nmax;
@@ -104,48 +100,45 @@ template<class DeviceType>
 inline
 void SNAKokkos<DeviceType>::build_indexlist()
 {
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-    int idxj_full_count = 0;
+  int idxj_count = 0;
+  int idxj_full_count = 0;
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) idxj_count++;
-          idxj_full_count++;
-        }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) idxj_count++;
+	idxj_full_count++;
+      }
 
-    // indexList can be changed here
+  // indexList can be changed here
 
-    idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
-    idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
-    auto h_idxj = Kokkos::create_mirror_view(idxj);
-    auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
+  idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
+  idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
+  auto h_idxj = Kokkos::create_mirror_view(idxj);
+  auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
 
-    idxj_max = idxj_count;
-    idxj_full_max = idxj_full_count;
+  idxj_max = idxj_count;
+  idxj_full_max = idxj_full_count;
 
-    idxj_count = 0;
-    idxj_full_count = 0;
+  idxj_count = 0;
+  idxj_full_count = 0;
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) {
-            h_idxj[idxj_count].j1 = j1;
-            h_idxj[idxj_count].j2 = j2;
-            h_idxj[idxj_count].j = j;
-            idxj_count++;
-          }
-          h_idxj_full[idxj_full_count].j1 = j1;
-          h_idxj_full[idxj_full_count].j2 = j2;
-          h_idxj_full[idxj_full_count].j = j;
-          idxj_full_count++;
-        }
-    Kokkos::deep_copy(idxj,h_idxj);
-    Kokkos::deep_copy(idxj_full,h_idxj_full);
-
-  }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) {
+	  h_idxj[idxj_count].j1 = j1;
+	  h_idxj[idxj_count].j2 = j2;
+	  h_idxj[idxj_count].j = j;
+	  idxj_count++;
+	}
+	h_idxj_full[idxj_full_count].j1 = j1;
+	h_idxj_full[idxj_full_count].j2 = j2;
+	h_idxj_full[idxj_full_count].j = j;
+	idxj_full_count++;
+      }
+  Kokkos::deep_copy(idxj,h_idxj);
+  Kokkos::deep_copy(idxj_full,h_idxj_full);
 
 }
 /* ---------------------------------------------------------------------- */
@@ -1223,26 +1216,10 @@ int SNAKokkos<DeviceType>::compute_ncoeff()
   ncount = 0;
 
   for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
+    for (int j2 = 0; j2 <= j1; j2++)
       for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
-
+	   j <= MIN(twojmax, j1 + j2); j += 2)
+	if (j >= j1) ncount++;
   return ncount;
 }
 
diff --git a/src/SNAP/compute_sna_atom.cpp b/src/SNAP/compute_sna_atom.cpp
index 5ca63a7e85..cc7a84281e 100644
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -45,7 +44,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -85,14 +83,7 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      diagonalstyle = atoi(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute sna/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -115,28 +106,16 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute sna/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   size_peratom_cols = ncoeff;
   if (quadraticflag) size_peratom_cols += (ncoeff*(ncoeff+1))/2;
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   sna = NULL;
-
 }
 
 /* ---------------------------------------------------------------------- */
@@ -147,9 +126,7 @@ ComputeSNAAtom::~ComputeSNAAtom()
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -176,13 +153,7 @@ void ComputeSNAAtom::init()
     if (strcmp(modify->compute[i]->style,"sna/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute sna/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -223,11 +194,7 @@ void ComputeSNAAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -241,7 +208,7 @@ void ComputeSNAAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -258,26 +225,25 @@ void ComputeSNAAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype] && rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
-      snaptr[tid]->compute_bi();
-      snaptr[tid]->copy_bi2bvec();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
+      snaptr->compute_bi();
       for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-        sna[i][icoeff] = snaptr[tid]->bvec[icoeff];
+        sna[i][icoeff] = snaptr->blist[icoeff];
       if (quadraticflag) {
         int ncount = ncoeff;
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bi = snaptr[tid]->bvec[icoeff];
+          double bi = snaptr->blist[icoeff];
 
           // diagonal element of quadratic matrix
 
@@ -286,7 +252,7 @@ void ComputeSNAAtom::compute_peratom()
           // upper-triangular elements of quadratic matrix
 
           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++)
-            sna[i][ncount++] = bi*snaptr[tid]->bvec[jcoeff];
+            sna[i][ncount++] = bi*snaptr->blist[jcoeff];
         }
       }
     } else {
@@ -302,10 +268,9 @@ void ComputeSNAAtom::compute_peratom()
 
 double ComputeSNAAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+  double bytes = nmax*size_peratom_cols * sizeof(double); // sna
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
 
diff --git a/src/SNAP/compute_sna_atom.h b/src/SNAP/compute_sna_atom.h
index 2f6fb18996..105a62a37a 100644
--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@@ -34,7 +34,7 @@ class ComputeSNAAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff;
   double **cutsq;
   class NeighList *list;
@@ -42,10 +42,9 @@ class ComputeSNAAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   double cutmax;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/compute_snad_atom.cpp b/src/SNAP/compute_snad_atom.cpp
index b0395d5317..37587a0aae 100644
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -45,7 +44,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -83,14 +81,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute snad/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -113,20 +104,10 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snad/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   nperdim = ncoeff;
   if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
   yoffset = nperdim;
@@ -136,9 +117,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   snad = NULL;
-
 }
 
 /* ---------------------------------------------------------------------- */
@@ -149,9 +128,7 @@ ComputeSNADAtom::~ComputeSNADAtom()
   memory->destroy(radelem);
   memory->destroy(wjelem);
   memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -178,13 +155,7 @@ void ComputeSNADAtom::init()
     if (strcmp(modify->compute[i]->style,"snad/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute snad/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -235,11 +206,7 @@ void ComputeSNADAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -258,7 +225,7 @@ void ComputeSNADAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -276,30 +243,28 @@ void ComputeSNADAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
       if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        const int j = snaptr->inside[jj];
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
 
         // Accumulate -dBi/dRi, -dBi/dRj
 
@@ -307,12 +272,12 @@ void ComputeSNADAtom::compute_peratom()
         double *snadj = snad[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snadi[icoeff] += snaptr[tid]->dbvec[icoeff][0];
-          snadi[icoeff+yoffset] += snaptr[tid]->dbvec[icoeff][1];
-          snadi[icoeff+zoffset] += snaptr[tid]->dbvec[icoeff][2];
-          snadj[icoeff] -= snaptr[tid]->dbvec[icoeff][0];
-          snadj[icoeff+yoffset] -= snaptr[tid]->dbvec[icoeff][1];
-          snadj[icoeff+zoffset] -= snaptr[tid]->dbvec[icoeff][2];
+          snadi[icoeff] += snaptr->dblist[icoeff][0];
+          snadi[icoeff+yoffset] += snaptr->dblist[icoeff][1];
+          snadi[icoeff+zoffset] += snaptr->dblist[icoeff][2];
+          snadj[icoeff] -= snaptr->dblist[icoeff][0];
+          snadj[icoeff+yoffset] -= snaptr->dblist[icoeff][1];
+          snadj[icoeff+zoffset] -= snaptr->dblist[icoeff][2];
         }
 
         if (quadraticflag) {
@@ -321,10 +286,10 @@ void ComputeSNADAtom::compute_peratom()
           snadj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];
 
             // diagonal elements of quadratic matrix
 
@@ -343,12 +308,12 @@ void ComputeSNADAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];
 
               snadi[ncount] +=         dbxtmp;
               snadi[ncount+yoffset] += dbytmp;
@@ -404,10 +369,9 @@ void ComputeSNADAtom::unpack_reverse_comm(int n, int *list, double *buf)
 
 double ComputeSNADAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 3*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snad
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
diff --git a/src/SNAP/compute_snad_atom.h b/src/SNAP/compute_snad_atom.h
index 92003a9bc5..ac353d8553 100644
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@@ -36,7 +36,7 @@ class ComputeSNADAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff, nperdim, yoffset, zoffset;
   double **cutsq;
   class NeighList *list;
@@ -44,10 +44,9 @@ class ComputeSNADAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   double cutmax;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/compute_snav_atom.cpp b/src/SNAP/compute_snav_atom.cpp
index b2d555f713..1f702496ed 100644
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"
 
 using namespace LAMMPS_NS;
 
@@ -45,7 +44,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
 
   // default values
 
-  diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
   bzeroflag = 1;
@@ -79,14 +77,7 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   int iarg = nargmin;
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
       if (iarg+2 > narg)
         error->all(FLERR,"Illegal compute snav/atom command");
       rmin0 = atof(arg[iarg+1]);
@@ -109,20 +100,10 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
     } else error->all(FLERR,"Illegal compute snav/atom command");
   }
 
-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);
 
-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
   nperdim = ncoeff;
   if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
   size_peratom_cols = 6*nperdim*atom->ntypes;
@@ -130,7 +111,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   peratom_flag = 1;
 
   nmax = 0;
-  njmax = 0;
   snav = NULL;
 
 }
@@ -144,9 +124,7 @@ ComputeSNAVAtom::~ComputeSNAVAtom()
   memory->destroy(wjelem);
   memory->destroy(cutsq);
 
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -174,13 +152,7 @@ void ComputeSNAVAtom::init()
     if (strcmp(modify->compute[i]->style,"snav/atom") == 0) count++;
   if (count > 1 && comm->me == 0)
     error->warning(FLERR,"More than one compute snav/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -230,11 +202,7 @@ void ComputeSNAVAtom::compute_peratom()
   double** const x = atom->x;
   const int* const mask = atom->mask;
 
-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
   for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
     const int i = ilist[ii];
     if (mask[i] & groupbit) {
 
@@ -251,7 +219,7 @@ void ComputeSNAVAtom::compute_peratom()
 
       // insure rij, inside, and typej  are of size jnum
 
-          snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);
 
       // rij[][3] = displacements between atom I and those neighbors
       // inside = indices of neighbors of I within cutoff
@@ -269,31 +237,29 @@ void ComputeSNAVAtom::compute_peratom()
         const double rsq = delx*delx + dely*dely + delz*delz;
         int jtype = type[j];
         if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
           ninside++;
         }
       }
 
-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
       if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
       }
 
       for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
+        const int j = snaptr->inside[jj];
 
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();
 
         // Accumulate -dBi/dRi*Ri, -dBi/dRj*Rj
 
@@ -301,18 +267,18 @@ void ComputeSNAVAtom::compute_peratom()
         double *snavj = snav[j]+typeoffset;
 
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snavi[icoeff]           += snaptr[tid]->dbvec[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr[tid]->dbvec[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr[tid]->dbvec[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ytmp;
-          snavj[icoeff]           -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
+          snavi[icoeff]           += snaptr->dblist[icoeff][0]*xtmp;
+          snavi[icoeff+nperdim]   += snaptr->dblist[icoeff][1]*ytmp;
+          snavi[icoeff+2*nperdim] += snaptr->dblist[icoeff][2]*ztmp;
+          snavi[icoeff+3*nperdim] += snaptr->dblist[icoeff][1]*ztmp;
+          snavi[icoeff+4*nperdim] += snaptr->dblist[icoeff][0]*ztmp;
+          snavi[icoeff+5*nperdim] += snaptr->dblist[icoeff][0]*ytmp;
+          snavj[icoeff]           -= snaptr->dblist[icoeff][0]*x[j][0];
+          snavj[icoeff+nperdim]   -= snaptr->dblist[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nperdim] -= snaptr->dblist[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nperdim] -= snaptr->dblist[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nperdim] -= snaptr->dblist[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nperdim] -= snaptr->dblist[icoeff][0]*x[j][1];
         }
 
         if (quadraticflag) {
@@ -321,10 +287,10 @@ void ComputeSNAVAtom::compute_peratom()
           snavj += quadraticoffset;
           int ncount = 0;
           for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];
 
             // diagonal element of quadratic matrix
 
@@ -348,12 +314,12 @@ void ComputeSNAVAtom::compute_peratom()
             // upper-triangular elements of quadratic matrix
 
             for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];
               snavi[ncount] +=           dbxtmp*xtmp;
               snavi[ncount+nperdim] +=   dbytmp*ytmp;
               snavi[ncount+2*nperdim] += dbztmp*ztmp;
@@ -414,11 +380,8 @@ void ComputeSNAVAtom::unpack_reverse_comm(int n, int *list, double *buf)
 
 double ComputeSNAVAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 6*nperdim*atom->ntypes;
-  if (quadraticflag) bytes += 6*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snav
+  bytes += snaptr->memory_usage();                        // SNA object
+
   return bytes;
 }
diff --git a/src/SNAP/compute_snav_atom.h b/src/SNAP/compute_snav_atom.h
index 9be5e1d389..9df17cc667 100644
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@@ -36,7 +36,7 @@ class ComputeSNAVAtom : public Compute {
   double memory_usage();
 
  private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
   int ncoeff, nperdim;
   double **cutsq;
   class NeighList *list;
@@ -44,9 +44,8 @@ class ComputeSNAVAtom : public Compute {
   double rcutfac;
   double *radelem;
   double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
   int quadraticflag;
-  int nthreads;
 };
 
 }
diff --git a/src/SNAP/openmp_snap.h b/src/SNAP/openmp_snap.h
deleted file mode 100644
index 60a3138c9c..0000000000
--- a/src/SNAP/openmp_snap.h
+++ /dev/null
@@ -1,16 +0,0 @@
-
-#ifndef LMP_OPENMP_SNAP_H
-#define LMP_OPENMP_SNAP_H
-
-#if defined(_OPENMP)
-#include <omp.h>
-#else
-enum omp_sched_t {omp_sched_static, omp_sched_dynamic, omp_sched_guided, omp_sched_auto};
-inline int omp_get_thread_num() { return 0;}
-inline int omp_set_num_threads(int num_threads) {return 1;}
-/* inline int __sync_fetch_and_add(int* ptr, int value) {int tmp = *ptr; ptr[0]+=value; return tmp;} */
-inline void omp_set_schedule(omp_sched_t schedule,int modifier=1) {}
-inline int omp_in_parallel() {return 0;}
-#endif
-
-#endif
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 8b547e6e73..6eb05f85a4 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -23,7 +23,6 @@
 #include "neigh_list.h"
 #include "neigh_request.h"
 #include "sna.h"
-#include "openmp_snap.h"
 #include "domain.h"
 #include "memory.h"
 #include "error.h"
@@ -35,10 +34,6 @@ using namespace LAMMPS_NS;
 #define MAXLINE 1024
 #define MAXWORD 3
 
-// Outstanding issues with quadratic term
-// 1. there seems to a problem with compute_optimized energy calc
-// it does not match compute_regular, even when quadratic coeffs = 0
-
 /* ---------------------------------------------------------------------- */
 
 PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
@@ -54,53 +49,9 @@ PairSNAP::PairSNAP(LAMMPS *lmp) : Pair(lmp)
   wjelem = NULL;
   coeffelem = NULL;
 
-  nmax = 0;
-  nthreads = 1;
-
-  schedule_user = 0;
-  schedule_time_guided = -1;
-  schedule_time_dynamic = -1;
-  ncalls_neigh =-1;
-
-  ilistmask_max = 0;
-  ilistmask = NULL;
-  ghostinum = 0;
-  ghostilist_max = 0;
-  ghostilist = NULL;
-  ghostnumneigh_max = 0;
-  ghostnumneigh = NULL;
-  ghostneighs = NULL;
-  ghostfirstneigh = NULL;
-  ghostneighs_total = 0;
-  ghostneighs_max = 0;
-
-  i_max = 0;
-  i_neighmax = 0;
-  i_numpairs = 0;
-  i_rij = NULL;
-  i_inside = NULL;
-  i_wj = NULL;
-  i_rcutij = NULL;
-  i_ninside = NULL;
-  i_pairs = NULL;
-  i_uarraytot_r = NULL;
-  i_uarraytot_i = NULL;
-  i_zarray_r = NULL;
-  i_zarray_i =NULL;
-
-  use_shared_arrays = 0;
-
-#ifdef TIMING_INFO
-  timers[0] = 0;
-  timers[1] = 0;
-  timers[2] = 0;
-  timers[3] = 0;
-#endif
-
-  // Need to set this because restart not handled by PairHybrid
-
-  sna = NULL;
-
+  beta_max = 0;
+  beta = NULL;
+  bispectrum = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -118,35 +69,10 @@ PairSNAP::~PairSNAP()
     memory->destroy(coeffelem);
   }
 
-  // Need to set this because restart not handled by PairHybrid
+  memory->destroy(beta);
+  memory->destroy(bispectrum);
 
-  if (sna) {
-
-#ifdef TIMING_INFO
-    double time[5];
-    double timeave[5];
-    double timeave_mpi[5];
-    double timemax_mpi[5];
-
-    for (int i = 0; i < 5; i++) {
-      time[i] = 0;
-      timeave[i] = 0;
-      for (int tid = 0; tid<nthreads; tid++) {
-        if (sna[tid]->timers[i]>time[i])
-          time[i] = sna[tid]->timers[i];
-        timeave[i] += sna[tid]->timers[i];
-      }
-      timeave[i] /= nthreads;
-    }
-    MPI_Reduce(timeave, timeave_mpi, 5, MPI_DOUBLE, MPI_SUM, 0, world);
-    MPI_Reduce(time, timemax_mpi, 5, MPI_DOUBLE, MPI_MAX, 0, world);
-#endif
-
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-
-  }
+  delete snaptr;
 
   if (allocated) {
     memory->destroy(setflag);
@@ -156,19 +82,11 @@ PairSNAP::~PairSNAP()
 
 }
 
-void PairSNAP::compute(int eflag, int vflag)
-{
-  if (use_optimized)
-    compute_optimized(eflag, vflag);
-  else
-    compute_regular(eflag, vflag);
-}
-
 /* ----------------------------------------------------------------------
    This version is a straightforward implementation
    ---------------------------------------------------------------------- */
 
-void PairSNAP::compute_regular(int eflag, int vflag)
+void PairSNAP::compute(int eflag, int vflag)
 {
   int i,j,jnum,ninside;
   double delx,dely,delz,evdwl,rsq;
@@ -183,7 +101,18 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int newton_pair = force->newton_pair;
-  class SNA* snaptr = sna[0];
+
+  if (beta_max < list->inum) {
+    memory->grow(beta,list->inum,ncoeff,"PairSNAP:beta");
+    memory->grow(bispectrum,list->inum,ncoeff,"PairSNAP:bispectrum");
+    beta_max = list->inum;
+  }
+
+  // compute dE_i/dB_i = beta_i for all i in list
+
+  if (quadraticflag || eflag) 
+    compute_bispectrum();
+  compute_beta();
 
   numneigh = list->numneigh;
   firstneigh = list->firstneigh;
@@ -233,66 +162,22 @@ void PairSNAP::compute_regular(int eflag, int vflag)
       }
     }
 
-    // compute Ui, Zi, and Bi for atom I
+    // compute Ui, Yi for atom I
 
     snaptr->compute_ui(ninside);
-    snaptr->compute_zi();
-    if (quadraticflag) {
-      snaptr->compute_bi();
-      snaptr->copy_bi2bvec();
-    }
 
     // for neighbors of I within cutoff:
-    // compute dUi/drj and dBi/drj
-    // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
+    // compute Fij = dEi/dRj = -dEi/dRi 
+    // add to Fi, subtract from Fj
 
-    double* coeffi = coeffelem[ielem];
+    snaptr->compute_yi(beta[ii]);
 
     for (int jj = 0; jj < ninside; jj++) {
       int j = snaptr->inside[jj];
       snaptr->compute_duidrj(snaptr->rij[jj],
                              snaptr->wj[jj],snaptr->rcutij[jj]);
 
-      snaptr->compute_dbidrj();
-      snaptr->copy_dbi2dbvec();
-
-      fij[0] = 0.0;
-      fij[1] = 0.0;
-      fij[2] = 0.0;
-
-      // linear contributions
-
-      for (int k = 1; k <= ncoeff; k++) {
-        double bgb = coeffi[k];
-        fij[0] += bgb*snaptr->dbvec[k-1][0];
-        fij[1] += bgb*snaptr->dbvec[k-1][1];
-        fij[2] += bgb*snaptr->dbvec[k-1][2];
-      }
-
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = snaptr->bvec[icoeff];
-          double fack = coeffi[k]*bveci;
-          double* dbveci = snaptr->dbvec[icoeff];
-          fij[0] += fack*dbveci[0];
-          fij[1] += fack*dbveci[1];
-          fij[2] += fack*dbveci[2];
-          k++;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            double facki = coeffi[k]*bveci;
-            double fackj = coeffi[k]*snaptr->bvec[jcoeff];
-            double* dbvecj = snaptr->dbvec[jcoeff];
-
-            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-            k++;
-          }
-        }
-      }
+      snaptr->compute_deidrj(fij);
 
       f[i][0] += fij[0];
       f[i][1] += fij[1];
@@ -316,31 +201,26 @@ void PairSNAP::compute_regular(int eflag, int vflag)
 
       // evdwl = energy of atom I, sum over coeffs_k * Bi_k
 
+      double* coeffi = coeffelem[ielem];
       evdwl = coeffi[0];
-      if (!quadraticflag) {
-        snaptr->compute_bi();
-        snaptr->copy_bi2bvec();
-      }
 
       // E = beta.B + 0.5*B^t.alpha.B
-      // coeff[k] = beta[k-1] or
-      // coeff[k] = alpha_ii or
-      // coeff[k] = alpha_ij = alpha_ji, j != i
 
       // linear contributions
 
-      for (int k = 1; k <= ncoeff; k++)
-        evdwl += coeffi[k]*snaptr->bvec[k-1];
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        evdwl += coeffi[icoeff+1]*bispectrum[ii][icoeff];
 
       // quadratic contributions
 
       if (quadraticflag) {
         int k = ncoeff+1;
         for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = snaptr->bvec[icoeff];
+          double bveci = bispectrum[ii][icoeff];
           evdwl += 0.5*coeffi[k++]*bveci*bveci;
           for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            evdwl += coeffi[k++]*bveci*snaptr->bvec[jcoeff];
+            double bvecj = bispectrum[ii][jcoeff];
+            evdwl += coeffi[k++]*bveci*bvecj;
           }
         }
       }
@@ -352,948 +232,106 @@ void PairSNAP::compute_regular(int eflag, int vflag)
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
+/* ----------------------------------------------------------------------
+   compute beta
+------------------------------------------------------------------------- */
+
+void PairSNAP::compute_beta()
+{
+  int i;
+  int *type = atom->type;
+
+  for (int ii = 0; ii < list->inum; ii++) {
+    i = list->ilist[ii];
+    const int itype = type[i];
+    const int ielem = map[itype];
+    double* coeffi = coeffelem[ielem];
+
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      beta[ii][icoeff] = coeffi[icoeff+1];
+
+    if (quadraticflag) {
+      int k = ncoeff+1;
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+        double bveci = bispectrum[ii][icoeff];
+        beta[ii][icoeff] += coeffi[k]*bveci;
+        k++;
+        for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+          double bvecj = bispectrum[ii][jcoeff];
+          beta[ii][icoeff] += coeffi[k]*bvecj;
+          beta[ii][jcoeff] += coeffi[k]*bveci;
+          k++;
+        }
+      }
+    }
+  }
+}
 
 /* ----------------------------------------------------------------------
-   This version is optimized for threading, micro-load balancing
-   ---------------------------------------------------------------------- */
+   compute bispectrum
+------------------------------------------------------------------------- */
 
-void PairSNAP::compute_optimized(int eflag, int vflag)
+void PairSNAP::compute_bispectrum()
 {
-  // if reneighboring took place do load_balance if requested
-  if (do_load_balance > 0 &&
-      (neighbor->ncalls != ncalls_neigh)) {
-    ghostinum = 0;
-    // reset local ghost neighbor lists
-    ncalls_neigh = neighbor->ncalls;
-    if (ilistmask_max < list->inum) {
-      memory->grow(ilistmask,list->inum,"PairSnap::ilistmask");
-      ilistmask_max = list->inum;
-    }
-    for (int i = 0; i < list->inum; i++)
-      ilistmask[i] = 1;
+  int i,j,jnum,ninside;
+  double delx,dely,delz,rsq;
+  int *jlist,*numneigh,**firstneigh;
 
-    //multiple passes for loadbalancing
-    for (int i = 0; i < do_load_balance; i++)
-      load_balance();
-  }
+  double **x = atom->x;
+  int *type = atom->type;
 
-  int numpairs = 0;
   for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      numpairs += jnum;
-    }
-  }
+    i = list->ilist[ii];
 
-  if (do_load_balance)
-    for (int ii = 0; ii < ghostinum; ii++) {
-      int i = ghostilist[ii];
-      int jnum = ghostnumneigh[i];
-      numpairs += jnum;
-    }
-
-  // optimized schedule setting
-
-  int time_dynamic = 0;
-  int time_guided = 0;
-
-  if (schedule_user == 0) schedule_user = 4;
-
-  switch (schedule_user) {
-  case 1:
-    omp_set_schedule(omp_sched_static,1);
-    break;
-  case 2:
-    omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  case 3:
-    omp_set_schedule(omp_sched_guided,2);
-    break;
-  case 4:
-    omp_set_schedule(omp_sched_auto,0);
-    break;
-  case 5:
-    if (numpairs < 8*nthreads) omp_set_schedule(omp_sched_dynamic,1);
-    else if (schedule_time_guided < 0.0) {
-      omp_set_schedule(omp_sched_guided,2);
-      if (!eflag && !vflag) time_guided = 1;
-    } else if (schedule_time_dynamic<0.0) {
-      omp_set_schedule(omp_sched_dynamic,1);
-      if (!eflag && !vflag) time_dynamic = 1;
-    } else if (schedule_time_guided<schedule_time_dynamic)
-      omp_set_schedule(omp_sched_guided,2);
-    else
-      omp_set_schedule(omp_sched_dynamic,1);
-    break;
-  }
-
-  if (use_shared_arrays)
-    build_per_atom_arrays();
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(eflag,vflag,time_dynamic,time_guided) firstprivate(numpairs) default(none)
-#endif
-  {
-    // begin of pragma omp parallel
-
-    int tid = omp_get_thread_num();
-    int** pairs_tid_unique = NULL;
-
-    int** pairs;
-    if (use_shared_arrays) pairs = i_pairs;
-    else {
-      memory->create(pairs_tid_unique,numpairs,4,"numpairs");
-      pairs = pairs_tid_unique;
-    }
-
-    if (!use_shared_arrays) {
-      numpairs = 0;
-      for (int ii = 0; ii < list->inum; ii++) {
-        if ((do_load_balance <= 0) || ilistmask[ii]) {
-          int i = list->ilist[ii];
-          int jnum = list->numneigh[i];
-          for (int jj = 0; jj<jnum; jj++) {
-            pairs[numpairs][0] = i;
-            pairs[numpairs][1] = jj;
-            pairs[numpairs][2] = -1;
-            numpairs++;
-          }
-        }
-      }
-
-      for (int ii = 0; ii < ghostinum; ii++) {
-        int i = ghostilist[ii];
-        int jnum = ghostnumneigh[i];
-        for (int jj = 0; jj<jnum; jj++) {
-          pairs[numpairs][0] = i;
-          pairs[numpairs][1] = jj;
-          pairs[numpairs][2] = -1;
-          numpairs++;
-        }
-      }
-    }
-
-    int ielem;
-    int jj,k,jnum,jtype,ninside;
-    double delx,dely,delz,evdwl,rsq;
-    double fij[3];
-    int *jlist,*numneigh,**firstneigh;
-    evdwl = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp master
-#endif
-    {
-      ev_init(eflag,vflag);
-    }
-
-#if defined(_OPENMP)
-#pragma omp barrier
-    { ; }
-#endif
-
-    double **x = atom->x;
-    double **f = atom->f;
-    int *type = atom->type;
-    int nlocal = atom->nlocal;
-    int newton_pair = force->newton_pair;
-
-    numneigh = list->numneigh;
-    firstneigh = list->firstneigh;
-
-#ifdef TIMING_INFO
-    // only update micro timers after setup
-    static int count=0;
-    if (count<2) {
-      sna[tid]->timers[0] = 0;
-      sna[tid]->timers[1] = 0;
-      sna[tid]->timers[2] = 0;
-      sna[tid]->timers[3] = 0;
-      sna[tid]->timers[4] = 0;
-    }
-    count++;
-#endif
-
-    // did thread start working on interactions of new atom
-    int iold = -1;
-
-    double starttime, endtime;
-    if (time_dynamic || time_guided)
-      starttime = MPI_Wtime();
-
-#if defined(_OPENMP)
-#pragma omp for schedule(runtime)
-#endif
-    for (int iijj = 0; iijj < numpairs; iijj++) {
-      int i = 0;
-      if (use_shared_arrays) {
-        i = i_pairs[iijj][0];
-        if (iold != i) {
-          set_sna_to_shared(tid,i_pairs[iijj][3]);
-          ielem = map[type[i]];
-        }
-        iold = i;
-      } else {
-        i = pairs[iijj][0];
-        if (iold != i) {
-          iold = i;
-          const double xtmp = x[i][0];
-          const double ytmp = x[i][1];
-          const double ztmp = x[i][2];
-          const int itype = type[i];
-          ielem = map[itype];
-          const double radi = radelem[ielem];
-
-          if (i < nlocal) {
-            jlist = firstneigh[i];
-            jnum = numneigh[i];
-          } else {
-            jlist = ghostneighs+ghostfirstneigh[i];
-            jnum = ghostnumneigh[i];
-          }
-
-          // insure rij, inside, wj, and rcutij are of size jnum
-
-          sna[tid]->grow_rij(jnum);
-
-          // rij[][3] = displacements between atom I and those neighbors
-          // inside = indices of neighbors of I within cutoff
-          // wj = weights of neighbors of I within cutoff
-          // rcutij = cutoffs of neighbors of I within cutoff
-          // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
-
-          ninside = 0;
-          for (jj = 0; jj < jnum; jj++) {
-            int j = jlist[jj];
-            j &= NEIGHMASK;
-            delx = x[j][0] - xtmp; //unitialised
-            dely = x[j][1] - ytmp;
-            delz = x[j][2] - ztmp;
-            rsq = delx*delx + dely*dely + delz*delz;
-            jtype = type[j];
-            int jelem = map[jtype];
-
-            if (rsq < cutsq[itype][jtype]&&rsq>1e-20) { //unitialised
-              sna[tid]->rij[ninside][0] = delx;
-              sna[tid]->rij[ninside][1] = dely;
-              sna[tid]->rij[ninside][2] = delz;
-              sna[tid]->inside[ninside] = j;
-              sna[tid]->wj[ninside] = wjelem[jelem];
-              sna[tid]->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
-              ninside++;
-
-              // update index list with inside index
-              pairs[iijj + (jj - pairs[iijj][1])][2] =
-                ninside-1; //unitialised
-            }
-          }
-
-          // compute Ui and Zi for atom I
-
-          sna[tid]->compute_ui(ninside); //unitialised
-          sna[tid]->compute_zi();
-        }
-      }
-      if (quadraticflag) {
-        sna[tid]->compute_bi();
-        sna[tid]->copy_bi2bvec();
-      }
-
-      // for neighbors of I within cutoff:
-      // compute dUi/drj and dBi/drj
-      // Fij = dEi/dRj = -dEi/dRi => add to Fi, subtract from Fj
-
-      // entry into loop if inside index is set
-
-      double* coeffi = coeffelem[ielem];
-
-      if (pairs[iijj][2] >= 0) {
-        jj = pairs[iijj][2];
-        int j = sna[tid]->inside[jj];
-        sna[tid]->compute_duidrj(sna[tid]->rij[jj],
-                                 sna[tid]->wj[jj],sna[tid]->rcutij[jj]);
-
-        sna[tid]->compute_dbidrj();
-        sna[tid]->copy_dbi2dbvec();
-
-        fij[0] = 0.0;
-        fij[1] = 0.0;
-        fij[2] = 0.0;
-
-        // linear contributions
-
-        for (k = 1; k <= ncoeff; k++) {
-          double bgb = coeffi[k];
-          fij[0] += bgb*sna[tid]->dbvec[k-1][0];
-          fij[1] += bgb*sna[tid]->dbvec[k-1][1];
-          fij[2] += bgb*sna[tid]->dbvec[k-1][2];
-        }
-
-      // quadratic contributions
-
-      if (quadraticflag) {
-        int k = ncoeff+1;
-        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bveci = sna[tid]->bvec[icoeff];
-          double fack = coeffi[k]*bveci;
-          double* dbveci = sna[tid]->dbvec[icoeff];
-          fij[0] += fack*sna[tid]->dbvec[icoeff][0];
-          fij[1] += fack*sna[tid]->dbvec[icoeff][1];
-          fij[2] += fack*sna[tid]->dbvec[icoeff][2];
-          k++;
-          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-            double facki = coeffi[k]*bveci;
-            double fackj = coeffi[k]*sna[tid]->bvec[jcoeff];
-            double* dbvecj = sna[tid]->dbvec[jcoeff];
-            fij[0] += facki*dbvecj[0]+fackj*dbveci[0];
-            fij[1] += facki*dbvecj[1]+fackj*dbveci[1];
-            fij[2] += facki*dbvecj[2]+fackj*dbveci[2];
-            k++;
-          }
-        }
-      }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        {
-          f[i][0] += fij[0];
-          f[i][1] += fij[1];
-          f[i][2] += fij[2];
-          f[j][0] -= fij[0];
-          f[j][1] -= fij[1];
-          f[j][2] -= fij[2];
-
-          // tally per-atom virial contribution
-
-          if (vflag)
-            ev_tally_xyz(i,j,nlocal,newton_pair,0.0,0.0,
-                         fij[0],fij[1],fij[2],
-                         -sna[tid]->rij[jj][0],-sna[tid]->rij[jj][1],
-                         -sna[tid]->rij[jj][2]);
-        }
-      }
-
-      // evdwl = energy of atom I, sum over coeffs_k * Bi_k
-      // only call this for first pair of each atom i
-      // if atom has no pairs, eatom=0, which is wrong
-
-      if (eflag&&pairs[iijj][1] == 0) {
-        evdwl = coeffi[0];
-
-        if (!quadraticflag) {
-          sna[tid]->compute_bi();
-          sna[tid]->copy_bi2bvec();
-        }
-
-        // E = beta.B + 0.5*B^t.alpha.B
-        // coeff[k] = beta[k-1] or
-        // coeff[k] = alpha_ii or
-        // coeff[k] = alpha_ij = alpha_ji, j != i
-
-        // linear contributions
-
-        for (int k = 1; k <= ncoeff; k++)
-          evdwl += coeffi[k]*sna[tid]->bvec[k-1];
-
-        // quadratic contributions
-
-        if (quadraticflag) {
-          int k = ncoeff+1;
-          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bveci = sna[tid]->bvec[icoeff];
-            evdwl += 0.5*coeffi[k++]*bveci*bveci;
-            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              evdwl += coeffi[k++]*bveci*sna[tid]->bvec[jcoeff];
-            }
-          }
-        }
-
-#if defined(_OPENMP)
-#pragma omp critical
-#endif
-        ev_tally_full(i,2.0*evdwl,0.0,0.0,0.0,0.0,0.0);
-      }
-
-    }
-    if (time_dynamic || time_guided)
-      endtime = MPI_Wtime();
-    if (time_dynamic) schedule_time_dynamic = endtime - starttime;
-    if (time_guided) schedule_time_guided = endtime - starttime;
-    if (!use_shared_arrays) memory->destroy(pairs);
-
-  }// end of pragma omp parallel
-
-  if (vflag_fdotr) virial_fdotr_compute();
-
-}
-
-inline int PairSNAP::equal(double* x,double* y)
-{
-  double dist2 =
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-  if (dist2 < 1e-20) return 1;
-  return 0;
-}
-
-inline double PairSNAP::dist2(double* x,double* y)
-{
-  return
-    (x[0]-y[0])*(x[0]-y[0]) +
-    (x[1]-y[1])*(x[1]-y[1]) +
-    (x[2]-y[2])*(x[2]-y[2]);
-}
-
-// return extra communication cutoff
-// extra_cutoff = max(subdomain_length)
-
-double PairSNAP::extra_cutoff()
-{
-  double sublo[3],subhi[3];
-
-  if (domain->triclinic == 0) {
-    for (int dim = 0 ; dim < 3 ; dim++) {
-      sublo[dim] = domain->sublo[dim];
-      subhi[dim] = domain->subhi[dim];
-    }
-  } else {
-    domain->lamda2x(domain->sublo_lamda,sublo);
-    domain->lamda2x(domain->subhi_lamda,subhi);
-  }
-
-  double sub_size[3];
-  for (int dim = 0; dim < 3; dim++)
-    sub_size[dim] = subhi[dim] - sublo[dim];
-
-  double max_sub_size = 0;
-  for (int dim = 0; dim < 3; dim++)
-    max_sub_size = MAX(max_sub_size,sub_size[dim]);
-
-  // note: for triclinic, probably need something different
-  // see Comm::setup()
-
-  return max_sub_size;
-}
-
-// micro load_balancer: each MPI process will
-// check with each of its 26 neighbors,
-// whether an imbalance exists in the number
-// of atoms to calculate forces for.
-// If it does it will set ilistmask of one of
-// its local atoms to zero, and send its Tag
-// to the neighbor process. The neighboring process
-// will check its ghost list for the
-// ghost atom with the same Tag which is closest
-// to its domain center, and build a
-// neighborlist for this ghost atom. For this to work,
-// the communication cutoff has to be
-// as large as the neighbor cutoff +
-// maximum subdomain length.
-
-// Note that at most one atom is exchanged per processor pair.
-
-// Also note that the local atom assignment
-// doesn't change. This load balancer will cause
-// some ghost atoms to have full neighborlists
-// which are unique to PairSNAP.
-// They are not part of the generally accessible neighborlist.
-// At the same time corresponding local atoms on
-// other MPI processes will not be
-// included in the force computation since
-// their ilistmask is 0. This does not effect
-// any other classes which might
-// access the same general neighborlist.
-// Reverse communication (newton on) of forces is required.
-
-// Currently the load balancer does two passes,
-// since its exchanging atoms upstream and downstream.
-
-void PairSNAP::load_balance()
-{
-  double sublo[3],subhi[3];
-  if (domain->triclinic == 0) {
-    double* sublotmp = domain->sublo;
-    double* subhitmp = domain->subhi;
-    for (int dim = 0 ; dim<3 ; dim++) {
-      sublo[dim]=sublotmp[dim];
-      subhi[dim]=subhitmp[dim];
-    }
-  } else {
-    double* sublotmp = domain->sublo_lamda;
-    double* subhitmp = domain->subhi_lamda;
-    domain->lamda2x(sublotmp,sublo);
-    domain->lamda2x(subhitmp,subhi);
-  }
-
-  //if (list->inum==0) list->grow(atom->nmax);
-
-  int nlocal = ghostinum;
-  for (int i=0; i < list->inum; i++)
-    if (ilistmask[i]) nlocal++;
-  int ***grid2proc = comm->grid2proc;
-  int* procgrid = comm->procgrid;
-
-  int nlocal_up,nlocal_down;
-  MPI_Request request;
-
-  double sub_mid[3];
-  for (int dim=0; dim<3; dim++)
-    sub_mid[dim] = (subhi[dim] + sublo[dim])/2;
-
-  if (comm->cutghostuser <
-      neighbor->cutneighmax+extra_cutoff())
-    error->all(FLERR,"Communication cutoff too small for SNAP micro load balancing");
-
-  int nrecv = ghostinum;
-  int totalsend = 0;
-  int nsend = 0;
-  int depth = 1;
-
-  for (int dx = -depth; dx < depth+1; dx++)
-    for (int dy = -depth; dy < depth+1; dy++)
-      for (int dz = -depth; dz < depth+1; dz++) {
-
-        if (dx == dy && dy == dz && dz == 0) continue;
-
-        int sendloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        sendloc[0] += dx;
-        sendloc[1] += dy;
-        sendloc[2] += dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] >= procgrid[dim])
-            sendloc[dim] = sendloc[dim] - procgrid[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (sendloc[dim] < 0)
-            sendloc[dim] = procgrid[dim] + sendloc[dim];
-        int recvloc[3] = {comm->myloc[0],
-                          comm->myloc[1], comm->myloc[2]
-                         };
-        recvloc[0] -= dx;
-        recvloc[1] -= dy;
-        recvloc[2] -= dz;
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] < 0)
-            recvloc[dim] = procgrid[dim] + recvloc[dim];
-        for (int dim = 0; dim < 3; dim++)
-          if (recvloc[dim] >= procgrid[dim])
-            recvloc[dim] = recvloc[dim] - procgrid[dim];
-
-        int sendproc = grid2proc[sendloc[0]][sendloc[1]][sendloc[2]];
-        int recvproc = grid2proc[recvloc[0]][recvloc[1]][recvloc[2]];
-
-        // two stage process, first upstream movement, then downstream
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-        nsend = 0;
-
-        // send upstream
-
-        if (nlocal > nlocal_up+1) {
-
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i] == 0)
-            i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,recvproc,0,world,&request);
-          else {
-            int j = -1;
-            MPI_Isend(&j,1,MPI_INT,recvproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j = 0; j < list->inum; j++)
-              if (list->ilist[j] == i)
-                ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // recv downstream
-
-        if (nlocal < nlocal_down-1) {
-          nlocal++;
-          int get_tag = -1;
-          MPI_Recv(&get_tag,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-          // if get_tag -1 the other process didnt have local atoms to send
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal+atom->nghost+100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid, atom->x[j]) < mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid, atom->x[j]);
-                }
-
-            // build neighborlist for this particular
-            // ghost atom, and add it to list->ilist
-
-            if (ghostneighs_max - ghostneighs_total <
-                neighbor->oneatom) {
-              memory->grow(ghostneighs,
-                           ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            // find all neighbors by looping
-            // over all local and ghost atoms
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-
-        // decrease nlocal later, so that it is the
-        // initial number both for receiving and sending
-
-        if (nsend) nlocal--;
-
-        // second pass through the grid
-
-        MPI_Sendrecv(&nlocal,1,MPI_INT,sendproc,0,
-                     &nlocal_up,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-        MPI_Sendrecv(&nlocal,1,MPI_INT,recvproc,0,
-                     &nlocal_down,1,MPI_INT,sendproc,0,world,MPI_STATUS_IGNORE);
-
-        // send downstream
-
-        nsend=0;
-        if (nlocal > nlocal_down+1) {
-          int i = totalsend++;
-          while(i < list->inum && ilistmask[i]==0) i = totalsend++;
-
-          if (i < list->inum)
-            MPI_Isend(&atom->tag[i],1,MPI_INT,sendproc,0,world,&request);
-          else {
-            int j =- 1;
-            MPI_Isend(&j,1,MPI_INT,sendproc,0,world,&request);
-          }
-
-          if (i < list->inum) {
-            for (int j=0; j<list->inum; j++)
-              if (list->ilist[j] == i) ilistmask[j] = 0;
-            nsend = 1;
-          }
-        }
-
-        // receive upstream
-
-        if (nlocal < nlocal_up-1) {
-          nlocal++;
-          int get_tag = -1;
-
-          MPI_Recv(&get_tag,1,MPI_INT,recvproc,0,world,MPI_STATUS_IGNORE);
-
-          if (get_tag >= 0) {
-            if (ghostinum >= ghostilist_max) {
-              memory->grow(ghostilist,ghostinum+10,
-                           "PairSnap::ghostilist");
-              ghostilist_max = ghostinum+10;
-            }
-            if (atom->nlocal + atom->nghost >= ghostnumneigh_max) {
-              ghostnumneigh_max = atom->nlocal + atom->nghost + 100;
-              memory->grow(ghostnumneigh,ghostnumneigh_max,
-                           "PairSnap::ghostnumneigh");
-              memory->grow(ghostfirstneigh,ghostnumneigh_max,
-                           "PairSnap::ghostfirstneigh");
-            }
-
-            // find closest ghost image of the transfered particle
-
-            double mindist = 1e200;
-            int closestghost = -1;
-            for (int j = 0; j < atom->nlocal + atom->nghost; j++)
-              if (atom->tag[j] == get_tag)
-                if (dist2(sub_mid,atom->x[j])<mindist) {
-                  closestghost = j;
-                  mindist = dist2(sub_mid,atom->x[j]);
-                }
-
-            // build neighborlist for this particular ghost atom
-
-            if (ghostneighs_max-ghostneighs_total < neighbor->oneatom) {
-              memory->grow(ghostneighs,ghostneighs_total + neighbor->oneatom,
-                           "PairSnap::ghostneighs");
-              ghostneighs_max = ghostneighs_total + neighbor->oneatom;
-            }
-
-            int j = closestghost;
-
-            ghostilist[ghostinum] = j;
-            ghostnumneigh[j] = 0;
-            ghostfirstneigh[j] = ghostneighs_total;
-
-            ghostinum++;
-            int* jlist = ghostneighs + ghostfirstneigh[j];
-
-            for (int k = 0; k < atom->nlocal + atom->nghost; k++)
-              if (dist2(atom->x[j],atom->x[k]) <
-                  neighbor->cutneighmax*neighbor->cutneighmax) {
-                jlist[ghostnumneigh[j]] = k;
-                ghostnumneigh[j]++;
-                ghostneighs_total++;
-              }
-          }
-
-          if (get_tag >= 0) nrecv++;
-        }
-        if (nsend) nlocal--;
-      }
-}
-
-void PairSNAP::set_sna_to_shared(int snaid,int i)
-{
-  sna[snaid]->rij = i_rij[i];
-  sna[snaid]->inside = i_inside[i];
-  sna[snaid]->wj = i_wj[i];
-  sna[snaid]->rcutij = i_rcutij[i];
-  sna[snaid]->zarray_r = i_zarray_r[i];
-  sna[snaid]->zarray_i = i_zarray_i[i];
-  sna[snaid]->uarraytot_r = i_uarraytot_r[i];
-  sna[snaid]->uarraytot_i = i_uarraytot_i[i];
-}
-
-void PairSNAP::build_per_atom_arrays()
-{
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-
-  int count = 0;
-  int neighmax = 0;
-  for (int ii = 0; ii < list->inum; ii++)
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      neighmax=MAX(neighmax,list->numneigh[list->ilist[ii]]);
-      ++count;
-    }
-  for (int ii = 0; ii < ghostinum; ii++) {
-    neighmax=MAX(neighmax,ghostnumneigh[ghostilist[ii]]);
-    ++count;
-  }
-
-  if (i_max < count || i_neighmax < neighmax) {
-    int i_maxt = MAX(count,i_max);
-    i_neighmax = MAX(neighmax,i_neighmax);
-    memory->destroy(i_rij);
-    memory->destroy(i_inside);
-    memory->destroy(i_wj);
-    memory->destroy(i_rcutij);
-    memory->destroy(i_ninside);
-    memory->destroy(i_pairs);
-    memory->create(i_rij,i_maxt,i_neighmax,3,"PairSNAP::i_rij");
-    memory->create(i_inside,i_maxt,i_neighmax,"PairSNAP::i_inside");
-    memory->create(i_wj,i_maxt,i_neighmax,"PairSNAP::i_wj");
-    memory->create(i_rcutij,i_maxt,i_neighmax,"PairSNAP::i_rcutij");
-    memory->create(i_ninside,i_maxt,"PairSNAP::i_ninside");
-    memory->create(i_pairs,i_maxt*i_neighmax,4,"PairSNAP::i_pairs");
-  }
-
-  if (i_max < count) {
-    int jdim = sna[0]->twojmax+1;
-    memory->destroy(i_uarraytot_r);
-    memory->destroy(i_uarraytot_i);
-    memory->create(i_uarraytot_r,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_r");
-    memory->create(i_uarraytot_i,count,jdim,jdim,jdim,
-                   "PairSNAP::i_uarraytot_i");
-    if (i_zarray_r != NULL)
-      for (int i = 0; i < i_max; i++) {
-        memory->destroy(i_zarray_r[i]);
-        memory->destroy(i_zarray_i[i]);
-      }
-
-    delete [] i_zarray_r;
-    delete [] i_zarray_i;
-    i_zarray_r = new double*****[count];
-    i_zarray_i = new double*****[count];
-    for (int i = 0; i < count; i++) {
-      memory->create(i_zarray_r[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_r");
-      memory->create(i_zarray_i[i],jdim,jdim,jdim,jdim,jdim,
-                     "PairSNAP::i_zarray_i");
-    }
-  }
-
-  if (i_max < count)
-    i_max = count;
-
-  count = 0;
-  i_numpairs = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    if ((do_load_balance <= 0) || ilistmask[ii]) {
-      int i = list->ilist[ii];
-      int jnum = list->numneigh[i];
-      int* jlist = list->firstneigh[i];
-      const double xtmp = atom->x[i][0];
-      const double ytmp = atom->x[i][1];
-      const double ztmp = atom->x[i][2];
-      const int itype = atom->type[i];
-      const int ielem = map[itype];
-      const double radi = radelem[ielem];
-      int ninside = 0;
-      for (int jj = 0; jj < jnum; jj++) {
-        int j = jlist[jj];
-        j &= NEIGHMASK;
-        const double delx = atom->x[j][0] - xtmp;
-        const double dely = atom->x[j][1] - ytmp;
-        const double delz = atom->x[j][2] - ztmp;
-        const double rsq = delx*delx + dely*dely + delz*delz;
-        int jtype = atom->type[j];
-        int jelem = map[jtype];
-
-        i_pairs[i_numpairs][0] = i;
-        i_pairs[i_numpairs][1] = jj;
-        i_pairs[i_numpairs][2] = -1;
-        i_pairs[i_numpairs][3] = count;
-        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          i_rij[count][ninside][0] = delx;
-          i_rij[count][ninside][1] = dely;
-          i_rij[count][ninside][2] = delz;
-          i_inside[count][ninside] = j;
-          i_wj[count][ninside] = wjelem[jelem];
-          i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-
-          // update index list with inside index
-          i_pairs[i_numpairs][2] = ninside++;
-        }
-        i_numpairs++;
-      }
-      i_ninside[count] = ninside;
-      count++;
-    }
-  }
-
-  for (int ii = 0; ii < ghostinum; ii++) {
-    int i = ghostilist[ii];
-    int jnum = ghostnumneigh[i];
-    int* jlist = ghostneighs+ghostfirstneigh[i];
-    const double xtmp = atom->x[i][0];
-    const double ytmp = atom->x[i][1];
-    const double ztmp = atom->x[i][2];
-    const int itype = atom->type[i];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    const int itype = type[i];
     const int ielem = map[itype];
     const double radi = radelem[ielem];
-    int ninside = 0;
 
+    jlist = list->firstneigh[i];
+    jnum = list->numneigh[i];
+
+    // insure rij, inside, wj, and rcutij are of size jnum
+
+    snaptr->grow_rij(jnum);
+
+    // rij[][3] = displacements between atom I and those neighbors
+    // inside = indices of neighbors of I within cutoff
+    // wj = weights for neighbors of I within cutoff
+    // rcutij = cutoffs for neighbors of I within cutoff
+    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+    ninside = 0;
     for (int jj = 0; jj < jnum; jj++) {
-      int j = jlist[jj];
+      j = jlist[jj];
       j &= NEIGHMASK;
-      const double delx = atom->x[j][0] - xtmp;
-      const double dely = atom->x[j][1] - ytmp;
-      const double delz = atom->x[j][2] - ztmp;
-      const double rsq = delx*delx + dely*dely + delz*delz;
-      int jtype = atom->type[j];
+      delx = x[j][0] - xtmp;
+      dely = x[j][1] - ytmp;
+      delz = x[j][2] - ztmp;
+      rsq = delx*delx + dely*dely + delz*delz;
+      int jtype = type[j];
       int jelem = map[jtype];
 
-      i_pairs[i_numpairs][0] = i;
-      i_pairs[i_numpairs][1] = jj;
-      i_pairs[i_numpairs][2] = -1;
-      i_pairs[i_numpairs][3] = count;
       if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-        i_rij[count][ninside][0] = delx;
-        i_rij[count][ninside][1] = dely;
-        i_rij[count][ninside][2] = delz;
-        i_inside[count][ninside] = j;
-        i_wj[count][ninside] = wjelem[jelem];
-        i_rcutij[count][ninside] = (radi + radelem[jelem])*rcutfac;
-        // update index list with inside index
-        i_pairs[i_numpairs][2] = ninside++;
+        snaptr->rij[ninside][0] = delx;
+        snaptr->rij[ninside][1] = dely;
+        snaptr->rij[ninside][2] = delz;
+        snaptr->inside[ninside] = j;
+        snaptr->wj[ninside] = wjelem[jelem];
+        snaptr->rcutij[ninside] = (radi + radelem[jelem])*rcutfac;
+        ninside++;
       }
-      i_numpairs++;
     }
-    i_ninside[count] = ninside;
-    count++;
-  }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
 
-#if defined(_OPENMP)
-#pragma omp parallel for shared(count) default(none)
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    //sna[tid]->compute_ui(i_ninside[ii]);
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-    sna[tid]->compute_ui_omp(i_ninside[ii],MAX(int(nthreads/count),1));
-#ifdef TIMING_INFO
-    clock_gettime(CLOCK_REALTIME,&endtime);
-    sna[tid]->timers[0]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                          (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-  }
+    snaptr->compute_ui(ninside);
+    snaptr->compute_zi();
+    snaptr->compute_bi();
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&starttime);
-#endif
-  for (int ii=0; ii < count; ii++) {
-    int tid = 0;//omp_get_thread_num();
-    set_sna_to_shared(tid,ii);
-    sna[tid]->compute_zi_omp(MAX(int(nthreads/count),1));
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      bispectrum[ii][icoeff] = snaptr->blist[icoeff];
   }
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  sna[0]->timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-                      (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME,&endtime);
-  timers[1]+=(endtime.tv_sec-starttime.tv_sec+1.0*
-              (endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-#endif
 }
 
 /* ----------------------------------------------------------------------
@@ -1316,96 +354,8 @@ void PairSNAP::allocate()
 
 void PairSNAP::settings(int narg, char **arg)
 {
-
-  // set default values for optional arguments
-
-  nthreads = -1;
-  use_shared_arrays=-1;
-  do_load_balance = 0;
-  use_optimized = 1;
-
-  // optional arguments
-
-  for (int i=0; i < narg; i++) {
-    if (i+2>narg) error->all(FLERR,"Illegal pair_style command");
-    if (strcmp(arg[i],"nthreads")==0) {
-      nthreads=force->inumeric(FLERR,arg[++i]);
-#if defined(LMP_USER_OMP)
-      error->all(FLERR,"Must set number of threads via package omp command");
-#else
-      omp_set_num_threads(nthreads);
-      comm->nthreads=nthreads;
-#endif
-      continue;
-    }
-    if (strcmp(arg[i],"optimized")==0) {
-      use_optimized=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"shared")==0) {
-      use_shared_arrays=force->inumeric(FLERR,arg[++i]);
-      continue;
-    }
-    if (strcmp(arg[i],"loadbalance")==0) {
-      do_load_balance = force->inumeric(FLERR,arg[++i]);
-      if (do_load_balance) {
-        double mincutoff = extra_cutoff() +
-          rcutmax + neighbor->skin;
-        if (comm->cutghostuser < mincutoff) {
-          char buffer[255];
-
-          //apparently mincutoff is 0 after sprintf command ?????
-
-          double tmp = mincutoff + 0.1;
-          sprintf(buffer, "Communication cutoff is too small "
-                  "for SNAP micro load balancing, increased to %lf",
-                  mincutoff+0.1);
-          if (comm->me==0)
-            error->warning(FLERR,buffer);
-
-          comm->cutghostuser = tmp;
-
-        }
-      }
-      continue;
-    }
-    if (strcmp(arg[i],"schedule")==0) {
-      i++;
-      if (strcmp(arg[i],"static")==0)
-        schedule_user = 1;
-      if (strcmp(arg[i],"dynamic")==0)
-        schedule_user = 2;
-      if (strcmp(arg[i],"guided")==0)
-        schedule_user = 3;
-      if (strcmp(arg[i],"auto")==0)
-        schedule_user = 4;
-      if (strcmp(arg[i],"determine")==0)
-        schedule_user = 5;
-      if (schedule_user == 0)
-        error->all(FLERR,"Illegal pair_style command");
-      continue;
-    }
+  for (int i=0; i < narg; i++)
     error->all(FLERR,"Illegal pair_style command");
-  }
-
-  if (nthreads < 0)
-    nthreads = comm->nthreads;
-
-  if (use_shared_arrays < 0) {
-    if (nthreads > 1 && atom->nlocal <= 2*nthreads)
-      use_shared_arrays = 1;
-    else use_shared_arrays = 0;
-  }
-
-  // check if running non-optimized code with
-  // optimization flags set
-
-  if (!use_optimized)
-    if (nthreads > 1 ||
-        use_shared_arrays ||
-        do_load_balance ||
-        schedule_user)
-      error->all(FLERR,"Illegal pair_style command");
 }
 
 /* ----------------------------------------------------------------------
@@ -1493,26 +443,12 @@ void PairSNAP::coeff(int narg, char **arg)
 
   if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 
-  sna = new SNA*[nthreads];
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);
 
-  // allocate memory for per OpenMP thread data which
-  // is wrapped into the sna class
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid] = new SNA(lmp,rfac0,twojmax,
-                       diagonalstyle,use_shared_arrays,
-                       rmin0,switchflag,bzeroflag);
-    if (!use_shared_arrays)
-      sna[tid]->grow_rij(nmax);
-  }
-
-  if (ncoeff != sna[0]->ncoeff) {
+  if (ncoeff != snaptr->ncoeff) {
     if (comm->me == 0)
-      printf("ncoeff = %d snancoeff = %d \n",ncoeff,sna[0]->ncoeff);
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
     error->all(FLERR,"Incorrect SNAP parameter file");
   }
 
@@ -1539,13 +475,7 @@ void PairSNAP::init_style()
   neighbor->requests[irequest]->half = 0;
   neighbor->requests[irequest]->full = 1;
 
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    sna[tid]->init();
-  }
+  snaptr->init();
 
 }
 
@@ -1693,6 +623,8 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
     }
   }
 
+  if (comm->me == 0) fclose(fpcoeff);
+
   // set flags for required keywords
 
   rcutfacflag = 0;
@@ -1702,7 +634,6 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
 
   rfac0 = 0.99363;
   rmin0 = 0.0;
-  diagonalstyle = 3;
   switchflag = 1;
   bzeroflag = 1;
   quadraticflag = 0;
@@ -1763,8 +694,6 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
       rfac0 = atof(keyval);
     else if (strcmp(keywd,"rmin0") == 0)
       rmin0 = atof(keyval);
-    else if (strcmp(keywd,"diagonalstyle") == 0)
-      diagonalstyle = atoi(keyval);
     else if (strcmp(keywd,"switchflag") == 0)
       switchflag = atoi(keyval);
     else if (strcmp(keywd,"bzeroflag") == 0)
@@ -1787,14 +716,16 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
 double PairSNAP::memory_usage()
 {
   double bytes = Pair::memory_usage();
+
   int n = atom->ntypes+1;
-  bytes += n*n*sizeof(int);
-  bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
-  bytes += (2*ncoeffall)*sizeof(double);
-  bytes += (ncoeff*3)*sizeof(double);
-  bytes += sna[0]->memory_usage()*nthreads;
+  bytes += n*n*sizeof(int);      // setflag
+  bytes += n*n*sizeof(double);   // cutsq
+  bytes += n*sizeof(int);        // map
+  bytes += beta_max*ncoeff*sizeof(double); // bispectrum
+  bytes += beta_max*ncoeff*sizeof(double); // beta
+
+  bytes += snaptr->memory_usage(); // SNA object
+
   return bytes;
 }
 
diff --git a/src/SNAP/pair_snap.h b/src/SNAP/pair_snap.h
index 1fa065755c..c64eaa5d4e 100644
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@@ -29,8 +29,6 @@ public:
   PairSNAP(class LAMMPS *);
   ~PairSNAP();
   virtual void compute(int, int);
-  void compute_regular(int, int);
-  void compute_optimized(int, int);
   void settings(int, char **);
   virtual void coeff(int, char **);
   virtual void init_style();
@@ -42,56 +40,14 @@ public:
 
 protected:
   int ncoeffq, ncoeffall;
-  double **bvec, ***dbvec;
-  class SNA** sna;
-  int nmax;
-  int nthreads;
+  class SNA* snaptr;
   virtual void allocate();
   void read_files(char *, char *);
   inline int equal(double* x,double* y);
   inline double dist2(double* x,double* y);
-  double extra_cutoff();
-  void load_balance();
-  void set_sna_to_shared(int snaid,int i);
-  void build_per_atom_arrays();
 
-  int schedule_user;
-  double schedule_time_guided;
-  double schedule_time_dynamic;
-
-  int ncalls_neigh;
-  int do_load_balance;
-  int ilistmask_max;
-  int* ilistmask;
-  int ghostinum;
-  int ghostilist_max;
-  int* ghostilist;
-  int ghostnumneigh_max;
-  int* ghostnumneigh;
-  int* ghostneighs;
-  int* ghostfirstneigh;
-  int ghostneighs_total;
-  int ghostneighs_max;
-
-  int use_optimized;
-  int use_shared_arrays;
-
-  int i_max;
-  int i_neighmax;
-  int i_numpairs;
-  int **i_pairs;
-  double ***i_rij;
-  int **i_inside;
-  double **i_wj;
-  double **i_rcutij;
-  int *i_ninside;
-  double ****i_uarraytot_r, ****i_uarraytot_i;
-  double ******i_zarray_r, ******i_zarray_i;
-
-#ifdef TIMING_INFO
-  //  timespec starttime, endtime;
-  double timers[4];
-#endif
+  void compute_beta();
+  void compute_bispectrum();
 
   double rcutmax;               // max cutoff for all elements
   int nelements;                // # of unique elements
@@ -99,10 +55,13 @@ protected:
   double *radelem;              // element radii
   double *wjelem;               // elements weights
   double **coeffelem;           // element bispectrum coefficients
+  double** beta;                // betas for all atoms in list
+  double** bispectrum;          // bispectrum components for all atoms in list
   int *map;                     // mapping from atom types to elements
-  int twojmax, diagonalstyle, switchflag, bzeroflag;
+  int twojmax, switchflag, bzeroflag;
   double rfac0, rmin0, wj1, wj2;
   int rcutfacflag, twojmaxflag; // flags for required parameters
+  int beta_max;                 // length of beta
 };
 
 }
@@ -124,15 +83,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
-E: Must set number of threads via package omp command
-
-Because you are using the USER-OMP package, set the number of threads
-via its settings, not by the pair_style snap nthreads setting.
-
-W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
-
-Self-explanatory.
-
 E: Incorrect args for pair coefficients
 
 Self-explanatory.  Check the input script or data file.
diff --git a/src/SNAP/sna.cpp b/src/SNAP/sna.cpp
index 7ed1bc1e23..ec545c51b2 100644
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
@@ -21,7 +21,6 @@
 #include "math_extra.h"
 #include <cstring>
 #include <cstdlib>
-#include "openmp_snap.h"
 
 #include "memory.h"
 #include "error.h"
@@ -114,34 +113,30 @@ using namespace MathConst;
 ------------------------------------------------------------------------- */
 
 SNA::SNA(LAMMPS* lmp, double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in,
          double rmin0_in, int switch_flag_in, int bzero_flag_in) : Pointers(lmp)
 {
   wself = 1.0;
 
-  use_shared_arrays = use_shared_arrays_in;
   rfac0 = rfac0_in;
   rmin0 = rmin0_in;
   switch_flag = switch_flag_in;
   bzero_flag = bzero_flag_in;
 
   twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;
 
   ncoeff = compute_ncoeff();
 
-  create_twojmax_arrays();
-
-  bvec = NULL;
-  dbvec = NULL;
-  memory->create(bvec, ncoeff, "pair:bvec");
-  memory->create(dbvec, ncoeff, 3, "pair:dbvec");
   rij = NULL;
   inside = NULL;
   wj = NULL;
   rcutij = NULL;
   nmax = 0;
-  idxj = NULL;
+  idxz = NULL;
+  idxb = NULL;
+
+  build_indexlist();
+  create_twojmax_arrays();
 
   if (bzero_flag) {
     double www = wself*wself*wself;
@@ -149,134 +144,142 @@ SNA::SNA(LAMMPS* lmp, double rfac0_in,
       bzero[j] = www*(j+1);
   }
 
-#ifdef TIMING_INFO
-  timers = new double[20];
-  for(int i = 0; i < 20; i++) timers[i] = 0;
-  print = 0;
-  counter = 0;
-#endif
-
-  build_indexlist();
-
-
 }
 
 /* ---------------------------------------------------------------------- */
 
 SNA::~SNA()
 {
-  if(!use_shared_arrays) {
-    destroy_twojmax_arrays();
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->destroy(bvec);
-    memory->destroy(dbvec);
-  }
-  delete[] idxj;
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  delete[] idxz;
+  delete[] idxb;
+  destroy_twojmax_arrays();
 }
 
 void SNA::build_indexlist()
 {
-  if(diagonalstyle == 0) {
-    int idxj_count = 0;
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          idxj_count++;
+  // index list for cglist
 
-    // indexList can be changed here
+  int jdim = twojmax + 1;
+  memory->create(idxcg_block, jdim, jdim, jdim,
+                 "sna:idxcg_block");
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxcg_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxcg_block[j1][j2][j] = idxcg_count; 
+        for (int m1 = 0; m1 <= j1; m1++)
+          for (int m2 = 0; m2 <= j2; m2++)
+            idxcg_count++;
+      }
+  idxcg_max = idxcg_count;
 
-    idxj_count = 0;
+  // index list for uarray
+  // need to include both halves
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          idxj[idxj_count].j1 = j1;
-          idxj[idxj_count].j2 = j2;
-          idxj[idxj_count].j = j;
-          idxj_count++;
+  memory->create(idxu_block, jdim,
+                 "sna:idxu_block");
+
+  int idxu_count = 0;
+  
+  for(int j = 0; j <= twojmax; j++) {
+    idxu_block[j] = idxu_count; 
+    for(int mb = 0; mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++)
+        idxu_count++;
+  }
+  idxu_max = idxu_count;
+
+  // index list for beta and B
+
+  int idxb_count = 0;  
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) idxb_count++;
+  
+  idxb_max = idxb_count;
+  idxb = new SNA_BINDICES[idxb_max];
+  
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) {
+          idxb[idxb_count].j1 = j1;
+          idxb[idxb_count].j2 = j2;
+          idxb[idxb_count].j = j;
+          idxb_count++;
         }
-  }
 
-  if(diagonalstyle == 1) {
-    int idxj_count = 0;
+  // reverse index list for beta and b
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj_count++;
+  memory->create(idxb_block, jdim, jdim, jdim,
+                 "sna:idxb_block");
+  idxb_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        if (j >= j1) {
+          idxb_block[j1][j2][j] = idxb_count; 
+          idxb_count++;
+        }
       }
 
-    // indexList can be changed here
+  // index list for zlist
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
+  int idxz_count = 0;
 
-    idxj_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++)
+            idxz_count++;
+  
+  idxz_max = idxz_count;
+  idxz = new SNA_ZINDICES[idxz_max];
+  
+  memory->create(idxz_block, jdim, jdim, jdim,
+                 "sna:idxz_block");
+  
+  idxz_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxz_block[j1][j2][j] = idxz_count; 
 
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j = 0; j <= MIN(twojmax, 2 * j1); j += 2) {
-        idxj[idxj_count].j1 = j1;
-        idxj[idxj_count].j2 = j1;
-        idxj[idxj_count].j = j;
-        idxj_count++;
-      }
-  }
+        // find right beta[jjb] entry
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3
 
-  if(diagonalstyle == 2) {
-    int idxj_count = 0;
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++) {
+            idxz[idxz_count].j1 = j1;
+            idxz[idxz_count].j2 = j2;
+            idxz[idxz_count].j = j;
+            idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
+            idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2;
+            idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1;
+            idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
+            idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2;
+            idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;
 
-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj_count++;
-    }
+            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
 
-    // indexList can be changed here
+            const int jju = idxu_block[j] + (j+1)*mb + ma;
+            idxz[idxz_count].jju = jju;
 
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
-
-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++) {
-      idxj[idxj_count].j1 = j1;
-      idxj[idxj_count].j2 = j1;
-      idxj[idxj_count].j = j1;
-      idxj_count++;
-    }
-  }
-
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) idxj_count++;
-
-    // indexList can be changed here
-
-    idxj = new SNA_LOOPINDICES[idxj_count];
-    idxj_max = idxj_count;
-
-    idxj_count = 0;
-
-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            idxj[idxj_count].j1 = j1;
-            idxj[idxj_count].j2 = j2;
-            idxj[idxj_count].j = j;
-            idxj_count++;
+            idxz_count++;
           }
-  }
-
+      }
 }
+
 /* ---------------------------------------------------------------------- */
 
 void SNA::init()
@@ -292,17 +295,16 @@ void SNA::grow_rij(int newnmax)
 
   nmax = newnmax;
 
-  if(!use_shared_arrays) {
-    memory->destroy(rij);
-    memory->destroy(inside);
-    memory->destroy(wj);
-    memory->destroy(rcutij);
-    memory->create(rij, nmax, 3, "pair:rij");
-    memory->create(inside, nmax, "pair:inside");
-    memory->create(wj, nmax, "pair:wj");
-    memory->create(rcutij, nmax, "pair:rcutij");
- }
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->create(rij, nmax, 3, "pair:rij");
+  memory->create(inside, nmax, "pair:inside");
+  memory->create(wj, nmax, "pair:wj");
+  memory->create(rcutij, nmax, "pair:rcutij");
 }
+
 /* ----------------------------------------------------------------------
    compute Ui by summing over neighbors j
 ------------------------------------------------------------------------- */
@@ -320,10 +322,6 @@ void SNA::compute_ui(int jnum)
   zero_uarraytot();
   addself_uarraytot(wself);
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
   for(int j = 0; j < jnum; j++) {
     x = rij[j][0];
     y = rij[j][1];
@@ -339,48 +337,6 @@ void SNA::compute_ui(int jnum)
     add_uarraytot(r, wj[j], rcutij[j]);
   }
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[0] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-void SNA::compute_ui_omp(int jnum, int sub_threads)
-{
-  double rsq, r, x, y, z, z0, theta0;
-
-  // utot(j,ma,mb) = 0 for all j,ma,ma
-  // utot(j,ma,ma) = 1 for all j,ma
-  // for j in neighbors of i:
-  //   compute r0 = (x,y,z,z0)
-  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
-
-  zero_uarraytot();
-  addself_uarraytot(wself);
-
-  for(int j = 0; j < jnum; j++) {
-    x = rij[j][0];
-    y = rij[j][1];
-    z = rij[j][2];
-    rsq = x * x + y * y + z * z;
-    r = sqrt(rsq);
-    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
-    //    theta0 = (r - rmin0) * rscale0;
-    z0 = r / tan(theta0);
-    omp_set_num_threads(sub_threads);
-
-#if defined(_OPENMP)
-#pragma omp parallel shared(x,y,z,z0,r,sub_threads) default(none)
-#endif
-    {
-      compute_uarray_omp(x, y, z, z0, r, sub_threads);
-    }
-    add_uarraytot(r, wj[j], rcutij[j]);
-  }
-
-
 }
 
 /* ----------------------------------------------------------------------
@@ -389,137 +345,225 @@ void SNA::compute_ui_omp(int jnum, int sub_threads)
 
 void SNA::compute_zi()
 {
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,jmid
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
+  int ma2, mb2;
+  for(int jjz = 0; jjz < idxz_max; jjz++) {
+    const int j1 = idxz[jjz].j1;
+    const int j2 = idxz[jjz].j2;
+    const int j = idxz[jjz].j;
+    const int ma1min = idxz[jjz].ma1min;
+    const int ma2max = idxz[jjz].ma2max;
+    const int na = idxz[jjz].na;
+    const int mb1min = idxz[jjz].mb1min;
+    const int mb2max = idxz[jjz].mb2max;
+    const int nb = idxz[jjz].nb;
 
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
+    const double* cgblock = cglist + idxcg_block[j1][j2][j];
 
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++) {
-      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
-        double sumb1_r, sumb1_i;
-        int ma2, mb2;
-        for(int mb = 0; 2*mb <= j; mb++)
-          for(int ma = 0; ma <= j; ma++) {
-            zarray_r[j1][j2][j][ma][mb] = 0.0;
-            zarray_i[j1][j2][j][ma][mb] = 0.0;
+    zlist_r[jjz] = 0.0;
+    zlist_i[jjz] = 0.0;
 
-            for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-                ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-              sumb1_r = 0.0;
-              sumb1_i = 0.0;
+    int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+    int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+    int icgb = mb1min*(j2+1) + mb2max;
+    for(int ib = 0; ib < nb; ib++) {
 
-              ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
+      double suma1_r = 0.0;
+      double suma1_i = 0.0;
 
-              for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
-              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
+      const double* u1_r = &ulisttot_r[jju1];
+      const double* u1_i = &ulisttot_i[jju1];
+      const double* u2_r = &ulisttot_r[jju2];
+      const double* u2_i = &ulisttot_i[jju2];
 
-                mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-                sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
-                  (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
-                   uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
-                sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
-                  (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
-                   uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
-              } // end loop over mb1
+      int ma1 = ma1min;
+      int ma2 = ma2max;
+      int icga = ma1min*(j2+1) + ma2max;
+      for(int ia = 0; ia < na; ia++) {
+        suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
+        suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
+        ma1++;
+        ma2--;
+        icga += j2;
+      } // end loop over ia
 
-              zarray_r[j1][j2][j][ma][mb] +=
-                sumb1_r * cgarray[j1][j2][j][ma1][ma2];
-              zarray_i[j1][j2][j][ma][mb] +=
-                sumb1_i * cgarray[j1][j2][j][ma1][ma2];
-            } // end loop over ma1
-          } // end loop over ma, mb
-      } // end loop over j
-    } // end loop over j1, j2
+      zlist_r[jjz] += cgblock[icgb] * suma1_r;
+      zlist_i[jjz] += cgblock[icgb] * suma1_i;
+      jju1 += j1+1;
+      jju2 -= j2+1;
+      icgb += j2;
+    } // end loop over ib
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+  } // end loop over jjz
 }
 
-void SNA::compute_zi_omp(int sub_threads)
+/* ----------------------------------------------------------------------
+   compute Yi from Ui without storing Zi, looping over zlist indices
+------------------------------------------------------------------------- */
+
+void SNA::compute_yi(const double* beta)
 {
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            z(j1,j2,j,ma,mb) = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
-  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)
+  int j;
+  int jjz;
+  int jju;
+  double betaj;
 
-  if(omp_in_parallel())
-    omp_set_num_threads(sub_threads);
+  for(int j = 0; j <= twojmax; j++) {
+    jju = idxu_block[j];
+    for(int mb = 0; 2*mb <= j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+        ylist_r[jju] = 0.0;
+        ylist_i[jju] = 0.0;
+        jju++;
+      } // end loop over ma, mb
+  } // end loop over j
 
-  // compute_dbidrj() requires full j1/j2/j chunk of z elements
-  // use zarray j1/j2 symmetry
+  int ma2, mb2;
+  for(int jjz = 0; jjz < idxz_max; jjz++) {
+    const int j1 = idxz[jjz].j1;
+    const int j2 = idxz[jjz].j2;
+    const int j = idxz[jjz].j;
+    const int ma1min = idxz[jjz].ma1min;
+    const int ma2max = idxz[jjz].ma2max;
+    const int na = idxz[jjz].na;
+    const int mb1min = idxz[jjz].mb1min;
+    const int mb2max = idxz[jjz].mb2max;
+    const int nb = idxz[jjz].nb;
 
-#if defined(_OPENMP)
-#pragma omp parallel for schedule(auto) default(none)
-#endif
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++)
-      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+    const double* cgblock = cglist + idxcg_block[j1][j2][j];
+    int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2;
+    int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2;
 
-    double sumb1_r, sumb1_i;
-    int ma2, mb2;
+    double ztmp_r = 0.0;
+    double ztmp_i = 0.0;
 
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        zarray_r[j1][j2][j][ma][mb] = 0.0;
-        zarray_i[j1][j2][j][ma][mb] = 0.0;
+    int jju1 = idxu_block[j1] + (j1+1)*mb1min;
+    int jju2 = idxu_block[j2] + (j2+1)*mb2max;
+    int icgb = mb1min*(j2+1) + mb2max;
+    for(int ib = 0; ib < nb; ib++) {
 
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
-          sumb1_r = 0.0;
-          sumb1_i = 0.0;
+      double suma1_r = 0.0;
+      double suma1_i = 0.0;
 
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
+      const double* u1_r = &ulisttot_r[jju1];
+      const double* u1_i = &ulisttot_i[jju1];
+      const double* u2_r = &ulisttot_r[jju2];
+      const double* u2_i = &ulisttot_i[jju2];
 
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
-              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {
+      int ma1 = ma1min;
+      int ma2 = ma2max;
+      int icga = ma1min*(j2+1) + ma2max;
 
-            mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
-            sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
-               uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
-            sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
-              (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
-               uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
-          }
+      for(int ia = 0; ia < na; ia++) {
+        suma1_r += cgblock[icga] * (u1_r[ma1] * u2_r[ma2] - u1_i[ma1] * u2_i[ma2]);
+        suma1_i += cgblock[icga] * (u1_r[ma1] * u2_i[ma2] + u1_i[ma1] * u2_r[ma2]);
+        ma1++;
+        ma2--;
+        icga += j2;
+      } // end loop over ia
 
-          zarray_r[j1][j2][j][ma][mb] +=
-            sumb1_r * cgarray[j1][j2][j][ma1][ma2];
-          zarray_i[j1][j2][j][ma][mb] +=
-            sumb1_i * cgarray[j1][j2][j][ma1][ma2];
-        }
+      ztmp_r += cgblock[icgb] * suma1_r;
+      ztmp_i += cgblock[icgb] * suma1_i;
+      jju1 += j1+1;
+      jju2 -= j2+1;
+      icgb += j2;
+    } // end loop over ib
+
+    // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+    // find right y_list[jju] and beta[jjb] entries
+    // multiply and divide by j+1 factors
+    // account for multiplicity of 1, 2, or 3
+
+    const int jju = idxz[jjz].jju;
+
+  // pick out right beta value
+
+    if (j >= j1) {
+      const int jjb = idxb_block[j1][j2][j];
+      if (j1 == j) {
+        if (j2 == j) betaj = 3*beta[jjb];
+        else betaj = 2*beta[jjb];
+      } else betaj = beta[jjb]; 
+    } else if (j >= j2) {
+      const int jjb = idxb_block[j][j2][j1];
+      if (j2 == j) betaj = 2*beta[jjb]*(j1+1)/(j+1.0);
+      else betaj = beta[jjb]*(j1+1)/(j+1.0);
+    } else {
+      const int jjb = idxb_block[j2][j][j1];
+      betaj = beta[jjb]*(j1+1)/(j+1.0);
+    }
+
+    ylist_r[jju] += betaj*ztmp_r;
+    ylist_i[jju] += betaj*ztmp_i;
+
+  } // end loop over jjz
+}
+
+/* ----------------------------------------------------------------------
+   compute dEidRj
+------------------------------------------------------------------------- */
+
+void SNA::compute_deidrj(double* dedr)
+{
+
+  for(int k = 0; k < 3; k++)
+    dedr[k] = 0.0;
+
+  for(int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+
+    for(int mb = 0; 2*mb < j; mb++)
+      for(int ma = 0; ma <= j; ma++) {
+
+        double* dudr_r = dulist_r[jju];
+        double* dudr_i = dulist_i[jju];
+        double jjjmambyarray_r = ylist_r[jju];
+        double jjjmambyarray_i = ylist_i[jju];
+
+        for(int k = 0; k < 3; k++)
+          dedr[k] +=
+            dudr_r[k] * jjjmambyarray_r +
+            dudr_i[k] * jjjmambyarray_i;
+        jju++;
+      } //end loop over ma mb
+
+    // For j even, handle middle column
+
+    if (j%2 == 0) {
+
+      int mb = j/2;
+      for(int ma = 0; ma < mb; ma++) {
+        double* dudr_r = dulist_r[jju];
+        double* dudr_i = dulist_i[jju];
+        double jjjmambyarray_r = ylist_r[jju];
+        double jjjmambyarray_i = ylist_i[jju];
+
+        for(int k = 0; k < 3; k++)
+          dedr[k] +=
+            dudr_r[k] * jjjmambyarray_r +
+            dudr_i[k] * jjjmambyarray_i;
+        jju++;
       }
-  }
+
+      int ma = mb;
+      double* dudr_r = dulist_r[jju];
+      double* dudr_i = dulist_i[jju];
+      double jjjmambyarray_r = ylist_r[jju];
+      double jjjmambyarray_i = ylist_i[jju];
+
+      for(int k = 0; k < 3; k++)
+        dedr[k] += 
+          (dudr_r[k] * jjjmambyarray_r +
+           dudr_i[k] * jjjmambyarray_i)*0.5;
+      jju++;
+
+    } // end if jeven
+
+  } // end loop over j
+
+  for(int k = 0; k < 3; k++)
+    dedr[k] *= 2.0;
+
 }
 
 /* ----------------------------------------------------------------------
@@ -537,269 +581,46 @@ void SNA::compute_bi()
   //            b(j1,j2,j) +=
   //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j = idxb[jjb].j;
 
-  for(int j1 = 0; j1 <= twojmax; j1++)
-    for(int j2 = 0; j2 <= j1; j2++) {
-      for(int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        barray[j1][j2][j] = 0.0;
+    int jjz = idxz_block[j1][j2][j];
+    int jju = idxu_block[j];
+    double sumzu = 0.0;
+    for (int mb = 0; 2*mb < j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        sumzu += ulisttot_r[jju]*zlist_r[jjz] + 
+          ulisttot_i[jju]*zlist_i[jjz];
+        jjz++;
+        jju++;
+      } // end loop over ma, mb                                                                                                   
 
-        for(int mb = 0; 2*mb < j; mb++)
-          for(int ma = 0; ma <= j; ma++)
-            barray[j1][j2][j] +=
-              uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-              uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb];
+    // For j even, handle middle column
 
-        // For j even, special treatment for middle column
-
-        if (j%2 == 0) {
-          int mb = j/2;
-          for(int ma = 0; ma < mb; ma++)
-            barray[j1][j2][j] +=
-              uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-              uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb];
-          int ma = mb;
-          barray[j1][j2][j] +=
-            (uarraytot_r[j][ma][mb] * zarray_r[j1][j2][j][ma][mb] +
-             uarraytot_i[j][ma][mb] * zarray_i[j1][j2][j][ma][mb])*0.5;
-        }
-
-        barray[j1][j2][j] *= 2.0;
-        if (bzero_flag)
-          barray[j1][j2][j] -= bzero[j];
+    if (j%2 == 0) {
+      int mb = j/2;
+      for(int ma = 0; ma < mb; ma++) {
+        sumzu += ulisttot_r[jju]*zlist_r[jjz] + 
+          ulisttot_i[jju]*zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-    }
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[2] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+      sumzu += 0.5*(ulisttot_r[jju]*zlist_r[jjz] + 
+                   ulisttot_i[jju]*zlist_i[jjz]);
+      jjz++;
+      jju++;
+    } // end if jeven
 
-}
+    blist[jjb] = 2.0*sumzu;
 
-/* ----------------------------------------------------------------------
-   copy Bi array to a vector
-------------------------------------------------------------------------- */
-
-void SNA::copy_bi2bvec()
-{
-  int ncount, j1, j2, j;
-
-  ncount = 0;
-
-  for(j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2) {
-          bvec[ncount] = barray[j1][j2][j];
-          ncount++;
-        }
-    } else if(diagonalstyle == 1) {
-      j2 = j1;
-      for(j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        bvec[ncount] = barray[j1][j2][j];
-        ncount++;
-      }
-    } else if(diagonalstyle == 2) {
-      j = j2 = j1;
-      bvec[ncount] = barray[j1][j2][j];
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            bvec[ncount] = barray[j1][j2][j];
-            ncount++;
-          }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   calculate derivative of Ui w.r.t. atom j
-------------------------------------------------------------------------- */
-
-void SNA::compute_duidrj(double* rij, double wj, double rcut)
-{
-  double rsq, r, x, y, z, z0, theta0, cs, sn;
-  double dz0dr;
-
-  x = rij[0];
-  y = rij[1];
-  z = rij[2];
-  rsq = x * x + y * y + z * z;
-  r = sqrt(rsq);
-  double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
-  theta0 = (r - rmin0) * rscale0;
-  cs = cos(theta0);
-  sn = sin(theta0);
-  z0 = r * cs / sn;
-  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
-}
-
-/* ----------------------------------------------------------------------
-   calculate derivative of Bi w.r.t. atom j
-   variant using indexlist for j1,j2,j
-   variant not using symmetry relation
-------------------------------------------------------------------------- */
-
-void SNA::compute_dbidrj_nonsymm()
-{
-  // for j1 = 0,...,twojmax
-  //   for j2 = 0,twojmax
-  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
-  //        dbdr(j1,j2,j) = 0
-  //        for ma = 0,...,j
-  //          for mb = 0,...,j
-  //            dzdr = 0
-  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
-  //              sumb1 = 0
-  //              ma2 = ma-ma1+(j1+j2-j)/2;
-  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
-  //                mb2 = mb-mb1+(j1+j2-j)/2;
-  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
-  //                  (dudr(j1,ma1,mb1) * u(j2,ma2,mb2) +
-  //                  u(j1,ma1,mb1) * dudr(j2,ma2,mb2))
-  //              dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-  //            dbdr(j1,j2,j) +=
-  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-  //              Conj(u(j,ma,mb))*dzdr
-
-  double* dbdr;
-  double* dudr_r, *dudr_i;
-  double sumb1_r[3], sumb1_i[3], dzdr_r[3], dzdr_i[3];
-  int ma2;
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
-
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-    dbdr = dbarray[j1][j2][j];
-    dbdr[0] = 0.0;
-    dbdr[1] = 0.0;
-    dbdr[2] = 0.0;
-
-    double** *j1duarray_r = duarray_r[j1];
-    double** *j2duarray_r = duarray_r[j2];
-    double** *j1duarray_i = duarray_i[j1];
-    double** *j2duarray_i = duarray_i[j2];
-    double** j1uarraytot_r = uarraytot_r[j1];
-    double** j2uarraytot_r = uarraytot_r[j2];
-    double** j1uarraytot_i = uarraytot_i[j1];
-    double** j2uarraytot_i = uarraytot_i[j2];
-    double** j1j2jcgarray = cgarray[j1][j2][j];
-
-    for(int ma = 0; ma <= j; ma++)
-      for(int mb = 0; mb <= j; mb++) {
-        dzdr_r[0] = 0.0;
-        dzdr_r[1] = 0.0;
-        dzdr_r[2] = 0.0;
-        dzdr_i[0] = 0.0;
-        dzdr_i[1] = 0.0;
-        dzdr_i[2] = 0.0;
-
-        const int max_mb1 = MIN(j1, (2 * mb - j + j2 + j1) / 2) + 1;
-        const int max_ma1 = MIN(j1, (2 * ma - j + j2 + j1) / 2) + 1;
-
-        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
-            ma1 < max_ma1; ma1++) {
-
-          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
-          sumb1_r[0] = 0.0;
-          sumb1_r[1] = 0.0;
-          sumb1_r[2] = 0.0;
-          sumb1_i[0] = 0.0;
-          sumb1_i[1] = 0.0;
-          sumb1_i[2] = 0.0;
-
-          //inside loop 54 operations (mul and add)
-          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2),
-              mb2 = mb + (j1 + j2 - j) / 2 - mb1;
-              mb1 < max_mb1; mb1++, mb2--) {
-
-            double* dudr1_r, *dudr1_i, *dudr2_r, *dudr2_i;
-
-            dudr1_r = j1duarray_r[ma1][mb1];
-            dudr2_r = j2duarray_r[ma2][mb2];
-            dudr1_i = j1duarray_i[ma1][mb1];
-            dudr2_i = j2duarray_i[ma2][mb2];
-
-            const double cga_mb1mb2 = j1j2jcgarray[mb1][mb2];
-            const double uat_r_ma2mb2 = cga_mb1mb2 * j2uarraytot_r[ma2][mb2];
-            const double uat_r_ma1mb1 = cga_mb1mb2 * j1uarraytot_r[ma1][mb1];
-            const double uat_i_ma2mb2 = cga_mb1mb2 * j2uarraytot_i[ma2][mb2];
-            const double uat_i_ma1mb1 = cga_mb1mb2 * j1uarraytot_i[ma1][mb1];
-
-            for(int k = 0; k < 3; k++) {
-              sumb1_r[k] += dudr1_r[k] * uat_r_ma2mb2;
-              sumb1_r[k] -= dudr1_i[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_r[k] * uat_i_ma2mb2;
-              sumb1_i[k] += dudr1_i[k] * uat_r_ma2mb2;
-
-              sumb1_r[k] += dudr2_r[k] * uat_r_ma1mb1;
-              sumb1_r[k] -= dudr2_i[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_r[k] * uat_i_ma1mb1;
-              sumb1_i[k] += dudr2_i[k] * uat_r_ma1mb1;
-            }
-          } // end loop over mb1,mb2
-
-          // dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
-
-          dzdr_r[0] += sumb1_r[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[1] += sumb1_r[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_r[2] += sumb1_r[2] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[0] += sumb1_i[0] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[1] += sumb1_i[1] * j1j2jcgarray[ma1][ma2];
-          dzdr_i[2] += sumb1_i[2] * j1j2jcgarray[ma1][ma2];
-        } // end loop over ma1,ma2
-
-        // dbdr(j1,j2,j) +=
-        //   Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
-        //   Conj(u(j,ma,mb))*dzdr
-
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-
-        for(int k = 0; k < 3; k++)
-          dbdr[k] +=
-            (dudr_r[k] * zarray_r[j1][j2][j][ma][mb] +
-             dudr_i[k] * zarray_i[j1][j2][j][ma][mb]) +
-            (uarraytot_r[j][ma][mb] * dzdr_r[k] +
-             uarraytot_i[j][ma][mb] * dzdr_i[k]);
-      } //end loop over ma mb
-
-  } //end loop over j1 j2 j
-
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
+    // apply bzero shift
 
+    if (bzero_flag)
+      blist[jjb] -= bzero[j];
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -839,48 +660,36 @@ void SNA::compute_dbidrj()
   double** jjjzarray_i;
   double jjjmambzarray_r;
   double jjjmambzarray_i;
+  int jjz, jju;
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &starttime);
-#endif
+  for(int jjb = 0; jjb < idxb_max; jjb++) {
+    const int j1 = idxb[jjb].j1;
+    const int j2 = idxb[jjb].j2;
+    const int j = idxb[jjb].j;
 
-  for(int JJ = 0; JJ < idxj_max; JJ++) {
-    const int j1 = idxj[JJ].j1;
-    const int j2 = idxj[JJ].j2;
-    const int j = idxj[JJ].j;
-
-    dbdr = dbarray[j1][j2][j];
+    dbdr = dblist[jjb];
     dbdr[0] = 0.0;
     dbdr[1] = 0.0;
     dbdr[2] = 0.0;
 
     // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
 
+    jjz = idxz_block[j1][j2][j];
+    jju = idxu_block[j];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j1 >= j2) {
-      jjjzarray_r = zarray_r[j1][j2][j];
-      jjjzarray_i = zarray_i[j1][j2][j];
-    } else {
-      jjjzarray_r = zarray_r[j2][j1][j];
-      jjjzarray_i = zarray_i[j2][j1][j];
-    }
-
     for(int mb = 0; 2*mb < j; mb++)
       for(int ma = 0; ma <= j; ma++) {
-
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-        jjjmambzarray_r = jjjzarray_r[ma][mb];
-        jjjmambzarray_i = jjjzarray_i[ma][mb];
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       } //end loop over ma mb
 
     // For j even, handle middle column
@@ -888,24 +697,24 @@ void SNA::compute_dbidrj()
     if (j%2 == 0) {
       int mb = j/2;
       for(int ma = 0; ma < mb; ma++) {
-        dudr_r = duarray_r[j][ma][mb];
-        dudr_i = duarray_i[j][ma][mb];
-        jjjmambzarray_r = jjjzarray_r[ma][mb];
-        jjjmambzarray_i = jjjzarray_i[ma][mb];
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
       int ma = mb;
-      dudr_r = duarray_r[j][ma][mb];
-      dudr_i = duarray_i[j][ma][mb];
-      jjjmambzarray_r = jjjzarray_r[ma][mb];
-      jjjmambzarray_i = jjjzarray_i[ma][mb];
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if jeven
 
     for(int k = 0; k < 3; k++)
@@ -915,115 +724,97 @@ void SNA::compute_dbidrj()
 
     double j1fac = (j+1)/(j1+1.0);
 
+    jjz = idxz_block[j][j2][j1];
+    jju = idxu_block[j1];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j >= j2) {
-      jjjzarray_r = zarray_r[j][j2][j1];
-      jjjzarray_i = zarray_i[j][j2][j1];
-    } else {
-      jjjzarray_r = zarray_r[j2][j][j1];
-      jjjzarray_i = zarray_i[j2][j][j1];
-    }
-
-    for(int mb1 = 0; 2*mb1 < j1; mb1++)
-      for(int ma1 = 0; ma1 <= j1; ma1++) {
-
-        dudr_r = duarray_r[j1][ma1][mb1];
-        dudr_i = duarray_i[j1][ma1][mb1];
-        jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-        jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+    for(int mb = 0; 2*mb < j1; mb++)
+      for(int ma = 0; ma <= j1; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
-      } //end loop over ma1 mb1
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
+      } //end loop over ma mb
 
     // For j1 even, handle middle column
 
     if (j1%2 == 0) {
-      int mb1 = j1/2;
-      for(int ma1 = 0; ma1 < mb1; ma1++) {
-        dudr_r = duarray_r[j1][ma1][mb1];
-        dudr_i = duarray_i[j1][ma1][mb1];
-        jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-        jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+      int mb = j1/2;
+      for(int ma = 0; ma < mb; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-      int ma1 = mb1;
-      dudr_r = duarray_r[j1][ma1][mb1];
-      dudr_i = duarray_i[j1][ma1][mb1];
-      jjjmambzarray_r = jjjzarray_r[ma1][mb1];
-      jjjmambzarray_i = jjjzarray_i[ma1][mb1];
+      int ma = mb;
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if j1even
 
     for(int k = 0; k < 3; k++)
       dbdr[k] += 2.0*sumzdu_r[k]*j1fac;
 
-    // Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
+    // Sum over Conj(dudr(j2,ma2,mb2))*z(j,j1,j2,ma2,mb2)
 
     double j2fac = (j+1)/(j2+1.0);
 
+    jjz = idxz_block[j][j1][j2];
+    jju = idxu_block[j2];
+
     for(int k = 0; k < 3; k++)
       sumzdu_r[k] = 0.0;
 
-    // use zarray j1/j2 symmetry (optional)
-
-    if (j1 >= j) {
-      jjjzarray_r = zarray_r[j1][j][j2];
-      jjjzarray_i = zarray_i[j1][j][j2];
-    } else {
-      jjjzarray_r = zarray_r[j][j1][j2];
-      jjjzarray_i = zarray_i[j][j1][j2];
-    }
-
-    for(int mb2 = 0; 2*mb2 < j2; mb2++)
-      for(int ma2 = 0; ma2 <= j2; ma2++) {
-
-        dudr_r = duarray_r[j2][ma2][mb2];
-        dudr_i = duarray_i[j2][ma2][mb2];
-        jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-        jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+    for(int mb = 0; 2*mb < j2; mb++)
+      for(int ma = 0; ma <= j2; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
-
-      } //end loop over ma2 mb2
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
+      } //end loop over ma mb
 
     // For j2 even, handle middle column
 
     if (j2%2 == 0) {
-      int mb2 = j2/2;
-      for(int ma2 = 0; ma2 < mb2; ma2++) {
-        dudr_r = duarray_r[j2][ma2][mb2];
-        dudr_i = duarray_i[j2][ma2][mb2];
-        jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-        jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+      int mb = j2/2;
+      for(int ma = 0; ma < mb; ma++) {
+        dudr_r = dulist_r[jju];
+        dudr_i = dulist_i[jju];
         for(int k = 0; k < 3; k++)
           sumzdu_r[k] +=
-            dudr_r[k] * jjjmambzarray_r +
-            dudr_i[k] * jjjmambzarray_i;
+            dudr_r[k] * zlist_r[jjz] +
+            dudr_i[k] * zlist_i[jjz];
+        jjz++;
+        jju++;
       }
-      int ma2 = mb2;
-      dudr_r = duarray_r[j2][ma2][mb2];
-      dudr_i = duarray_i[j2][ma2][mb2];
-      jjjmambzarray_r = jjjzarray_r[ma2][mb2];
-      jjjmambzarray_i = jjjzarray_i[ma2][mb2];
+      int ma = mb;
+      dudr_r = dulist_r[jju];
+      dudr_i = dulist_i[jju];
       for(int k = 0; k < 3; k++)
         sumzdu_r[k] +=
-          (dudr_r[k] * jjjmambzarray_r +
-           dudr_i[k] * jjjmambzarray_i)*0.5;
+          (dudr_r[k] * zlist_r[jjz] +
+           dudr_i[k] * zlist_i[jjz])*0.5;
+      jjz++;
+      jju++;
     } // end if j2even
 
     for(int k = 0; k < 3; k++)
@@ -1031,84 +822,59 @@ void SNA::compute_dbidrj()
 
   } //end loop over j1 j2 j
 
-#ifdef TIMING_INFO
-  clock_gettime(CLOCK_REALTIME, &endtime);
-  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
-                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-#endif
-
 }
 
 /* ----------------------------------------------------------------------
-   copy Bi derivatives into a vector
+   calculate derivative of Ui w.r.t. atom j
 ------------------------------------------------------------------------- */
 
-void SNA::copy_dbi2dbvec()
+void SNA::compute_duidrj(double* rij, double wj, double rcut)
 {
-  int ncount, j1, j2, j;
+  double rsq, r, x, y, z, z0, theta0, cs, sn;
+  double dz0dr;
 
-  ncount = 0;
+  x = rij[0];
+  y = rij[1];
+  z = rij[2];
+  rsq = x * x + y * y + z * z;
+  r = sqrt(rsq);
+  double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
+  theta0 = (r - rmin0) * rscale0;
+  cs = cos(theta0);
+  sn = sin(theta0);
+  z0 = r * cs / sn;
+  dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
 
-  for(j1 = 0; j1 <= twojmax; j1++) {
-    if(diagonalstyle == 0) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2) {
-          dbvec[ncount][0] = dbarray[j1][j2][j][0];
-          dbvec[ncount][1] = dbarray[j1][j2][j][1];
-          dbvec[ncount][2] = dbarray[j1][j2][j][2];
-          ncount++;
-        }
-    } else if(diagonalstyle == 1) {
-      j2 = j1;
-      for(j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2) {
-        dbvec[ncount][0] = dbarray[j1][j2][j][0];
-        dbvec[ncount][1] = dbarray[j1][j2][j][1];
-        dbvec[ncount][2] = dbarray[j1][j2][j][2];
-        ncount++;
-      }
-    } else if(diagonalstyle == 2) {
-      j = j2 = j1;
-      dbvec[ncount][0] = dbarray[j1][j2][j][0];
-      dbvec[ncount][1] = dbarray[j1][j2][j][1];
-      dbvec[ncount][2] = dbarray[j1][j2][j][2];
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for(j2 = 0; j2 <= j1; j2++)
-        for(j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) {
-            dbvec[ncount][0] = dbarray[j1][j2][j][0];
-            dbvec[ncount][1] = dbarray[j1][j2][j][1];
-            dbvec[ncount][2] = dbarray[j1][j2][j][2];
-            ncount++;
-          }
-    }
-  }
+  compute_duarray(x, y, z, z0, r, dz0dr, wj, rcut);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::zero_uarraytot()
 {
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] = 0.0;
-        uarraytot_i[j][ma][mb] = 0.0;
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        ulisttot_r[jju] = 0.0;
+        ulisttot_i[jju] = 0.0;
+        jju++;
       }
+  }
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::addself_uarraytot(double wself_in)
 {
-  for (int j = 0; j <= twojmax; j++)
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
     for (int ma = 0; ma <= j; ma++) {
-      uarraytot_r[j][ma][ma] = wself_in;
-      uarraytot_i[j][ma][ma] = 0.0;
+      ulisttot_r[jju] = wself_in;
+      ulisttot_i[jju] = 0.0;
+      jju += j+2;
     }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1123,35 +889,17 @@ void SNA::add_uarraytot(double r, double wj, double rcut)
 
   sfac *= wj;
 
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] +=
-          sfac * uarray_r[j][ma][mb];
-        uarraytot_i[j][ma][mb] +=
-          sfac * uarray_i[j][ma][mb];
-      }
-}
-
-void SNA::add_uarraytot_omp(double r, double wj, double rcut)
-{
-  double sfac;
-
-  sfac = compute_sfac(r, rcut);
-
-  sfac *= wj;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        uarraytot_r[j][ma][mb] +=
-          sfac * uarray_r[j][ma][mb];
-        uarraytot_i[j][ma][mb] +=
-          sfac * uarray_i[j][ma][mb];
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        ulisttot_r[jju] +=
+          sfac * ulist_r[jju];
+        ulisttot_i[jju] +=
+          sfac * ulist_i[jju];
+        jju++;
       }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1175,145 +923,72 @@ void SNA::compute_uarray(double x, double y, double z,
 
   // VMK Section 4.8.2
 
-  uarray_r[0][0][0] = 1.0;
-  uarray_i[0][0][0] = 0.0;
+  ulist_r[0] = 1.0;
+  ulist_i[0] = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    int jjup = idxu_block[j-1];
 
     // fill in left side of matrix layer from previous layer
 
     for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
+      ulist_r[jju] = 0.0;
+      ulist_i[jju] = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
         rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] +=
+        ulist_r[jju] +=
           rootpq *
-          (a_r * uarray_r[j - 1][ma][mb] +
-           a_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] +=
+          (a_r * ulist_r[jjup] +
+           a_i * ulist_i[jjup]);
+        ulist_i[jju] +=
           rootpq *
-          (a_r * uarray_i[j - 1][ma][mb] -
-           a_i * uarray_r[j - 1][ma][mb]);
+          (a_r * ulist_i[jjup] -
+           a_i * ulist_r[jjup]);
 
         rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
+        ulist_r[jju+1] =
           -rootpq *
-          (b_r * uarray_r[j - 1][ma][mb] +
-           b_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
+          (b_r * ulist_r[jjup] +
+           b_i * ulist_i[jjup]);
+        ulist_i[jju+1] =
           -rootpq *
-          (b_r * uarray_i[j - 1][ma][mb] -
-           b_i * uarray_r[j - 1][ma][mb]);
+          (b_r * ulist_i[jjup] -
+           b_i * ulist_r[jjup]);
+        jju++;
+        jjup++;
       }
+      jju++;
     }
 
     // copy left side to right side with inversion symmetry VMK 4.4(2)
     // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
 
-    int mbpar = -1;
+    jju = idxu_block[j];
+    jjup = jju+(j+1)*(j+1)-1;
+    int mbpar = 1;
     for (int mb = 0; 2*mb <= j; mb++) {
-      mbpar = -mbpar;
-      int mapar = -mbpar;
+      int mapar = mbpar;
       for (int ma = 0; ma <= j; ma++) {
-        mapar = -mapar;
         if (mapar == 1) {
-          uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
+          ulist_r[jjup] = ulist_r[jju];
+          ulist_i[jjup] = -ulist_i[jju];
         } else {
-          uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
+          ulist_r[jjup] = -ulist_r[jju];
+          ulist_i[jjup] = ulist_i[jju];
         }
+        mapar = -mapar;
+        jju++;
+        jjup--;
       }
-    }
-  }
-}
-
-void SNA::compute_uarray_omp(double x, double y, double z,
-                             double z0, double r, int /*sub_threads*/)
-{
-  double r0inv;
-  double a_r, b_r, a_i, b_i;
-  double rootpq;
-
-  // compute Cayley-Klein parameters for unit quaternion
-
-  r0inv = 1.0 / sqrt(r * r + z0 * z0);
-  a_r = r0inv * z0;
-  a_i = -r0inv * z;
-  b_r = r0inv * y;
-  b_i = -r0inv * x;
-
-  // VMK Section 4.8.2
-
-  uarray_r[0][0][0] = 1.0;
-  uarray_i[0][0][0] = 0.0;
-
-  for (int j = 1; j <= twojmax; j++) {
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int mb = 0; mb < j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
-
-      for (int ma = 0; ma < j; ma++) {
-        rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_r[j - 1][ma][mb] +
-           a_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] +=
-          rootpq *
-          (a_r * uarray_i[j - 1][ma][mb] -
-           a_i * uarray_r[j - 1][ma][mb]);
-
-        rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_r[j - 1][ma][mb] +
-           b_i * uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
-          -rootpq *
-          (b_r * uarray_i[j - 1][ma][mb] -
-           b_i * uarray_r[j - 1][ma][mb]);
-      }
-    }
-
-    int mb = j;
-    uarray_r[j][0][mb] = 0.0;
-    uarray_i[j][0][mb] = 0.0;
-
-#if defined(_OPENMP)
-#pragma omp for
-#endif
-    for (int ma = 0; ma < j; ma++) {
-      rootpq = rootpqarray[j - ma][mb];
-      uarray_r[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_r[j - 1][ma][mb - 1] -
-         b_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma][mb] +=
-        rootpq *
-        (b_r * uarray_i[j - 1][ma][mb - 1] +
-         b_i * uarray_r[j - 1][ma][mb - 1]);
-
-      rootpq = rootpqarray[ma + 1][mb];
-      uarray_r[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_r[j - 1][ma][mb - 1] -
-         a_i * uarray_i[j - 1][ma][mb - 1]);
-      uarray_i[j][ma + 1][mb] =
-        rootpq *
-        (a_r * uarray_i[j - 1][ma][mb - 1] +
-         a_i * uarray_r[j - 1][ma][mb - 1]);
+      mbpar = -mbpar;
     }
   }
 }
 
 /* ----------------------------------------------------------------------
-   compute derivatives of Wigner U-functions for one neighbor
+   Compute derivatives of Wigner U-functions for one neighbor
    see comments in compute_uarray()
 ------------------------------------------------------------------------- */
 
@@ -1363,93 +1038,105 @@ void SNA::compute_duarray(double x, double y, double z,
   db_i[0] += -r0inv;
   db_r[1] += r0inv;
 
-  uarray_r[0][0][0] = 1.0;
-  duarray_r[0][0][0][0] = 0.0;
-  duarray_r[0][0][0][1] = 0.0;
-  duarray_r[0][0][0][2] = 0.0;
-  uarray_i[0][0][0] = 0.0;
-  duarray_i[0][0][0][0] = 0.0;
-  duarray_i[0][0][0][1] = 0.0;
-  duarray_i[0][0][0][2] = 0.0;
+  ulist_r[0] = 1.0;
+  dulist_r[0][0] = 0.0;
+  dulist_r[0][1] = 0.0;
+  dulist_r[0][2] = 0.0;
+  ulist_i[0] = 0.0;
+  dulist_i[0][0] = 0.0;
+  dulist_i[0][1] = 0.0;
+  dulist_i[0][2] = 0.0;
 
   for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    int jjup = idxu_block[j-1];
     for (int mb = 0; 2*mb <= j; mb++) {
-      uarray_r[j][0][mb] = 0.0;
-      duarray_r[j][0][mb][0] = 0.0;
-      duarray_r[j][0][mb][1] = 0.0;
-      duarray_r[j][0][mb][2] = 0.0;
-      uarray_i[j][0][mb] = 0.0;
-      duarray_i[j][0][mb][0] = 0.0;
-      duarray_i[j][0][mb][1] = 0.0;
-      duarray_i[j][0][mb][2] = 0.0;
+      ulist_r[jju] = 0.0;
+      dulist_r[jju][0] = 0.0;
+      dulist_r[jju][1] = 0.0;
+      dulist_r[jju][2] = 0.0;
+      ulist_i[jju] = 0.0;
+      dulist_i[jju][0] = 0.0;
+      dulist_i[jju][1] = 0.0;
+      dulist_i[jju][2] = 0.0;
 
       for (int ma = 0; ma < j; ma++) {
         rootpq = rootpqarray[j - ma][j - mb];
-        uarray_r[j][ma][mb] += rootpq *
-                               (a_r *  uarray_r[j - 1][ma][mb] +
-                                a_i *  uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma][mb] += rootpq *
-                               (a_r *  uarray_i[j - 1][ma][mb] -
-                                a_i *  uarray_r[j - 1][ma][mb]);
+        ulist_r[jju] += rootpq *
+                               (a_r *  ulist_r[jjup] +
+                                a_i *  ulist_i[jjup]);
+        ulist_i[jju] += rootpq *
+                               (a_r *  ulist_i[jjup] -
+                                a_i *  ulist_r[jjup]);
 
         for (int k = 0; k < 3; k++) {
-          duarray_r[j][ma][mb][k] +=
-            rootpq * (da_r[k] * uarray_r[j - 1][ma][mb] +
-                      da_i[k] * uarray_i[j - 1][ma][mb] +
-                      a_r * duarray_r[j - 1][ma][mb][k] +
-                      a_i * duarray_i[j - 1][ma][mb][k]);
-          duarray_i[j][ma][mb][k] +=
-            rootpq * (da_r[k] * uarray_i[j - 1][ma][mb] -
-                      da_i[k] * uarray_r[j - 1][ma][mb] +
-                      a_r * duarray_i[j - 1][ma][mb][k] -
-                      a_i * duarray_r[j - 1][ma][mb][k]);
+          dulist_r[jju][k] +=
+            rootpq * (da_r[k] * ulist_r[jjup] +
+                      da_i[k] * ulist_i[jjup] +
+                      a_r * dulist_r[jjup][k] +
+                      a_i * dulist_i[jjup][k]);
+          dulist_i[jju][k] +=
+            rootpq * (da_r[k] * ulist_i[jjup] -
+                      da_i[k] * ulist_r[jjup] +
+                      a_r * dulist_i[jjup][k] -
+                      a_i * dulist_r[jjup][k]);
         }
 
         rootpq = rootpqarray[ma + 1][j - mb];
-        uarray_r[j][ma + 1][mb] =
-          -rootpq * (b_r *  uarray_r[j - 1][ma][mb] +
-                     b_i *  uarray_i[j - 1][ma][mb]);
-        uarray_i[j][ma + 1][mb] =
-          -rootpq * (b_r *  uarray_i[j - 1][ma][mb] -
-                     b_i *  uarray_r[j - 1][ma][mb]);
+        ulist_r[jju+1] =
+          -rootpq * (b_r *  ulist_r[jjup] +
+                     b_i *  ulist_i[jjup]);
+        ulist_i[jju+1] =
+          -rootpq * (b_r *  ulist_i[jjup] -
+                     b_i *  ulist_r[jjup]);
 
         for (int k = 0; k < 3; k++) {
-          duarray_r[j][ma + 1][mb][k] =
-            -rootpq * (db_r[k] * uarray_r[j - 1][ma][mb] +
-                       db_i[k] * uarray_i[j - 1][ma][mb] +
-                       b_r * duarray_r[j - 1][ma][mb][k] +
-                       b_i * duarray_i[j - 1][ma][mb][k]);
-          duarray_i[j][ma + 1][mb][k] =
-            -rootpq * (db_r[k] * uarray_i[j - 1][ma][mb] -
-                       db_i[k] * uarray_r[j - 1][ma][mb] +
-                       b_r * duarray_i[j - 1][ma][mb][k] -
-                       b_i * duarray_r[j - 1][ma][mb][k]);
+          dulist_r[jju+1][k] =
+            -rootpq * (db_r[k] * ulist_r[jjup] +
+                       db_i[k] * ulist_i[jjup] +
+                       b_r * dulist_r[jjup][k] +
+                       b_i * dulist_i[jjup][k]);
+          dulist_i[jju+1][k] =
+            -rootpq * (db_r[k] * ulist_i[jjup] -
+                       db_i[k] * ulist_r[jjup] +
+                       b_r * dulist_i[jjup][k] -
+                       b_i * dulist_r[jjup][k]);
         }
+        jju++;
+        jjup++;
       }
+      jju++;
     }
 
-    int mbpar = -1;
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    jju = idxu_block[j];
+    jjup = jju+(j+1)*(j+1)-1;
+    int mbpar = 1;
     for (int mb = 0; 2*mb <= j; mb++) {
-      mbpar = -mbpar;
-      int mapar = -mbpar;
+      int mapar = mbpar;
       for (int ma = 0; ma <= j; ma++) {
-        mapar = -mapar;
         if (mapar == 1) {
-          uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
+          ulist_r[jjup] = ulist_r[jju];
+          ulist_i[jjup] = -ulist_i[jju];
           for (int k = 0; k < 3; k++) {
-            duarray_r[j][j-ma][j-mb][k] = duarray_r[j][ma][mb][k];
-            duarray_i[j][j-ma][j-mb][k] = -duarray_i[j][ma][mb][k];
+            dulist_r[jjup][k] = dulist_r[jju][k];
+            dulist_i[jjup][k] = -dulist_i[jju][k];
           }
         } else {
-          uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
-          uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
+          ulist_r[jjup] = -ulist_r[jju];
+          ulist_i[jjup] = ulist_i[jju];
           for (int k = 0; k < 3; k++) {
-            duarray_r[j][j-ma][j-mb][k] = -duarray_r[j][ma][mb][k];
-            duarray_i[j][j-ma][j-mb][k] = duarray_i[j][ma][mb][k];
+            dulist_r[jjup][k] = -dulist_r[jju][k];
+            dulist_i[jjup][k] = dulist_i[jju][k];
           }
         }
+        mapar = -mapar;
+        jju++;
+        jjup--;
       }
+      mbpar = -mbpar;
     }
   }
 
@@ -1458,23 +1145,25 @@ void SNA::compute_duarray(double x, double y, double z,
 
   sfac *= wj;
   dsfac *= wj;
-
-  for (int j = 0; j <= twojmax; j++)
-    for (int ma = 0; ma <= j; ma++)
-      for (int mb = 0; mb <= j; mb++) {
-        duarray_r[j][ma][mb][0] = dsfac * uarray_r[j][ma][mb] * ux +
-                                  sfac * duarray_r[j][ma][mb][0];
-        duarray_i[j][ma][mb][0] = dsfac * uarray_i[j][ma][mb] * ux +
-                                  sfac * duarray_i[j][ma][mb][0];
-        duarray_r[j][ma][mb][1] = dsfac * uarray_r[j][ma][mb] * uy +
-                                  sfac * duarray_r[j][ma][mb][1];
-        duarray_i[j][ma][mb][1] = dsfac * uarray_i[j][ma][mb] * uy +
-                                  sfac * duarray_i[j][ma][mb][1];
-        duarray_r[j][ma][mb][2] = dsfac * uarray_r[j][ma][mb] * uz +
-                                  sfac * duarray_r[j][ma][mb][2];
-        duarray_i[j][ma][mb][2] = dsfac * uarray_i[j][ma][mb] * uz +
-                                  sfac * duarray_i[j][ma][mb][2];
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j];
+    for (int mb = 0; 2*mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        dulist_r[jju][0] = dsfac * ulist_r[jju] * ux +
+                                  sfac * dulist_r[jju][0];
+        dulist_i[jju][0] = dsfac * ulist_i[jju] * ux +
+                                  sfac * dulist_i[jju][0];
+        dulist_r[jju][1] = dsfac * ulist_r[jju] * uy +
+                                  sfac * dulist_r[jju][1];
+        dulist_i[jju][1] = dsfac * ulist_i[jju] * uy +
+                                  sfac * dulist_i[jju][1];
+        dulist_r[jju][2] = dsfac * ulist_r[jju] * uz +
+                                  sfac * dulist_r[jju][2];
+        dulist_i[jju][2] = dsfac * ulist_i[jju] * uz +
+                                  sfac * dulist_i[jju][2];
+        jju++;
       }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1483,87 +1172,96 @@ void SNA::compute_duarray(double x, double y, double z,
 
 double SNA::memory_usage()
 {
+  int jdimpq = twojmax + 2;
   int jdim = twojmax + 1;
   double bytes;
-  bytes = jdim * jdim * jdim * jdim * jdim * sizeof(double);
-  bytes += 2 * jdim * jdim * jdim * sizeof(complex<double>);
-  bytes += 2 * jdim * jdim * jdim * sizeof(double);
-  bytes += jdim * jdim * jdim * 3 * sizeof(complex<double>);
-  bytes += jdim * jdim * jdim * 3 * sizeof(double);
-  bytes += ncoeff * sizeof(double);
-  bytes += jdim * jdim * jdim * jdim * jdim * sizeof(complex<double>);
+
+  bytes = 0;
+
+  bytes += jdimpq*jdimpq * sizeof(double);               // pqarray
+  bytes += idxcg_max * sizeof(double);                   // cglist
+
+  bytes += idxu_max * sizeof(double) * 2;                // ulist
+  bytes += idxu_max * sizeof(double) * 2;                // ulisttot
+  bytes += idxu_max * 3 * sizeof(double) * 2;            // dulist
+
+  bytes += idxz_max * sizeof(double) * 2;                // zlist
+  bytes += idxb_max * sizeof(double);                    // blist
+  bytes += idxb_max * 3 * sizeof(double);                // dblist
+  bytes += idxu_max * sizeof(double) * 2;                // ylist
+
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxcg_block
+  bytes += jdim * sizeof(int);                           // idxu_block
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxz_block
+  bytes += jdim * jdim * jdim * sizeof(int);             // idxb_block
+
+  bytes += idxz_max * sizeof(SNA_ZINDICES);              // idxz
+  bytes += idxb_max * sizeof(SNA_BINDICES);              // idxb
+
+  bytes += jdim * sizeof(double);                        // bzero
+
+  bytes += nmax * 3 * sizeof(double);                    // rij
+  bytes += nmax * sizeof(int);                           // inside
+  bytes += nmax * sizeof(double);                        // wj
+  bytes += nmax * sizeof(double);                        // rcutij
+
   return bytes;
 }
-
 /* ---------------------------------------------------------------------- */
 
 void SNA::create_twojmax_arrays()
 {
-  int jdim = twojmax + 1;
-
-  memory->create(cgarray, jdim, jdim, jdim, jdim, jdim,
-                 "sna:cgarray");
-  memory->create(rootpqarray, jdim+1, jdim+1,
+  int jdimpq = twojmax + 2;
+  memory->create(rootpqarray, jdimpq, jdimpq,
                  "sna:rootpqarray");
-  memory->create(barray, jdim, jdim, jdim,
-                 "sna:barray");
-  memory->create(dbarray, jdim, jdim, jdim, 3,
-                 "sna:dbarray");
-
-  memory->create(duarray_r, jdim, jdim, jdim, 3,
-                 "sna:duarray");
-  memory->create(duarray_i, jdim, jdim, jdim, 3,
-                 "sna:duarray");
-
-  memory->create(uarray_r, jdim, jdim, jdim,
-                 "sna:uarray");
-  memory->create(uarray_i, jdim, jdim, jdim,
-                 "sna:uarray");
+  memory->create(cglist, idxcg_max, "sna:cglist");
+  memory->create(ulist_r, idxu_max, "sna:ulist");
+  memory->create(ulist_i, idxu_max, "sna:ulist");
+  memory->create(ulisttot_r, idxu_max, "sna:ulisttot");
+  memory->create(ulisttot_i, idxu_max, "sna:ulisttot");
+  memory->create(dulist_r, idxu_max, 3, "sna:dulist");
+  memory->create(dulist_i, idxu_max, 3, "sna:dulist");
+  memory->create(zlist_r, idxz_max, "sna:zlist");
+  memory->create(zlist_i, idxz_max, "sna:zlist");
+  memory->create(blist, idxb_max, "sna:blist");
+  memory->create(dblist, idxb_max, 3, "sna:dblist");
+  memory->create(ylist_r, idxu_max, "sna:ylist");
+  memory->create(ylist_i, idxu_max, "sna:ylist");
 
   if (bzero_flag)
-    memory->create(bzero, jdim,"sna:bzero");
+    memory->create(bzero, twojmax+1,"sna:bzero");
   else
     bzero = NULL;
 
-
-  if(!use_shared_arrays) {
-    memory->create(uarraytot_r, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_r, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-    memory->create(uarraytot_i, jdim, jdim, jdim,
-                   "sna:uarraytot");
-    memory->create(zarray_i, jdim, jdim, jdim, jdim, jdim,
-                   "sna:zarray");
-  }
-
 }
 
 /* ---------------------------------------------------------------------- */
 
 void SNA::destroy_twojmax_arrays()
 {
-  memory->destroy(cgarray);
   memory->destroy(rootpqarray);
-  memory->destroy(barray);
+  memory->destroy(cglist);
+  memory->destroy(ulist_r);
+  memory->destroy(ulist_i);
+  memory->destroy(ulisttot_r);
+  memory->destroy(ulisttot_i);
+  memory->destroy(dulist_r);
+  memory->destroy(dulist_i);
+  memory->destroy(zlist_r);
+  memory->destroy(zlist_i);
+  memory->destroy(blist);
+  memory->destroy(dblist);
+  memory->destroy(ylist_r);
+  memory->destroy(ylist_i);
 
-  memory->destroy(dbarray);
-
-  memory->destroy(duarray_r);
-  memory->destroy(duarray_i);
-
-  memory->destroy(uarray_r);
-  memory->destroy(uarray_i);
+  memory->destroy(idxcg_block);
+  memory->destroy(idxu_block);
+  memory->destroy(idxz_block);
+  memory->destroy(idxb_block);
 
   if (bzero_flag)
     memory->destroy(bzero);
 
-  if(!use_shared_arrays) {
-    memory->destroy(uarraytot_r);
-    memory->destroy(zarray_r);
-    memory->destroy(uarraytot_i);
-    memory->destroy(zarray_i);
-  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1779,28 +1477,33 @@ void SNA::init_clebsch_gordan()
   int m, aa2, bb2, cc2;
   int ifac;
 
-  for (int j1 = 0; j1 <= twojmax; j1++)
-    for (int j2 = 0; j2 <= twojmax; j2++)
-      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
-        for (int m1 = 0; m1 <= j1; m1 += 1) {
+  int idxcg_count = 0;
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        for (int m1 = 0; m1 <= j1; m1++) {
           aa2 = 2 * m1 - j1;
 
-          for (int m2 = 0; m2 <= j2; m2 += 1) {
+          for (int m2 = 0; m2 <= j2; m2++) {
 
             // -c <= cc <= c
 
             bb2 = 2 * m2 - j2;
             m = (aa2 + bb2 + j) / 2;
 
-            if(m < 0 || m > j) continue;
+            if(m < 0 || m > j) {
+              cglist[idxcg_count] = 0.0;
+              idxcg_count++;
+              continue;
+            }
 
             sum = 0.0;
 
             for (int z = MAX(0, MAX(-(j - j2 + aa2)
-                                   / 2, -(j - j1 - bb2) / 2));
-                z <= MIN((j1 + j2 - j) / 2,
-                         MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
-                z++) {
+                                    / 2, -(j - j1 - bb2) / 2));
+                 z <= MIN((j1 + j2 - j) / 2,
+                          MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
+                 z++) {
               ifac = z % 2 ? -1 : 1;
               sum += ifac /
                 (factorial(z) *
@@ -1810,20 +1513,22 @@ void SNA::init_clebsch_gordan()
                  factorial((j - j2 + aa2) / 2 + z) *
                  factorial((j - j1 - bb2) / 2 + z));
             }
-
+            
             cc2 = 2 * m - j;
             dcg = deltacg(j1, j2, j);
             sfaccg = sqrt(factorial((j1 + aa2) / 2) *
-                        factorial((j1 - aa2) / 2) *
-                        factorial((j2 + bb2) / 2) *
-                        factorial((j2 - bb2) / 2) *
-                        factorial((j  + cc2) / 2) *
-                        factorial((j  - cc2) / 2) *
-                        (j + 1));
-
-            cgarray[j1][j2][j][m1][m2] = sum * dcg * sfaccg;
+                          factorial((j1 - aa2) / 2) *
+                          factorial((j2 + bb2) / 2) *
+                          factorial((j2 - bb2) / 2) *
+                          factorial((j  + cc2) / 2) *
+                          factorial((j  - cc2) / 2) *
+                          (j + 1));
+            
+            cglist[idxcg_count] = sum * dcg * sfaccg;
+            idxcg_count++;
           }
         }
+      }
 }
 
 /* ----------------------------------------------------------------------
@@ -1838,74 +1543,6 @@ void SNA::init_rootpqarray()
       rootpqarray[p][q] = sqrt(static_cast<double>(p)/q);
 }
 
-/* ----------------------------------------------------------------------
-   a = j/2
-------------------------------------------------------------------------- */
-
-void SNA::jtostr(char* str, int j)
-{
-  if(j % 2 == 0)
-    sprintf(str, "%d", j / 2);
-  else
-    sprintf(str, "%d/2", j);
-}
-
-/* ----------------------------------------------------------------------
-   aa = m - j/2
-------------------------------------------------------------------------- */
-
-void SNA::mtostr(char* str, int j, int m)
-{
-  if(j % 2 == 0)
-    sprintf(str, "%d", m - j / 2);
-  else
-    sprintf(str, "%d/2", 2 * m - j);
-}
-
-/* ----------------------------------------------------------------------
-   list values of Clebsch-Gordan coefficients
-   using notation of VMK Table 8.11
-------------------------------------------------------------------------- */
-
-void SNA::print_clebsch_gordan(FILE* file)
-{
-  char stra[20], strb[20], strc[20], straa[20], strbb[20], strcc[20];
-  int m, aa2, bb2;
-
-  fprintf(file, "a, aa, b, bb, c, cc, c(a,aa,b,bb,c,cc) \n");
-
-  for (int j1 = 0; j1 <= twojmax; j1++) {
-    jtostr(stra, j1);
-
-    for (int j2 = 0; j2 <= twojmax; j2++) {
-      jtostr(strb, j2);
-
-      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-        jtostr(strc, j);
-
-        for (int m1 = 0; m1 <= j1; m1 += 1) {
-          mtostr(straa, j1, m1);
-          aa2 = 2 * m1 - j1;
-
-          for (int m2 = 0; m2 <= j2; m2 += 1) {
-            bb2 = 2 * m2 - j2;
-            m = (aa2 + bb2 + j) / 2;
-
-            if(m < 0 || m > j) continue;
-
-            mtostr(strbb, j2, m2);
-            mtostr(strcc, j, m);
-
-            fprintf(file, "%s\t%s\t%s\t%s\t%s\t%s\t%g\n",
-                    stra, straa, strb, strbb, strc, strcc,
-                    cgarray[j1][j2][j][m1][m2]);
-          }
-        }
-      }
-    }
-  }
-}
-
 /* ---------------------------------------------------------------------- */
 
 int SNA::compute_ncoeff()
@@ -1915,25 +1552,10 @@ int SNA::compute_ncoeff()
   ncount = 0;
 
   for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
-      for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2;
+           j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) ncount++;
 
   return ncount;
 }
diff --git a/src/SNAP/sna.h b/src/SNAP/sna.h
index d05ad0fb84..1e08ef123c 100644
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@@ -18,20 +18,22 @@
 #ifndef LMP_SNA_H
 #define LMP_SNA_H
 
-#include <complex>
 #include "pointers.h"
-#include <ctime>
 
 namespace LAMMPS_NS {
 
-struct SNA_LOOPINDICES {
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
+};
+
+struct SNA_BINDICES {
   int j1, j2, j;
 };
 
 class SNA : protected Pointers {
 
 public:
-  SNA(LAMMPS*, double, int, int, int, double, int, int);
+  SNA(LAMMPS*, double, int, double, int, int);
 
   SNA(LAMMPS* lmp) : Pointers(lmp) {};
   ~SNA();
@@ -44,31 +46,21 @@ public:
   // functions for bispectrum coefficients
 
   void compute_ui(int);
-  void compute_ui_omp(int, int);
   void compute_zi();
-  void compute_zi_omp(int);
+  void compute_yi(const double*);
+  void compute_yterm(int, int, int, const double*);
   void compute_bi();
-  void copy_bi2bvec();
 
   // functions for derivatives
 
   void compute_duidrj(double*, double, double);
   void compute_dbidrj();
-  void compute_dbidrj_nonsymm();
-  void copy_dbi2dbvec();
+  void compute_deidrj(double*);
   double compute_sfac(double, double);
   double compute_dsfac(double, double);
 
-#ifdef TIMING_INFO
-  double* timers;
-  timespec starttime, endtime;
-  int print;
-  int counter;
-#endif
-
-  //per sna class instance for OMP use
-
-  double* bvec, ** dbvec;
+  double* blist;
+  double** dblist;
   double** rij;
   int* inside;
   double* wj;
@@ -77,29 +69,32 @@ public:
 
   void grow_rij(int);
 
-  int twojmax, diagonalstyle;
-  double*** uarraytot_r, *** uarraytot_i;
-  double***** zarray_r, ***** zarray_i;
-  double*** uarraytot_r_b, *** uarraytot_i_b;
-  double***** zarray_r_b, ***** zarray_i_b;
-  double*** uarray_r, *** uarray_i;
+  int twojmax;
 
 private:
   double rmin0, rfac0;
 
-  //use indexlist instead of loops, constructor generates these
-  SNA_LOOPINDICES* idxj;
-  int idxj_max;
   // data for bispectrum coefficients
 
-  double***** cgarray;
+  SNA_ZINDICES* idxz;
+  SNA_BINDICES* idxb;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
   double** rootpqarray;
-  double*** barray;
+  double* cglist;  
+  int*** idxcg_block; 
 
-  // derivatives of data
+  double* ulisttot_r, * ulisttot_i;
+  double* ulist_r, * ulist_i;
+  int* idxu_block;
 
-  double**** duarray_r, **** duarray_i;
-  double**** dbarray;
+  double* zlist_r, * zlist_i;
+  int*** idxz_block;
+
+  int*** idxb_block;
+
+  double** dulist_r, ** dulist_i;
+  double* ylist_r, * ylist_i;
 
   static const int nmaxfactorial = 167;
   static const double nfac_table[];
@@ -109,28 +104,16 @@ private:
   void destroy_twojmax_arrays();
   void init_clebsch_gordan();
   void init_rootpqarray();
-  void jtostr(char*, int);
-  void mtostr(char*, int, int);
-  void print_clebsch_gordan(FILE*);
   void zero_uarraytot();
   void addself_uarraytot(double);
   void add_uarraytot(double, double, double);
-  void add_uarraytot_omp(double, double, double);
   void compute_uarray(double, double, double,
                       double, double);
-  void compute_uarray_omp(double, double, double,
-                          double, double, int);
   double deltacg(int, int, int);
   int compute_ncoeff();
   void compute_duarray(double, double, double,
                        double, double, double, double, double);
 
-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
   // Sets the style for the switching function
   // 0 = none
   // 1 = cosine
@@ -140,7 +123,7 @@ private:
   double wself;
 
   int bzero_flag; // 1 if bzero subtracted from barray
-  double *bzero;  // array of B values for isolated atoms
+  double* bzero;  // array of B values for isolated atoms
 };
 
 }