diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index c45a1d778c..0d54913bd7 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -265,7 +265,7 @@ OPT.
    * :doc:`smd/tri_surface <pair_smd_triangulated_surface>`
    * :doc:`smd/ulsph <pair_smd_ulsph>`
    * :doc:`smtbq <pair_smtbq>`
-   * :doc:`snap (k) <pair_snap>`
+   * :doc:`snap (ik) <pair_snap>`
    * :doc:`soft (go) <pair_soft>`
    * :doc:`sph/heatconduction <pair_sph_heatconduction>`
    * :doc:`sph/idealgas <pair_sph_idealgas>`
diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst
index ebedb288c1..ffc43c712a 100644
--- a/doc/src/pair_snap.rst
+++ b/doc/src/pair_snap.rst
@@ -1,10 +1,11 @@
 .. index:: pair_style snap
+.. index:: pair_style snap/intel
 .. index:: pair_style snap/kk
 
 pair_style snap command
 =======================
 
-Accelerator Variants: *snap/kk*
+Accelerator Variants: *snap/intel*, *snap/kk*
 
 Syntax
 """"""
@@ -260,6 +261,14 @@ This style is part of the ML-SNAP package.  It is only enabled if LAMMPS
 was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.
 
+The *snap/intel* accelerator variant will *only* be available if LAMMPS
+is built with Intel *compilers* and for CPUs with AVX-512 support.
+While the INTEL package in general allows multiple floating point
+precision modes to be selected, *snap/intel* will currently always use
+full double precision regardless of the precision mode selected.
+Additionally, the *intel* variant of snap will **NOT** use multiple
+threads with OpenMP.
+
 Related commands
 """"""""""""""""
 
diff --git a/src/Depend.sh b/src/Depend.sh
index 4004f12686..b88c527b55 100755
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -185,6 +185,7 @@ fi
 if (test $1 = "ML-SNAP") then
   depend ML-IAP
   depend KOKKOS
+  depend INTEL
 fi
 
 if (test $1 = "CG-SPICA") then
diff --git a/src/INTEL/TEST/in.intel.snap b/src/INTEL/TEST/in.intel.snap
new file mode 100644
index 0000000000..4e45fe01f5
--- /dev/null
+++ b/src/INTEL/TEST/in.intel.snap
@@ -0,0 +1,70 @@
+# Toy demonstration of SNAP "scale" parameter, using fix/adapt and hybrid/overlay
+# Mixing linear and quadratic SNAP Ni potentials by Zuo et al. JCPA 2020
+
+variable	w index 10	# Warmup Timesteps
+variable	t index 100	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+
+variable	x index 4
+variable	y index 2
+variable	z index 2
+
+variable	rr equal floor($t*$m)
+variable        root getenv LMP_ROOT
+
+if "$n > 0"	then "processors * * * grid numa"
+
+# mixing parameter
+
+variable lambda equal 0.2
+
+# Initialize simulation
+
+variable a equal 3.52
+units           metal
+
+# generate the box and atom positions using a FCC lattice
+variable nx equal 20*$x
+variable ny equal 20*$y
+variable nz equal 20*$z
+
+boundary        p p p
+
+lattice         fcc $a
+region          box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box      1 box
+create_atoms    1 box
+
+mass 1 34.
+
+# choose bundled SNAP Ni potential from Zuo et al. JCPA 2020
+pair_style hybrid/overlay snap snap
+pair_coeff * * snap 1 &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapcoeff &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapparam Ni
+pair_coeff * * snap 2 &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapcoeff &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapparam Ni
+
+# scale according to mixing parameter
+variable l1 equal ${lambda}
+variable l2 equal 1.0-${lambda}
+fix scale1 all adapt 1 pair snap:1 scale * * v_l1
+fix scale2 all adapt 1 pair snap:2 scale * * v_l2
+
+# Setup output
+thermo          1
+thermo_modify norm yes
+
+# Set up NVE run
+timestep 0.5e-3
+neighbor 1.0 bin
+neigh_modify every 1 delay 0 check yes
+
+# Run MD
+velocity all create 300.0 4928459 loop geom
+fix 1 all nve
+
+if "$w > 0"	then "run $w"
+run		${rr}
diff --git a/src/INTEL/TEST/run_benchmarks.sh b/src/INTEL/TEST/run_benchmarks.sh
index 82eb51c928..eeb9f07a11 100755
--- a/src/INTEL/TEST/run_benchmarks.sh
+++ b/src/INTEL/TEST/run_benchmarks.sh
@@ -35,7 +35,7 @@ export I_MPI_PIN_DOMAIN=core
 # End settings for your system
 #########################################################################
 
-export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff"
+export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff snap"
 export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
 export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"
 
diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h
index 37842621dc..9f31580dd2 100644
--- a/src/INTEL/intel_simd.h
+++ b/src/INTEL/intel_simd.h
@@ -46,13 +46,38 @@ namespace ip_simd {
 
   typedef __mmask16 SIMD_mask;
 
+  inline bool any(const SIMD_mask &m) { return m != 0; }
+
   struct SIMD_int {
     __m512i v;
     SIMD_int() {}
     SIMD_int(const __m512i in) : v(in) {}
+    inline int & operator[](const int i) { return ((int *)&(v))[i]; }
+    inline const int & operator[](const int i) const
+      { return ((int *)&(v))[i]; }
     operator __m512i() const { return v;}
   };
 
+  struct SIMD256_int {
+    __m256i v;
+    SIMD256_int() {}
+    SIMD256_int(const __m256i in) : v(in) {}
+    SIMD256_int(const int in) : v(_mm256_set1_epi32(in)) {}
+    inline int & operator[](const int i) { return ((int *)&(v))[i]; }
+    inline const int & operator[](const int i) const
+      { return ((int *)&(v))[i]; }
+#ifdef __INTEL_LLVM_COMPILER
+    inline SIMD256_int operator&=(const int i)
+      { v=_mm256_and_epi32(v, _mm256_set1_epi32(i)); return *this; };
+#else
+    inline SIMD256_int operator&=(const int i)
+      { v=_mm256_and_si256(v, _mm256_set1_epi32(i)); return *this; };
+#endif
+    inline SIMD256_int operator+=(const int i)
+      { v=_mm256_add_epi32(v, _mm256_set1_epi32(i)); return *this; };
+    operator __m256i() const { return v;}
+  };
+
   struct SIMD_float {
     __m512 v;
     SIMD_float() {}
@@ -64,7 +89,24 @@ namespace ip_simd {
     __m512d v;
     SIMD_double() {}
     SIMD_double(const __m512d in) : v(in) {}
+    SIMD_double(const double in) { v=_mm512_set1_pd(in); }
+    inline double & operator[](const int i) { return ((double *)&(v))[i]; }
+    inline const double & operator[](const int i) const
+      { return ((double *)&(v))[i]; }
     operator __m512d() const { return v;}
+
+    SIMD_double & operator=(const double i)
+      { _mm512_set1_pd(i); return *this; }
+    SIMD_double &operator=(const SIMD_double &i)
+      { v = i.v; return *this; }
+
+    SIMD_double operator-() { return _mm512_xor_pd(v, _mm512_set1_pd(-0.0)); }
+    SIMD_double & operator+=(const SIMD_double & two)
+      { v = _mm512_add_pd(v, two.v); return *this; }
+    SIMD_double & operator-=(const SIMD_double & two)
+      { v = _mm512_sub_pd(v, two.v); return *this; }
+    SIMD_double & operator*=(const SIMD_double & two)
+      { v = _mm512_mul_pd(v, two.v); return *this; }
   };
 
   template<class flt_t>
@@ -99,6 +141,12 @@ namespace ip_simd {
 
   // ------- Set Operations
 
+  inline SIMD256_int SIMD256_set(const int l0, const int l1, const int l2,
+                                 const int l3, const int l4, const int l5,
+                                 const int l6, const int l7) {
+    return _mm256_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7);
+  }
+
   inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
                            const int l3, const int l4, const int l5,
                            const int l6, const int l7, const int l8,
@@ -109,6 +157,10 @@ namespace ip_simd {
                              l8,l9,l10,l11,l12,l13,l14,l15);
   }
 
+  inline SIMD256_int SIMD256_set(const int l) {
+    return _mm256_set1_epi32(l);
+  }
+
   inline SIMD_int SIMD_set(const int l) {
     return _mm512_set1_epi32(l);
   }
@@ -121,6 +173,10 @@ namespace ip_simd {
     return _mm512_set1_pd(l);
   }
 
+  inline SIMD256_int SIMD256_count() {
+    return SIMD256_set(0,1,2,3,4,5,6,7);
+  }
+
   inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
     return _mm512_maskz_mov_epi32(m, one);
   }
@@ -147,6 +203,10 @@ namespace ip_simd {
 
   // -------- Load Operations
 
+  inline SIMD256_int SIMD_load(const SIMD256_int *p) {
+    return _mm256_load_epi32((int *)p);
+  }
+
   inline SIMD_int SIMD_load(const int *p) {
     return _mm512_load_epi32(p);
   }
@@ -159,6 +219,10 @@ namespace ip_simd {
     return _mm512_load_pd(p);
   }
 
+  inline SIMD_double SIMD_load(const SIMD_double *p) {
+    return _mm512_load_pd((double *)p);
+  }
+
   inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
     return _mm512_maskz_load_epi32(m, p);
   }
@@ -171,6 +235,10 @@ namespace ip_simd {
     return _mm512_maskz_load_pd(m, p);
   }
 
+  inline SIMD256_int SIMD_gather(const int *p, const SIMD256_int &i) {
+    return _mm256_i32gather_epi32(p, i, _MM_SCALE_4);
+  }
+
   inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) {
     return _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
   }
@@ -179,6 +247,10 @@ namespace ip_simd {
     return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
   }
 
+  inline SIMD_double SIMD_gather(const double *p, const SIMD256_int &i) {
+    return _mm512_i32gather_pd(i, p, _MM_SCALE_8);
+  }
+
   inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
     return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
   }
@@ -201,6 +273,12 @@ namespace ip_simd {
                                     _mm512_castsi512_si256(i), p, _MM_SCALE_8);
   }
 
+  inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
+                                 const SIMD256_int &i) {
+    return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    i, p, _MM_SCALE_8);
+  }
+
   template <typename T>
   inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
                                       const SIMD_int &i) {
@@ -252,6 +330,15 @@ namespace ip_simd {
     return _mm512_store_pd(p,one);
   }
 
+  inline void SIMD_store(SIMD_double *p, const SIMD_double &one) {
+    return _mm512_store_pd((double *)p,one);
+  }
+
+  inline void SIMD_scatter(const SIMD_mask &m, int *p,
+                           const SIMD256_int &i, const SIMD256_int &vec) {
+    _mm256_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
+  }
+
   inline void SIMD_scatter(const SIMD_mask &m, int *p,
                            const SIMD_int &i, const SIMD_int &vec) {
     _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
@@ -268,8 +355,22 @@ namespace ip_simd {
                               _MM_SCALE_8);
   }
 
+  inline void SIMD_scatter(const SIMD_mask &m, double *p,
+                           const SIMD256_int &i, const SIMD_double &vec) {
+    _mm512_mask_i32scatter_pd(p, m, i, vec, _MM_SCALE_8);
+  }
+
+  inline void SIMD_scatter(double *p,
+                           const SIMD256_int &i, const SIMD_double &vec) {
+    _mm512_i32scatter_pd(p, i, vec, _MM_SCALE_8);
+  }
+
   // ------- Arithmetic Operations
 
+  inline SIMD256_int operator+(const SIMD256_int &one, const SIMD256_int &two) {
+    return _mm256_add_epi32(one,two);
+  }
+
   inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_add_epi32(one,two);
   }
@@ -286,6 +387,10 @@ namespace ip_simd {
     return _mm512_add_epi32(one,SIMD_set(two));
   }
 
+  inline SIMD256_int operator+(const SIMD256_int &one, const int two) {
+    return _mm256_add_epi32(one,SIMD256_set(two));
+  }
+
   inline SIMD_float operator+(const SIMD_float &one, const float two) {
     return _mm512_add_ps(one,SIMD_set(two));
   }
@@ -299,6 +404,11 @@ namespace ip_simd {
     return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
   }
 
+  inline SIMD256_int SIMD_add(const SIMD_mask &m,
+                           const SIMD256_int &one, const int two) {
+    return _mm256_mask_add_epi32(one,m,one,SIMD256_set(two));
+  }
+
   inline SIMD_float SIMD_add(const SIMD_mask &m,
                              const SIMD_float &one, const float two) {
     return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
@@ -309,6 +419,11 @@ namespace ip_simd {
     return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
   }
 
+  inline SIMD_double SIMD_add(const SIMD_mask &m,
+                              const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_mask_add_pd(one,m,one,two);
+  }
+
   inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
                            const SIMD_int &one, const SIMD_int &two) {
     return _mm512_mask_add_epi32(s,m,one,two);
@@ -387,6 +502,10 @@ namespace ip_simd {
     return _mm512_mul_pd(one,two);
   }
 
+  inline SIMD256_int operator*(const SIMD256_int &one, const int two) {
+    return _mm256_mullo_epi32(one,SIMD256_set(two));
+  }
+
   inline SIMD_int operator*(const SIMD_int &one, const int two) {
     return _mm512_mullo_epi32(one,SIMD_set(two));
   }
@@ -417,6 +536,12 @@ namespace ip_simd {
     return _mm512_fmadd_pd(one,two,three);
   }
 
+  inline SIMD_double SIMD_fma(const SIMD_mask m, const SIMD_double &one,
+                              const SIMD_double &two,
+                              const SIMD_double &three) {
+    return _mm512_mask3_fmadd_pd(one,two,three,m);
+  }
+
   inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
                              const SIMD_float &three) {
     return _mm512_fmsub_ps(one,two,three);
@@ -493,6 +618,10 @@ namespace ip_simd {
     return _mm512_pow_pd(one, two);
   }
 
+  inline SIMD_double SIMD_pow(const SIMD_double &one, const double two) {
+    return _mm512_pow_pd(one, SIMD_set(two));
+  }
+
   inline SIMD_float SIMD_exp(const SIMD_float &one) {
     return _mm512_exp_ps(one);
   }
@@ -501,6 +630,18 @@ namespace ip_simd {
     return _mm512_exp_pd(one);
   }
 
+  inline SIMD_double SIMD_cos(const SIMD_double &one) {
+    return _mm512_cos_pd(one);
+  }
+
+  inline SIMD_double SIMD_sin(const SIMD_double &one) {
+    return _mm512_sin_pd(one);
+  }
+
+  inline SIMD_double SIMD_tan(const SIMD_double &one) {
+    return _mm512_tan_pd(one);
+  }
+
   // ------- Comparison operations
 
   inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
@@ -533,6 +674,14 @@ namespace ip_simd {
     return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
   }
 
+  inline SIMD_mask operator<(const SIMD256_int &one, const SIMD256_int &two) {
+    return _mm256_cmplt_epi32_mask(one,two);
+  }
+
+  inline SIMD_mask operator<(const int one, const SIMD256_int &two) {
+    return _mm256_cmplt_epi32_mask(SIMD256_set(one),two);
+  }
+
   inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_cmplt_epi32_mask(one,two);
   }
@@ -577,6 +726,10 @@ namespace ip_simd {
     return _mm512_cmple_ps_mask(SIMD_set(one), two);
   }
 
+  inline SIMD_mask operator<=(const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_cmple_pd_mask(one, two);
+  }
+
   inline SIMD_mask operator<=(const double one, const SIMD_double &two) {
     return _mm512_cmple_pd_mask(SIMD_set(one), two);
   }
@@ -593,6 +746,14 @@ namespace ip_simd {
     return _mm512_cmplt_pd_mask(two,one);
   }
 
+  inline SIMD_mask operator>(const SIMD_double &one, const double two) {
+    return _mm512_cmplt_pd_mask(SIMD_set(two),one);
+  }
+
+  inline SIMD_mask operator==(const SIMD256_int &one, const int two) {
+    return _mm256_cmpeq_epi32_mask(one,_mm256_set1_epi32(two));
+  }
+
   inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_cmpeq_epi32_mask(one,two);
   }
diff --git a/src/INTEL/pair_snap_intel.cpp b/src/INTEL/pair_snap_intel.cpp
new file mode 100644
index 0000000000..d91f0adc36
--- /dev/null
+++ b/src/INTEL/pair_snap_intel.cpp
@@ -0,0 +1,779 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#include "pair_snap_intel.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "sna_intel.h"
+#include "tokenizer.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace ip_simd;
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+/* ---------------------------------------------------------------------- */
+
+PairSNAPIntel::PairSNAPIntel(LAMMPS *lmp) : Pair(lmp)
+{
+  single_enable = 0;
+  restartinfo = 0;
+  one_coeff = 1;
+  manybody_flag = 1;
+  centroidstressflag = CENTROID_NOTAVAIL;
+
+  radelem = nullptr;
+  wjelem = nullptr;
+  coeffelem = nullptr;
+  sinnerelem = nullptr;
+  dinnerelem = nullptr;
+
+  beta = nullptr;
+  bispectrum = nullptr;
+  snaptr = nullptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairSNAPIntel::~PairSNAPIntel()
+{
+  if (copymode) return;
+
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(coeffelem);
+  memory->destroy(sinnerelem);
+  memory->destroy(dinnerelem);
+
+  memory->destroy(beta);
+  memory->destroy(bispectrum);
+
+  delete snaptr;
+
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+    memory->destroy(scale);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   This version is a straightforward implementation
+   ---------------------------------------------------------------------- */
+
+void PairSNAPIntel::compute(int eflag, int vflag)
+{
+  SNA_DVEC fij[3];
+  int *jlist,*numneigh,**firstneigh;
+
+  ev_init(eflag,vflag);
+  int tally_xyz = 0;
+  if (vflag_atom || (vflag && !vflag_fdotr)) tally_xyz = 1;
+
+  double **x = atom->x;
+  double *_x = atom->x[0];
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+
+  // compute dE_i/dB_i = beta_i for all i in list
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  SNA_DVEC sevdwl(0);
+
+  const int vw = snaptr->vector_width();
+  for (int ii = 0; ii < list->inum; ii+=vw) {
+    SNA_IVEC i, jnum;
+    int max_jnum = 0;
+    for (int l = 0; l < vw; l++) {
+      if (ii + l < list->inum) {
+        i[l] = list->ilist[ii + l];
+        jnum[l] = numneigh[i[l]];
+      } else {
+        i[l] = list->ilist[0];
+        jnum[l] = 0;
+      }
+      if (jnum[l] > max_jnum) max_jnum = jnum[l];
+    }
+
+    // ensure rij, inside, wj, and rcutij are of size jnum
+
+    snaptr->grow_rij(max_jnum);
+
+    SNA_IVEC zero_vec(0);
+
+    const SNA_DVEC xtmp = SIMD_gather(_x, i * 3);
+    const SNA_DVEC ytmp = SIMD_gather(_x, i * 3 + 1);
+    const SNA_DVEC ztmp = SIMD_gather(_x, i * 3 + 2);
+    const SNA_IVEC itype = SIMD_gather(type, i);
+    const SNA_IVEC ielem = SIMD_gather(map, itype);
+    const SNA_DVEC radi = SIMD_gather(radelem, ielem);
+
+    // rij[][3] = displacements between atom I and those neighbors
+    // inside = indices of neighbors of I within cutoff
+    // wj = weights for neighbors of I within cutoff
+    // rcutij = cutoffs for neighbors of I within cutoff
+    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+    SNA_IVEC ninside(0);
+    for (int jj = 0; jj < max_jnum; jj++) {
+      SIMD_mask m(SIMD256_set(jj) < jnum);
+
+      SNA_IVEC j;
+      SV_for (int l = 0; l < vw; l++) {
+        jlist = firstneigh[i[l]];
+        if (jj < jnum[l]) j[l] = jlist[jj];
+        else j[l] = 0;
+      }
+      j &= NEIGHMASK;
+
+      const SNA_DVEC delx = SIMD_gather(m, _x, j * 3) - xtmp;
+      const SNA_DVEC dely = SIMD_gather(m, _x, j * 3 + 1) - ytmp;
+      const SNA_DVEC delz = SIMD_gather(m, _x, j * 3 + 2) - ztmp;
+      const SNA_IVEC jtype = SIMD_gather(type, j);
+      const SNA_DVEC rsq = delx*delx + dely*dely + delz*delz;
+      const SNA_DVEC vcut = SIMD_gather(m, cutsq[0],
+                                        itype * (atom->ntypes + 1) + jtype);
+
+      m &= rsq < vcut;
+      m &= rsq > SIMD_set(1e-20);
+      const SNA_IVEC jelem = SIMD_gather(map, jtype);
+      const SNA_IVEC ni3 = ninside * vw * 3 + SIMD256_count();
+      SIMD_scatter(m, (double *)(snaptr->rij[0]), ni3, delx);
+      SIMD_scatter(m, (double *)(snaptr->rij[0] + 1), ni3, dely);
+      SIMD_scatter(m, (double *)(snaptr->rij[0] + 2), ni3, delz);
+      const SNA_IVEC ni = ninside * vw + SIMD256_count();
+      SIMD_scatter(m, (int *)(snaptr->inside), ni, j);
+      SIMD_scatter(m, (double *)(snaptr->wj), ni,
+                   SIMD_gather(m, wjelem, jelem));
+      SIMD_scatter(m, (double *)(snaptr->rcutij), ni,
+                   (radi + SIMD_gather(m, radelem, jelem)) * rcutfac);
+      if (switchinnerflag) {
+        SIMD_scatter(m, (double *)(snaptr->sinnerij), ni,
+                     (SIMD_gather(m, sinnerelem, ielem) +
+                      SIMD_gather(m, sinnerelem, jelem)) * 0.5);
+        SIMD_scatter(m, (double *)(snaptr->dinnerij), ni,
+                     (SIMD_gather(m, dinnerelem, ielem) +
+                      SIMD_gather(m, dinnerelem, jelem)) * 0.5);
+      }
+      if (chemflag)
+        SIMD_scatter(m, (int *)(snaptr->element), ni, jelem);
+      ninside = SIMD_add(m, ninside, 1);
+    } // for jj
+
+    // compute Ui, Yi for atom I
+
+    if (chemflag)
+      snaptr->compute_ui(ninside, ielem, max_jnum);
+    else
+      snaptr->compute_ui(ninside, zero_vec, max_jnum);
+
+    // Compute bispectrum
+    if (quadraticflag || eflag) {
+      snaptr->compute_zi_or_yi<0>(beta);
+      if (chemflag)
+        snaptr->compute_bi(ielem);
+      else
+        snaptr->compute_bi(zero_vec);
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        SIMD_store(bispectrum + icoeff, SIMD_load(snaptr->blist + icoeff));
+    }
+
+    // Compute beta
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      SIMD_store(beta + icoeff, SIMD_gather(coeffelem[0],
+                                            ielem * ncoeffall + icoeff + 1));
+
+    if (quadraticflag) {
+      int k = ncoeff+1;
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+        SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
+        SNA_DVEC beta_i = SIMD_load(beta + icoeff) +
+          SIMD_gather(coeffelem[0], ielem * ncoeffall + k) * bveci;
+        k++;
+        for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+          const SNA_DVEC ci = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+          beta_i = beta_i + ci * SIMD_load(bispectrum + jcoeff);
+          SIMD_store(beta + jcoeff, ci * bveci + SIMD_load(beta + jcoeff));
+          k++;
+        }
+        SIMD_store(beta + icoeff, beta_i);
+      }
+    }
+
+    // for neighbors of I within cutoff:
+    // compute Fij = dEi/dRj = -dEi/dRi
+    // add to Fi, subtract from Fj
+    // scaling is that for type I
+
+    if (quadraticflag || eflag)
+      snaptr->compute_yi_from_zi(beta);
+    else
+      snaptr->compute_zi_or_yi<1>(beta);
+
+    SNA_DVEC fi_x(0.0), fi_y(0.0), fi_z(0.0);
+    SNA_DVEC scalev = SIMD_gather(scale[0], itype * (atom->ntypes+1) + itype);
+    for (int jj = 0; jj < max_jnum; jj++) {
+      snaptr->compute_duidrj(jj, ninside);
+      if (chemflag && nelements > 1)
+        snaptr->compute_deidrj_e(jj, ninside, fij);
+      else
+        snaptr->compute_deidrj(jj, ninside, fij);
+
+      SNA_DVEC fijs_x = fij[0] * scalev;
+      SNA_DVEC fijs_y = fij[1] * scalev;
+      SNA_DVEC fijs_z = fij[2] * scalev;
+
+      fi_x += fijs_x;
+      fi_y += fijs_y;
+      fi_z += fijs_z;
+
+      for (int l = 0; l < vw; l++) {
+        if (jj < ninside[l]) {
+          int j = snaptr->inside[jj][l];
+          f[j][0] -= fijs_x[l];
+          f[j][1] -= fijs_y[l];
+          f[j][2] -= fijs_z[l];
+
+          if (tally_xyz)
+            ev_tally_xyz(i[l],j,nlocal,newton_pair,0.0,0.0,
+                         fij[0][l],fij[1][l],fij[2][l],
+                         -snaptr->rij[jj][0][l],-snaptr->rij[jj][1][l],
+                         -snaptr->rij[jj][2][l]);
+        }
+      } // for l
+    } // for jj
+    SIMD_mask m((SIMD256_count() + ii) < list->inum);
+    SNA_DVEC fix = SIMD_gather(m, f[0], i * 3) +  fi_x;
+    SIMD_scatter(m, f[0], i * 3, fix);
+    SNA_DVEC fiy = SIMD_gather(m, f[0], i * 3 + 1) +  fi_y;
+    SIMD_scatter(m, f[0], i * 3 + 1, fiy);
+    SNA_DVEC fiz = SIMD_gather(m, f[0], i * 3 + 2) +  fi_z;
+    SIMD_scatter(m, f[0], i * 3 + 2, fiz);
+
+    // tally energy contribution
+
+    if (eflag) {
+      SNA_DVEC evdwl = SIMD_gather(coeffelem[0], ielem * ncoeffall);
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        evdwl += SIMD_gather(coeffelem[0], ielem * ncoeffall + icoeff +1) *
+          bispectrum[icoeff];
+
+      if (quadraticflag) {
+        int k = ncoeff+1;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
+          SNA_DVEC c = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+          k++;
+          evdwl += c * 0.5 * bveci * bveci;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            SNA_DVEC bvecj = SIMD_load(bispectrum + jcoeff);
+            SNA_DVEC cj = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+            k++;
+            evdwl += cj * bveci * bvecj;
+          }
+        }
+      }
+      sevdwl += scalev * evdwl;
+      if (eatom) {
+        SNA_DVEC ea = SIMD_gather(m, eatom, i) + scalev * evdwl;
+        SIMD_scatter(m, eatom, i, ea);
+      }
+    } // if (eflag)
+  } // for ii
+  if (eflag) eng_vdwl += SIMD_sum(sevdwl);
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+  memory->create(scale,n+1,n+1,"pair:scale");
+  map = new int[n+1];
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::settings(int narg, char ** /* arg */)
+{
+  if (narg > 0)
+    error->all(FLERR,"Illegal pair_style command");
+  if ((comm->me == 0) && (comm->nthreads > 1))
+    error->warning(FLERR, "Pair style snap/intel does not use OpenMP threads");
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+  if (narg != 4 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients");
+
+  map_element2type(narg-4,arg+4);
+
+  // read snapcoeff and snapparam files
+
+  read_files(arg[2],arg[3]);
+
+  if (!quadraticflag)
+    ncoeff = ncoeffall - 1;
+  else {
+
+    // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2
+    // so, ncoeff = floor(sqrt(2*ncoeffall))-1
+
+    ncoeff = sqrt(2.0*ncoeffall)-1;
+    ncoeffq = (ncoeff*(ncoeff+1))/2;
+    int ntmp = 1+ncoeff+ncoeffq;
+    if (ntmp != ncoeffall) {
+      error->all(FLERR,"Incorrect SNAP coeff file");
+    }
+  }
+
+  snaptr = new SNAIntel(lmp, rfac0, twojmax,
+                        rmin0, switchflag, bzeroflag,
+                        chemflag, bnormflag, wselfallflag,
+                        nelements, switchinnerflag);
+
+  if (ncoeff != snaptr->ncoeff) {
+    if (comm->me == 0)
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
+    error->all(FLERR,"Incorrect SNAP parameter file");
+  }
+
+  // Calculate maximum cutoff for all elements
+  rcutmax = 0.0;
+  for (int ielem = 0; ielem < nelements; ielem++)
+    rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax);
+
+  // set default scaling
+  int n = atom->ntypes;
+  for (int ii = 0; ii < n+1; ii++)
+    for (int jj = 0; jj < n+1; jj++)
+      scale[ii][jj] = 1.0;
+
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::init_style()
+{
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style SNAP requires newton pair on");
+
+  // need a full neighbor list
+
+  neighbor->add_request(this, NeighConst::REQ_FULL);
+
+  snaptr->init();
+
+  fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
+  if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
+
+  fix->pair_init_check();
+
+  memory->create(bispectrum,ncoeff,"PairSNAP:bispectrum");
+  memory->create(beta,ncoeff,"PairSNAP:beta");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSNAPIntel::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  scale[j][i] = scale[i][j];
+  return (radelem[map[i]] +
+          radelem[map[j]])*rcutfac;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSNAPIntel::read_files(char *coefffilename, char *paramfilename)
+{
+
+  // open SNAP coefficient file on proc 0
+
+  FILE *fpcoeff;
+  if (comm->me == 0) {
+    fpcoeff = utils::open_potential(coefffilename,lmp,nullptr);
+    if (fpcoeff == nullptr)
+      error->one(FLERR,"Cannot open SNAP coefficient file {}: ",
+                                   coefffilename, utils::getsyserror());
+  }
+
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+  int nwords = 0;
+  while (nwords == 0) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpcoeff);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    nwords = utils::count_words(utils::trim_comment(line));
+  }
+  if (nwords != 2)
+    error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+  // strip single and double quotes from words
+
+  int nelemtmp = 0;
+  try {
+    ValueTokenizer words(utils::trim_comment(line),"\"' \t\n\r\f");
+    nelemtmp = words.next_int();
+    ncoeffall = words.next_int();
+  } catch (TokenizerException &e) {
+    error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
+  }
+
+  // clean out old arrays and set up element lists
+
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(coeffelem);
+  memory->destroy(sinnerelem);
+  memory->destroy(dinnerelem);
+  memory->create(radelem,nelements,"pair:radelem");
+  memory->create(wjelem,nelements,"pair:wjelem");
+  memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
+  memory->create(sinnerelem,nelements,"pair:sinnerelem");
+  memory->create(dinnerelem,nelements,"pair:dinnerelem");
+
+  // initialize checklist for all required nelements
+
+  int *elementflags = new int[nelements];
+  for (int jelem = 0; jelem < nelements; jelem++)
+      elementflags[jelem] = 0;
+
+  // loop over nelemtmp blocks in the SNAP coefficient file
+
+  for (int ielem = 0; ielem < nelemtmp; ielem++) {
+
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpcoeff);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    std::vector<std::string> words;
+    try {
+      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
+    } catch (TokenizerException &) {
+      // ignore
+    }
+    if (words.size() != 3)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+    int jelem;
+    for (jelem = 0; jelem < nelements; jelem++)
+      if (words[0] == elements[jelem]) break;
+
+    // if this element not needed, skip this block
+
+    if (jelem == nelements) {
+      if (comm->me == 0) {
+        for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
+          ptr = fgets(line,MAXLINE,fpcoeff);
+          if (ptr == nullptr) {
+            eof = 1;
+            fclose(fpcoeff);
+          }
+        }
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+      continue;
+    }
+
+    if (elementflags[jelem] == 1)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+    else
+      elementflags[jelem] = 1;
+
+    radelem[jelem] = utils::numeric(FLERR,words[1],false,lmp);
+    wjelem[jelem] = utils::numeric(FLERR,words[2],false,lmp);
+
+    if (comm->me == 0)
+      utils::logmesg(lmp,"SNAP Element = {}, Radius {}, Weight {}\n",
+                     elements[jelem], radelem[jelem], wjelem[jelem]);
+
+    for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
+      if (comm->me == 0) {
+        ptr = fgets(line,MAXLINE,fpcoeff);
+        if (ptr == nullptr) {
+          eof = 1;
+          fclose(fpcoeff);
+        }
+      }
+
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+      MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+      try {
+        ValueTokenizer coeff(utils::trim_comment(line));
+        if (coeff.count() != 1)
+          error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+        coeffelem[jelem][icoeff] = coeff.next_double();
+      } catch (TokenizerException &e) {
+        error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
+      }
+    }
+  }
+
+  if (comm->me == 0) fclose(fpcoeff);
+
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    if (elementflags[jelem] == 0)
+      error->all(FLERR,"Element {} not found in SNAP coefficient file", elements[jelem]);
+  }
+  delete[] elementflags;
+
+  // set flags for required keywords
+
+  rcutfacflag = 0;
+  twojmaxflag = 0;
+
+  // Set defaults for optional keywords
+
+  rfac0 = 0.99363;
+  rmin0 = 0.0;
+  switchflag = 1;
+  bzeroflag = 1;
+  quadraticflag = 0;
+  chemflag = 0;
+  bnormflag = 0;
+  wselfallflag = 0;
+  switchinnerflag = 0;
+  chunksize = 32768;
+  parallel_thresh = 8192;
+
+  // set local input checks
+
+  int sinnerflag = 0;
+  int dinnerflag = 0;
+
+  // open SNAP parameter file on proc 0
+
+  FILE *fpparam;
+  if (comm->me == 0) {
+    fpparam = utils::open_potential(paramfilename,lmp,nullptr);
+    if (fpparam == nullptr)
+      error->one(FLERR,"Cannot open SNAP parameter file {}: {}",
+                                   paramfilename, utils::getsyserror());
+  }
+
+  eof = 0;
+  while (true) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpparam);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpparam);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    // words = ptrs to all words in line
+    // strip single and double quotes from words
+
+    std::vector<std::string> words;
+    try {
+      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
+    } catch (TokenizerException &) {
+      // ignore
+    }
+
+    if (words.size() == 0) continue;
+
+    if (words.size() < 2)
+      error->all(FLERR,"Incorrect format in SNAP parameter file");
+
+    auto keywd = words[0];
+    auto keyval = words[1];
+
+    // check for keywords with more than one value per element
+
+    if (keywd == "sinner" || keywd == "dinner") {
+
+      if ((int)words.size() != nelements+1)
+        error->all(FLERR,"Incorrect SNAP parameter file");
+
+      // innerlogstr collects all values of sinner or dinner for log output below
+
+      std::string innerlogstr;
+
+      int iword = 1;
+
+      if (keywd == "sinner") {
+        for (int ielem = 0; ielem < nelements; ielem++) {
+          keyval = words[iword];
+          sinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
+          iword++;
+          innerlogstr += keyval + " ";
+        }
+        sinnerflag = 1;
+      } else if (keywd == "dinner") {
+        for (int ielem = 0; ielem < nelements; ielem++) {
+          keyval = words[iword];
+          dinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
+          iword++;
+          innerlogstr += keyval + " ";
+        }
+        dinnerflag = 1;
+      }
+
+      if (comm->me == 0)
+        utils::logmesg(lmp,"SNAP keyword {} {} ... \n", keywd, innerlogstr);
+
+    } else {
+
+      // all other keywords take one value
+
+      if (nwords != 2)
+        error->all(FLERR,"Incorrect SNAP parameter file");
+
+      if (comm->me == 0)
+        utils::logmesg(lmp,"SNAP keyword {} {}\n",keywd,keyval);
+
+      if (keywd == "rcutfac") {
+        rcutfac = utils::numeric(FLERR,keyval,false,lmp);
+        rcutfacflag = 1;
+      } else if (keywd == "twojmax") {
+        twojmax = utils::inumeric(FLERR,keyval,false,lmp);
+        twojmaxflag = 1;
+      } else if (keywd == "rfac0")
+        rfac0 = utils::numeric(FLERR,keyval,false,lmp);
+      else if (keywd == "rmin0")
+        rmin0 = utils::numeric(FLERR,keyval,false,lmp);
+      else if (keywd == "switchflag")
+        switchflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "bzeroflag")
+        bzeroflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "quadraticflag")
+        quadraticflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "chemflag")
+        chemflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "bnormflag")
+        bnormflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "wselfallflag")
+        wselfallflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "switchinnerflag")
+        switchinnerflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "chunksize")
+        chunksize = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "parallelthresh")
+        parallel_thresh = utils::inumeric(FLERR,keyval,false,lmp);
+      else
+        error->all(FLERR,"Unknown parameter '{}' in SNAP parameter file", keywd);
+    }
+  }
+
+  if (rcutfacflag == 0 || twojmaxflag == 0)
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (chemflag && nelemtmp != nelements)
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (switchinnerflag && !(sinnerflag && dinnerflag))
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (!switchinnerflag && (sinnerflag || dinnerflag))
+    error->all(FLERR,"Incorrect SNAP parameter file");
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double PairSNAPIntel::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+
+  int n = atom->ntypes+1;
+  bytes += (double)n*n*sizeof(int);         // setflag
+  bytes += (double)n*n*sizeof(double);      // cutsq
+  bytes += (double)n*n*sizeof(double);      // scale
+  bytes += (double)n*sizeof(int);           // map
+  bytes += (double)ncoeff*sizeof(SNA_DVEC); // bispectrum
+  bytes += (double)ncoeff*sizeof(SNA_DVEC); // beta
+
+  bytes += snaptr->memory_usage(); // SNA object
+
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *PairSNAPIntel::extract(const char *str, int &dim)
+{
+  dim = 2;
+  if (strcmp(str,"scale") == 0) return (void *) scale;
+  return nullptr;
+}
+
+#endif
+#endif
diff --git a/src/INTEL/pair_snap_intel.h b/src/INTEL/pair_snap_intel.h
new file mode 100644
index 0000000000..2dc758f244
--- /dev/null
+++ b/src/INTEL/pair_snap_intel.h
@@ -0,0 +1,83 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(snap/intel,PairSNAPIntel);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_SNAP_INTEL_H
+#define LMP_PAIR_SNAP_INTEL_H
+
+#include "fix_intel.h"
+#include "pair.h"
+
+namespace ip_simd { class SIMD_double; class SIMD_int; };
+#define SNA_DVEC ip_simd::SIMD_double
+#define SNA_IVEC ip_simd::SIMD256_int
+
+namespace LAMMPS_NS {
+
+class PairSNAPIntel : public Pair {
+ public:
+  PairSNAPIntel(class LAMMPS *);
+  ~PairSNAPIntel() override;
+  void compute(int, int) override;
+  void settings(int, char **) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+  double memory_usage() override;
+  void *extract(const char *, int &) override;
+
+  double rcutfac, quadraticflag;    // declared public to workaround gcc 4.9
+  int ncoeff;                       //  compiler bug, manifest in KOKKOS package
+
+ protected:
+  FixIntel *fix;
+
+  int ncoeffq, ncoeffall;
+  class SNAIntel *snaptr;
+  virtual void allocate();
+  void read_files(char *, char *);
+  inline int equal(double *x, double *y);
+  inline double dist2(double *x, double *y);
+
+  double rcutmax;         // max cutoff for all elements
+  double *radelem;        // element radii
+  double *wjelem;         // elements weights
+  double **coeffelem;     // element bispectrum coefficients
+  SNA_DVEC *beta;          // betas for all atoms in list
+  SNA_DVEC *bispectrum;    // bispectrum components for all atoms in list
+  double **scale;         // for thermodynamic integration
+  int twojmax, switchflag, bzeroflag, bnormflag;
+  int chemflag, wselfallflag;
+  int switchinnerflag;    // inner cutoff switch
+  double *sinnerelem;     // element inner cutoff midpoint
+  double *dinnerelem;     // element inner cutoff half-width
+  int chunksize, parallel_thresh;
+  double rfac0, rmin0, wj1, wj2;
+  int rcutfacflag, twojmaxflag;    // flags for required parameters
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+#endif
+#endif
diff --git a/src/INTEL/sna_intel.cpp b/src/INTEL/sna_intel.cpp
new file mode 100644
index 0000000000..b83c90688d
--- /dev/null
+++ b/src/INTEL/sna_intel.cpp
@@ -0,0 +1,1505 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown, Intel
+------------------------------------------------------------------------- */
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#include "sna_intel.h"
+
+#include "comm.h"
+#include "error.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+
+#include <cmath>
+
+using namespace std;
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+using namespace ip_simd;
+
+/* ----------------------------------------------------------------------
+
+   this implementation is based on the method outlined
+   in Bartok[1], using formulae from VMK[2].
+
+   for the Clebsch-Gordan coefficients, we
+   convert the VMK half-integral labels
+   a, b, c, alpha, beta, gamma
+   to array offsets j1, j2, j, m1, m2, m
+   using the following relations:
+
+   j1 = 2*a
+   j2 = 2*b
+   j =  2*c
+
+   m1 = alpha+a      2*alpha = 2*m1 - j1
+   m2 = beta+b    or 2*beta = 2*m2 - j2
+   m =  gamma+c      2*gamma = 2*m - j
+
+   in this way:
+
+   -a <= alpha <= a
+   -b <= beta <= b
+   -c <= gamma <= c
+
+   becomes:
+
+   0 <= m1 <= j1
+   0 <= m2 <= j2
+   0 <= m <= j
+
+   and the requirement that
+   a+b+c be integral implies that
+   j1+j2+j must be even.
+   The requirement that:
+
+   gamma = alpha+beta
+
+   becomes:
+
+   2*m - j = 2*m1 - j1 + 2*m2 - j2
+
+   Similarly, for the Wigner U-functions U(J,m,m') we
+   convert the half-integral labels J,m,m' to
+   array offsets j,ma,mb:
+
+   j = 2*J
+   ma = J+m
+   mb = J+m'
+
+   so that:
+
+   0 <= j <= 2*Jmax
+   0 <= ma, mb <= j.
+
+   For the bispectrum components B(J1,J2,J) we convert to:
+
+   j1 = 2*J1
+   j2 = 2*J2
+   j = 2*J
+
+   and the requirement:
+
+   |J1-J2| <= J <= J1+J2, for j1+j2+j integral
+
+   becomes:
+
+   |j1-j2| <= j <= j1+j2, for j1+j2+j even integer
+
+   or
+
+   j = |j1-j2|, |j1-j2|+2,...,j1+j2-2,j1+j2
+
+   [1] Albert Bartok-Partay, "Gaussian Approximation..."
+   Doctoral Thesis, Cambridge University, (2009)
+
+   [2] D. A. Varshalovich, A. N. Moskalev, and V. K. Khersonskii,
+   "Quantum Theory of Angular Momentum," World Scientific (1988)
+
+------------------------------------------------------------------------- */
+
+SNAIntel::SNAIntel(LAMMPS* lmp, double rfac0_in, int twojmax_in,
+                   double rmin0_in, int switch_flag_in, int bzero_flag_in,
+                   int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in,
+                   int nelements_in, int switch_inner_flag_in) : Pointers(lmp)
+{
+  wself = 1.0;
+
+  rfac0 = rfac0_in;
+  rmin0 = rmin0_in;
+  switch_flag = switch_flag_in;
+  switch_inner_flag = switch_inner_flag_in;
+  bzero_flag = bzero_flag_in;
+  chem_flag = chem_flag_in;
+  bnorm_flag = bnorm_flag_in;
+  wselfall_flag = wselfall_flag_in;
+
+  if (bnorm_flag != chem_flag)
+    lmp->error->warning(FLERR, "bnormflag and chemflag are not equal."
+                        "This is probably not what you intended");
+
+  if (chem_flag)
+    nelements = nelements_in;
+  else
+    nelements = 1;
+
+  twojmax = twojmax_in;
+
+  compute_ncoeff();
+
+  rij = nullptr;
+  inside = nullptr;
+  wj = nullptr;
+  rcutij = nullptr;
+  sinnerij = nullptr;
+  dinnerij = nullptr;
+  element = nullptr;
+  nmax = 0;
+  idxz = nullptr;
+  idxb = nullptr;
+  ulist_r_ij = nullptr;
+  ulist_i_ij = nullptr;
+
+  build_indexlist();
+  create_twojmax_arrays();
+
+  if (bzero_flag) {
+    double www = wself*wself*wself;
+    for (int j = 0; j <= twojmax; j++)
+      if (bnorm_flag)
+        bzero[j] = www;
+      else
+        bzero[j] = www*(j+1);
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+SNAIntel::~SNAIntel()
+{
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->destroy(sinnerij);
+  memory->destroy(dinnerij);
+  if (chem_flag) memory->destroy(element);
+  memory->destroy(ulist_r_ij);
+  memory->destroy(ulist_i_ij);
+  delete[] idxz;
+  delete[] idxb;
+  destroy_twojmax_arrays();
+}
+
+void SNAIntel::build_indexlist()
+{
+
+  // index list for cglist
+
+  int jdim = twojmax + 1;
+  memory->create(idxcg_block, jdim, jdim, jdim,
+                 "sna:idxcg_block");
+
+  int idxcg_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxcg_block[j1][j2][j] = idxcg_count;
+        for (int m1 = 0; m1 <= j1; m1++)
+          for (int m2 = 0; m2 <= j2; m2++)
+            idxcg_count++;
+      }
+  idxcg_max = idxcg_count;
+
+  // index list for uarray
+  // need to include both halves
+
+  memory->create(idxu_block, jdim,
+                 "sna:idxu_block");
+
+  int idxu_count = 0;
+
+  for (int j = 0; j <= twojmax; j++) {
+    idxu_block[j] = idxu_count;
+    for (int mb = 0; mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++)
+        idxu_count++;
+  }
+  idxu_max = idxu_count;
+
+  // index list for beta and B
+
+  int idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) idxb_count++;
+
+  idxb_max = idxb_count;
+  idxb = new SNA_BINDICES[idxb_max];
+
+  idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) {
+          idxb[idxb_count].j1 = j1;
+          idxb[idxb_count].j2 = j2;
+          idxb[idxb_count].j = j;
+          idxb_count++;
+        }
+
+  // reverse index list for beta and b
+
+  memory->create(idxb_block, jdim, jdim, jdim,
+                 "sna:idxb_block");
+  idxb_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        if (j >= j1) {
+          idxb_block[j1][j2][j] = idxb_count;
+          idxb_count++;
+        }
+      }
+
+  // index list for zlist
+
+  int idxz_count = 0;
+
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++)
+            idxz_count++;
+
+  idxz_max = idxz_count;
+  idxz = new SNA_ZINDICES[idxz_max];
+
+  memory->create(idxz_block, jdim, jdim, jdim,
+                 "sna:idxz_block");
+
+  idxz_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        idxz_block[j1][j2][j] = idxz_count;
+
+        // find right beta[jjb] entry
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3
+
+        for (int mb = 0; 2*mb <= j; mb++)
+          for (int ma = 0; ma <= j; ma++) {
+            idxz[idxz_count].j1 = j1;
+            idxz[idxz_count].j2 = j2;
+            idxz[idxz_count].j = j;
+            idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
+            idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2;
+            idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1;
+            idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
+            idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2;
+            idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;
+            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+
+            const int jju = idxu_block[j] + (j+1)*mb + ma;
+            idxz[idxz_count].jju = jju;
+
+            idxz_count++;
+          }
+      }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void SNAIntel::init()
+{
+  init_clebsch_gordan();
+  //   print_clebsch_gordan();
+  init_rootpqarray();
+}
+
+void SNAIntel::grow_rij(int newnmax)
+{
+  if (newnmax <= nmax) return;
+
+  nmax = newnmax;
+
+  memory->destroy(rij);
+  memory->destroy(inside);
+  memory->destroy(wj);
+  memory->destroy(rcutij);
+  memory->destroy(sinnerij);
+  memory->destroy(dinnerij);
+  if (chem_flag) memory->destroy(element);
+  memory->destroy(ulist_r_ij);
+  memory->destroy(ulist_i_ij);
+  memory->create(rij, nmax, 3, "pair:rij");
+  memory->create(inside, nmax, "pair:inside");
+  memory->create(wj, nmax, "pair:wj");
+  memory->create(rcutij, nmax, "pair:rcutij");
+  memory->create(sinnerij, nmax, "pair:sinnerij");
+  memory->create(dinnerij, nmax, "pair:dinnerij");
+  if (chem_flag) memory->create(element, nmax, "sna:element");
+  memory->create(ulist_r_ij, nmax, idxu_max, "sna:ulist_ij");
+  memory->create(ulist_i_ij, nmax, idxu_max, "sna:ulist_ij");
+}
+
+/* ----------------------------------------------------------------------
+   compute Ui by summing over neighbors j
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_ui(const SNA_IVEC &jnum, const SNA_IVEC &ielem,
+                          const int max_jnum)
+{
+  // utot(j,ma,mb) = 0 for all j,ma,ma
+  // utot(j,ma,ma) = 1 for all j,ma
+  // for j in neighbors of i:
+  //   compute r0 = (x,y,z,z0)
+  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
+
+  zero_uarraytot(ielem);
+
+  for (int j = 0; j < max_jnum; j++) {
+    const SNA_DVEC x = rij[j][0];
+    const SNA_DVEC y = rij[j][1];
+    const SNA_DVEC z = rij[j][2];
+    const SNA_DVEC rcut = rcutij[j];
+    const SNA_DVEC rsq = x * x + y * y + z * z;
+    const SNA_DVEC r = SIMD_sqrt(rsq);
+    const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI;
+    const SNA_DVEC theta0 = (r - rmin0) * rscale0;
+    const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0));
+
+    compute_uarray(x, y, z, z0, r, j, jnum);
+    add_uarraytot(r, j, jnum);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   pick out right beta value
+------------------------------------------------------------------------- */
+
+double SNAIntel::choose_beta(const int j, const int j1, const int j2,
+                             const int elem1, const int elem2, const int elem3,
+                             int &itriple)
+{
+  double bfactor;
+  if (j >= j1) {
+    const int jjb = idxb_block[j1][j2][j];
+    itriple = ((elem1 * nelements + elem2) * nelements + elem3) *
+      idxb_max + jjb;
+    if (j1 == j) {
+      if (j2 == j)
+        bfactor = 3.0;
+      else
+        bfactor = 2.0;
+    } else
+      bfactor = 1.0;
+  } else if (j >= j2) {
+    const int jjb = idxb_block[j][j2][j1];
+    itriple = ((elem3 * nelements + elem2) * nelements + elem1) *
+      idxb_max + jjb;
+    if (j2 == j)
+      bfactor = 2.0;
+    else
+      bfactor = 1.0;
+  } else {
+    const int jjb = idxb_block[j2][j][j1];
+    itriple = ((elem2 * nelements + elem3) * nelements + elem1) *
+      idxb_max + jjb;
+    bfactor = 1.0;
+  }
+
+  if (!bnorm_flag && j1 > j)
+    bfactor *= (1.0 + j1) / (1.0 + j);
+
+  return bfactor;
+}
+
+/* ----------------------------------------------------------------------
+   compute Yi from Ui without storing Zi, looping over zlist indices
+------------------------------------------------------------------------- */
+
+template <int COMPUTE_YI>
+void SNAIntel::compute_zi_or_yi(const SNA_DVEC* beta)
+{
+  if (COMPUTE_YI) {
+    memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC));
+    memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC));
+  }
+
+  double *zlist_rp = (double *)zlist_r;
+  double *zlist_ip = (double *)zlist_i;
+
+  int zlist_i = 0;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++)
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      for (int jjz = 0; jjz < idxz_max; jjz++) {
+        const int j1 = idxz[jjz].j1;
+        const int j2 = idxz[jjz].j2;
+        const int j = idxz[jjz].j;
+        const int ma1min = idxz[jjz].ma1min;
+        const int ma2max = idxz[jjz].ma2max;
+        const int na = idxz[jjz].na;
+        const int mb1min = idxz[jjz].mb1min;
+        const int mb2max = idxz[jjz].mb2max;
+        const int nb = idxz[jjz].nb;
+
+        const double *cgblock = cglist + idxcg_block[j1][j2][j];
+
+        SNA_DVEC ztmp_r = 0.0;
+        SNA_DVEC ztmp_i = 0.0;
+
+        const double *u_r = (double *)ulisttot_r;
+        const double *u_i = (double *)ulisttot_i;
+
+        int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
+        int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
+        jju1 *= vector_width();
+        jju2 *= vector_width();
+        int icgb = mb1min * (j2 + 1) + mb2max;
+        for (int ib = 0; ib < nb; ib++) {
+
+          SNA_DVEC suma1_r = 0.0;
+          SNA_DVEC suma1_i = 0.0;
+
+          int ma1 = ma1min * vector_width();
+          int ma2 = ma2max * vector_width();
+          int icga = ma1min * (j2 + 1) + ma2max;
+
+          for (int ia = 0; ia < na; ia++) {
+            const SNA_DVEC u1_r = SIMD_load(u_r + jju1 + ma1);
+            const SNA_DVEC u2_r = SIMD_load(u_r + jju2 + ma2);
+            const SNA_DVEC u1_i = SIMD_load(u_i + jju1 + ma1);
+            const SNA_DVEC u2_i = SIMD_load(u_i + jju2 + ma2);
+            suma1_r += (u1_r*u2_r - u1_i*u2_i) * cgblock[icga];
+            suma1_i += (u1_r*u2_i + u1_i*u2_r) * cgblock[icga];
+            ma1+= vector_width();
+            ma2-= vector_width();
+            icga += j2;
+          } // end loop over ia
+
+          ztmp_r += suma1_r * cgblock[icgb];
+          ztmp_i += suma1_i * cgblock[icgb];
+
+          jju1 += (j1 + 1) * vector_width();
+          jju2 -= (j2 + 1) * vector_width();
+          icgb += j2;
+        } // end loop over ib
+
+        // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
+        // find right y_list[jju] and beta[jjb] entries
+        // multiply and divide by j+1 factors
+        // account for multiplicity of 1, 2, or 3
+
+        if (bnorm_flag) {
+          ztmp_i *= SIMD_rcp(SIMD_set(static_cast<double>(j+1)));
+          ztmp_r *= SIMD_rcp(SIMD_set(static_cast<double>(j+1)));
+        }
+
+        if (COMPUTE_YI) {
+          int jju = idxz[jjz].jju;
+          for (int elem3 = 0; elem3 < nelements; elem3++) {
+            int itriple;
+            double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3,
+                                         itriple);
+            const SNA_DVEC betaj = beta[itriple] * bfactor;
+            const int i = elem3 * idxu_max + jju;
+            SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r);
+            SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i);
+          }
+        } else {
+          SIMD_store(zlist_rp + zlist_i, ztmp_r);
+          SIMD_store(zlist_ip + zlist_i, ztmp_i);
+          zlist_i += vector_width();
+        }
+      }// end loop over jjz
+    }
+}
+
+/* ----------------------------------------------------------------------
+   compute Yi from Zi
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_yi_from_zi(const SNA_DVEC* beta)
+{
+  memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC));
+  memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC));
+
+  double *zlist_rp = (double *)zlist_r;
+  double *zlist_ip = (double *)zlist_i;
+
+  int zlist_i = 0;
+
+  for (int elem1 = 0; elem1 < nelements; elem1++)
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+      for (int jjz = 0; jjz < idxz_max; jjz++) {
+        const int j1 = idxz[jjz].j1;
+        const int j2 = idxz[jjz].j2;
+        const int j = idxz[jjz].j;
+
+        const SNA_DVEC ztmp_r = SIMD_load(zlist_rp + zlist_i);
+        const SNA_DVEC ztmp_i = SIMD_load(zlist_ip + zlist_i);
+        zlist_i += vector_width();
+
+        int jju = idxz[jjz].jju;
+        for (int elem3 = 0; elem3 < nelements; elem3++) {
+          int itriple;
+          double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3,
+                                       itriple);
+          const SNA_DVEC betaj = beta[itriple] * bfactor;
+          const int i = elem3 * idxu_max + jju;
+          SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r);
+          SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i);
+        }
+      } // end loop over jjz
+    }
+}
+
+/* ----------------------------------------------------------------------
+   compute dEidRj
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_deidrj_e(const int jj, const SNA_IVEC &jnum,
+                                SNA_DVEC* dedr)
+{
+  double *ylist_rp = (double *)ylist_r;
+  double *ylist_ip = (double *)ylist_i;
+  double *dulist_rp = (double *)(dulist_r[0]);
+  double *dulist_ip = (double *)(dulist_i[0]);
+
+  for (int k = 0; k < 3; k++)
+    dedr[k] = SIMD_set(0.0);
+
+  SNA_IVEC jelem;
+  if (chem_flag) jelem = SIMD_load(element + jj);
+  else jelem = SIMD256_set(0);
+
+  SIMD_mask m(jj < jnum);
+
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j] * vector_width();
+    int jju3 = jju * 3;
+    SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count();
+
+    for (int mb = 0; 2*mb < j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
+        SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+          dedr[k] = SIMD_add(m, dedr[k], du);
+          jju3 += vector_width();
+        }
+        i = i + vector_width();
+      }
+
+    if (j%2 == 0) {
+      int mb = j / 2;
+      for (int ma = 0; ma < mb; ma++) {
+        SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
+        SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+          dedr[k] = SIMD_add(m, dedr[k], du);
+          jju3 += vector_width();
+        }
+        i = i + vector_width();
+      }
+
+      SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
+      SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
+      for (int k = 0; k < 3; k++) {
+        SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+        SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+        SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+        dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]);
+        jju3 += vector_width();
+      }
+    } // if j%2
+  } // for j
+
+  for (int k = 0; k < 3; k++)
+    dedr[k] = dedr[k] * 2.0;
+}
+
+/* ----------------------------------------------------------------------
+   compute dEidRj
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_deidrj(const int jj, const SNA_IVEC &jnum,
+                              SNA_DVEC* dedr)
+{
+  double *ylist_rp = (double *)ylist_r;
+  double *ylist_ip = (double *)ylist_i;
+  double *dulist_rp = (double *)(dulist_r[0]);
+  double *dulist_ip = (double *)(dulist_i[0]);
+
+  for (int k = 0; k < 3; k++)
+    dedr[k] = SIMD_set(0.0);
+
+  SIMD_mask m(jj < jnum);
+
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j] * vector_width();
+    int jju3 = jju * 3;
+
+    for (int mb = 0; 2*mb < j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
+        SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+          dedr[k] = SIMD_add(m, dedr[k], du);
+          jju3 += vector_width();
+        }
+        jju += vector_width();
+      }
+
+    if (j%2 == 0) {
+      int mb = j / 2;
+      for (int ma = 0; ma < mb; ma++) {
+        SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
+        SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+          dedr[k] = SIMD_add(m, dedr[k], du);
+          jju3 += vector_width();
+        }
+        jju += vector_width();
+      }
+
+      SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
+      SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
+      for (int k = 0; k < 3; k++) {
+        SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+        SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+        SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
+        dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]);
+        jju3 += vector_width();
+      }
+    } // if j%2
+  } // for j
+
+  for (int k = 0; k < 3; k++)
+    dedr[k] = dedr[k] * 2.0;
+}
+
+/* ----------------------------------------------------------------------
+   compute Bi by summing conj(Ui)*Zi
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_bi(const SNA_IVEC &ielem) {
+  // for j1 = 0,...,twojmax
+  //   for j2 = 0,twojmax
+  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
+  //        b(j1,j2,j) = 0
+  //        for mb = 0,...,jmid
+  //          for ma = 0,...,j
+  //            b(j1,j2,j) +=
+  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
+
+  double *ulisttot_rp = (double *)ulisttot_r;
+  double *ulisttot_ip = (double *)ulisttot_i;
+  double *blistp = (double *)blist;
+
+  int itriple = 0;
+  int idouble = 0;
+  for (int elem1 = 0; elem1 < nelements; elem1++)
+    for (int elem2 = 0; elem2 < nelements; elem2++) {
+
+      double *zlist_rp = (double *)(zlist_r + idouble*idxz_max);
+      double *zlist_ip = (double *)(zlist_i + idouble*idxz_max);
+
+      for (int elem3 = 0; elem3 < nelements; elem3++) {
+        for (int jjb = 0; jjb < idxb_max; jjb++) {
+          const int j1 = idxb[jjb].j1;
+          const int j2 = idxb[jjb].j2;
+          const int j = idxb[jjb].j;
+
+          int jjz = idxz_block[j1][j2][j] * vector_width();
+          int jju = (elem3 * idxu_max + idxu_block[j]) * vector_width();
+          SNA_DVEC sumzu(0.0);
+          for (int mb = 0; 2 * mb < j; mb++)
+            for (int ma = 0; ma <= j; ma++) {
+              const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
+              const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
+              const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
+              const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
+              sumzu = sumzu + utot_r * z_r + utot_i * z_i;
+              jjz += vector_width();
+              jju += vector_width();
+            } // end loop over ma, mb
+
+          // For j even, handle middle column
+
+          if (j % 2 == 0) {
+            int mb = j / 2;
+            for (int ma = 0; ma < mb; ma++) {
+              const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
+              const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
+              const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
+              const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
+              sumzu = sumzu + utot_r * z_r + utot_i * z_i;
+              jjz += vector_width();
+              jju += vector_width();
+            }
+
+            const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
+            const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
+            const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
+            const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
+            sumzu = sumzu + (utot_r * z_r + utot_i * z_i) * 0.5;
+          } // end if jeven
+
+          SIMD_store(blistp + (itriple*idxb_max+jjb) * vector_width(),
+                     sumzu * 2.0);
+        }
+        itriple++;
+      }
+      idouble++;
+    }
+
+  // apply bzero shift
+
+  if (bzero_flag) {
+    if (!wselfall_flag) {
+      SNA_IVEC itriplev = (ielem*nelements+ielem)*nelements+ielem;
+      for (int jjb = 0; jjb < idxb_max; jjb++) {
+        const int j = idxb[jjb].j;
+        SNA_IVEC i = (itriplev*idxb_max+jjb) * vector_width() + SIMD256_count();
+        SIMD_scatter(blistp, i, SIMD_gather(blistp, i) - bzero[j]);
+      } // end loop over JJ
+    } else {
+      int itriple = 0;
+      for (int elem1 = 0; elem1 < nelements; elem1++)
+        for (int elem2 = 0; elem2 < nelements; elem2++) {
+          for (int elem3 = 0; elem3 < nelements; elem3++) {
+            for (int jjb = 0; jjb < idxb_max; jjb++) {
+              const int j = idxb[jjb].j;
+              int i = (itriple*idxb_max+jjb) * vector_width();
+              SIMD_store(blistp + i, SIMD_load(blistp + i) - bzero[j]);
+            } // end loop over JJ
+            itriple++;
+          } // end loop over elem3
+        } // end loop over elem1,elem2
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calculate derivative of Ui w.r.t. atom j
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_duidrj(const int jj, const SNA_IVEC &jnum)
+{
+  const SNA_DVEC x = rij[jj][0];
+  const SNA_DVEC y = rij[jj][1];
+  const SNA_DVEC z = rij[jj][2];
+  const SNA_DVEC rcut = rcutij[jj];
+  const SNA_DVEC rsq = x * x + y * y + z * z;
+  const SNA_DVEC r = SIMD_sqrt(rsq);
+  const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI;
+  const SNA_DVEC theta0 = (r - rmin0) * rscale0;
+  const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0));
+  const SNA_DVEC dz0dr = z0 * SIMD_rcp(r) - (r*rscale0) * (rsq + z0 * z0) *
+    SIMD_rcp(rsq);
+  compute_duarray(x, y, z, z0, r, dz0dr, wj[jj], rcut, jj, jnum);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void SNAIntel::zero_uarraytot(const SNA_IVEC &ielem)
+{
+  double *ulisttot_rp = (double *)ulisttot_r;
+  double *ulisttot_ip = (double *)ulisttot_i;
+  for (int jelem = 0; jelem < nelements; jelem++)
+    for (int j = 0; j <= twojmax; j++) {
+      int jju = (jelem * idxu_max + idxu_block[j]) * vector_width();
+      for (int mb = 0; mb <= j; mb++) {
+        for (int ma = 0; ma <= j; ma++) {
+          SIMD_store(ulisttot_rp + jju, SIMD_set(0.0));
+          SIMD_store(ulisttot_ip + jju, SIMD_set(0.0));
+
+          // utot(j,ma,ma) = wself, sometimes
+          if (ma == mb) {
+            if (wselfall_flag || nelements == 1)
+              SIMD_store(ulisttot_rp + jju, SIMD_set(wself));
+            else {
+              SIMD_mask m(ielem == jelem);
+              SIMD_store(ulisttot_rp + jju,
+                         SIMD_zero_masked(~m, SIMD_set(wself)));
+            }
+          }
+          jju += vector_width();
+        }
+      }
+    }
+}
+
+
+
+/* ----------------------------------------------------------------------
+   add Wigner U-functions for one neighbor to the total
+------------------------------------------------------------------------- */
+
+void SNAIntel::add_uarraytot(const SNA_DVEC &r, const int jj,
+                             const SNA_IVEC &jnum)
+{
+  SNA_DVEC sfac = compute_sfac(r, rcutij[jj], sinnerij[jj], dinnerij[jj]);
+  sfac *= wj[jj];
+
+  double *ulisttot_rp = (double *)ulisttot_r;
+  double *ulisttot_ip = (double *)ulisttot_i;
+  const double* ulist_r = (double *)(ulist_r_ij[jj]);
+  const double* ulist_i = (double *)(ulist_i_ij[jj]);
+
+  SIMD_mask m(jj < jnum);
+
+  if (chem_flag && nelements > 1) {
+    SNA_IVEC jelem = SIMD_load(element+jj);
+    for (int j = 0; j <= twojmax; j++) {
+      int jju = idxu_block[j] * vector_width();
+      SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count();
+      for (int mb = 0; mb <= j; mb++)
+        for (int ma = 0; ma <= j; ma++) {
+          SNA_DVEC utot_r = SIMD_gather(m, ulisttot_rp, i);
+          SNA_DVEC utot_i = SIMD_gather(m, ulisttot_ip, i);
+          utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r);
+          utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i);
+          SIMD_scatter(m, ulisttot_rp, i, utot_r);
+          SIMD_scatter(m, ulisttot_ip, i, utot_i);
+          jju += vector_width();
+          i = i + vector_width();
+        }
+    }
+  } else {
+    for (int j = 0; j <= twojmax; j++) {
+      int jju = idxu_block[j] * vector_width();
+      for (int mb = 0; mb <= j; mb++)
+        for (int ma = 0; ma <= j; ma++) {
+          SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
+          SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
+          utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r);
+          utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i);
+          SIMD_store(ulisttot_rp + jju, utot_r);
+          SIMD_store(ulisttot_ip + jju, utot_i);
+          jju += vector_width();
+        }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute Wigner U-functions for one neighbor
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_uarray(const SNA_DVEC &x, const SNA_DVEC &y,
+                              const SNA_DVEC &z, const SNA_DVEC &z0,
+                              const SNA_DVEC &r, const int jj,
+                              const SNA_IVEC &jnum)
+{
+  // compute Cayley-Klein parameters for unit quaternion
+
+  const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0);
+  const SNA_DVEC a_r = z0 * r0inv;
+  const SNA_DVEC a_i = -z * r0inv;
+  const SNA_DVEC b_r = y * r0inv;
+  const SNA_DVEC b_i = -x * r0inv;
+
+  // VMK Section 4.8.2
+
+  double *ulist_rp = (double *)(ulist_r_ij[jj]);
+  double *ulist_ip = (double *)(ulist_i_ij[jj]);
+
+  SIMD_store(ulist_rp, SIMD_set(1.0));
+  SIMD_store(ulist_ip, SIMD_set(0.0));
+
+  for (int j = 1; j <= twojmax; j++) {
+    int jju = idxu_block[j] * vector_width();
+    int jjup = idxu_block[j-1] * vector_width();
+
+    // fill in left side of matrix layer from previous layer
+
+    for (int mb = 0; 2*mb <= j; mb++) {
+      SIMD_store(ulist_rp + jju, SIMD_set(0.0));
+      SIMD_store(ulist_ip + jju, SIMD_set(0.0));
+
+      for (int ma = 0; ma < j; ma++) {
+        double rootpq = rootpqarray[j - ma][j - mb];
+        SNA_DVEC u_r = SIMD_load(ulist_rp + jju);
+        SNA_DVEC u_i = SIMD_load(ulist_ip + jju);
+        const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup);
+        const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup);
+
+        SNA_DVEC u_ro, u_io;
+
+        u_ro = a_r * up_r + a_i * up_i;
+        u_r = SIMD_fma(SIMD_set(rootpq), u_ro, u_r);
+        SIMD_store(ulist_rp + jju, u_r);
+        u_io = a_r * up_i - a_i * up_r;
+        u_i = SIMD_fma(SIMD_set(rootpq), u_io, u_i);
+        SIMD_store(ulist_ip + jju, u_i);
+
+        jju += vector_width();
+
+        rootpq = -rootpqarray[ma + 1][j - mb];
+        u_r = (b_r * up_r + b_i * up_i) * rootpq;
+        SIMD_store(ulist_rp + jju, u_r);
+        u_i = (b_r * up_i - b_i * up_r) * rootpq;
+        SIMD_store(ulist_ip + jju, u_i);
+
+        jjup += vector_width();
+      }
+      jju += vector_width();
+    }
+
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    jju = idxu_block[j];
+    jjup = (jju+(j+1)*(j+1)-1) * vector_width();
+    jju *=  vector_width();
+    int mbpar = 1;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int mapar = mbpar;
+      for (int ma = 0; ma <= j; ma++) {
+        if (mapar == 1) {
+          SIMD_store(ulist_rp + jjup, SIMD_load(ulist_rp + jju));
+          SIMD_store(ulist_ip + jjup, -SIMD_load(ulist_ip + jju));
+        } else {
+          SIMD_store(ulist_rp + jjup, -SIMD_load(ulist_rp + jju));
+          SIMD_store(ulist_ip + jjup, SIMD_load(ulist_ip + jju));
+        }
+        mapar = -mapar;
+        jju += vector_width();
+        jjup -= vector_width();
+      }
+      mbpar = -mbpar;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Compute derivatives of Wigner U-functions for one neighbor
+   see comments in compute_uarray()
+------------------------------------------------------------------------- */
+
+void SNAIntel::compute_duarray(const SNA_DVEC &x, const SNA_DVEC &y,
+                               const SNA_DVEC &z, const SNA_DVEC &z0,
+                               const SNA_DVEC &r, const SNA_DVEC &dz0dr,
+                               const SNA_DVEC &wj, const SNA_DVEC &rcut,
+                               const int jj, const SNA_IVEC &jnum)
+{
+  const SNA_DVEC rinv = SIMD_rcp(r);
+  const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0);
+  SNA_DVEC up[3];
+  up[0] = x * rinv;
+  up[1] = y * rinv;
+  up[2] = z * rinv;
+  const SNA_DVEC a_r = z0 * r0inv;
+  const SNA_DVEC a_i = -z * r0inv;
+  const SNA_DVEC b_r = y * r0inv;
+  const SNA_DVEC b_i = -x * r0inv;
+  const SNA_DVEC dr0invdr = -SIMD_pow(r0inv, 3.0) * (r + z0 * dz0dr);
+
+  SNA_DVEC dr0inv[3], da_r[3], da_i[3];
+  for (int k = 0; k < 3; k++) {
+    dr0inv[k] = dr0invdr * up[k];
+    da_r[k] = dz0dr * up[k] * r0inv + z0 * dr0inv[k];
+    da_i[k] = -z * dr0inv[k];
+  }
+  da_i[2] += -r0inv;
+
+  double *ulist_rp = (double *)(ulist_r_ij[jj]);
+  double *ulist_ip = (double *)(ulist_i_ij[jj]);
+  double *dulist_rp = (double *)(dulist_r[0]);
+  double *dulist_ip = (double *)(dulist_i[0]);
+
+  SNA_DVEC db_r[3], db_i[3];
+  for (int k = 0; k < 3; k++) {
+    SIMD_store(dulist_rp + k * vector_width(), SIMD_set(0.0));
+    SIMD_store(dulist_ip + k * vector_width(), SIMD_set(0.0));
+    db_r[k] = y * dr0inv[k];
+    db_i[k] = -x * dr0inv[k];
+  }
+  db_i[0] -= r0inv;
+  db_r[1] += r0inv;
+
+  for (int j = 1; j <= twojmax; j++) {
+    int jju3 = idxu_block[j] * 3 * vector_width();
+    int jjup = idxu_block[j-1] * vector_width();
+    int jjup3 = jjup * 3;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      for (int k = 0; k < 3; k++) {
+        SIMD_store(dulist_rp + jju3 + k * vector_width(), SIMD_set(0.0));
+        SIMD_store(dulist_ip + jju3 + k * vector_width(), SIMD_set(0.0));
+      }
+
+      for (int ma = 0; ma < j; ma++) {
+        const double rootpq = rootpqarray[j - ma][j - mb];
+        const double mrootpq = -rootpqarray[ma + 1][j - mb];
+        const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup);
+        const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup);
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
+          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
+          const SNA_DVEC dup_r = SIMD_load(dulist_rp + jjup3);
+          const SNA_DVEC dup_i = SIMD_load(dulist_ip + jjup3);
+
+          SNA_DVEC du_ro, du_io;
+
+          du_ro = (da_r[k]*up_r + da_i[k]*up_i + a_r*dup_r + a_i*dup_i);
+          du_r = SIMD_fma(SIMD_set(rootpq), du_ro, du_r);
+          SIMD_store(dulist_rp + jju3, du_r);
+
+          du_io = (da_r[k]*up_i - da_i[k]*up_r + a_r*dup_i - a_i*dup_r);
+          du_i = SIMD_fma(SIMD_set(rootpq), du_io, du_i);
+          SIMD_store(dulist_ip + jju3, du_i);
+
+          du_r = (db_r[k]*up_r + db_i[k]*up_i + b_r*dup_r + b_i*dup_i);
+          SIMD_store(dulist_rp + jju3 + 3 * vector_width(), du_r * mrootpq);
+
+          du_i = (db_r[k]*up_i - db_i[k]*up_r + b_r*dup_i - b_i*dup_r);
+          SIMD_store(dulist_ip + jju3 + 3 * vector_width(), du_i * mrootpq);
+
+          jju3 += vector_width();
+          jjup3 += vector_width();
+        }
+        jjup += vector_width();
+      } // for ma
+      jju3 += 3 * vector_width();
+    } // for mb
+
+    // copy left side to right side with inversion symmetry VMK 4.4(2)
+    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
+
+    SNA_DVEC *du_r_p = dulist_r[0];
+    SNA_DVEC *du_i_p = dulist_i[0];
+
+    int jju = idxu_block[j];
+    jjup = (jju+(j+1)*(j+1)-1) * 3 * vector_width();
+    jju *=  3 * vector_width();
+    int mbpar = 1;
+    for (int mb = 0; 2*mb <= j; mb++) {
+      int mapar = mbpar;
+      for (int ma = 0; ma <= j; ma++) {
+        if (mapar == 1) {
+          for (int k = 0; k < 3; k++) {
+            SIMD_store(dulist_rp + jjup, SIMD_load(dulist_rp + jju));
+            SIMD_store(dulist_ip + jjup, -SIMD_load(dulist_ip + jju));
+            jju += vector_width();
+            jjup += vector_width();
+          }
+        } else {
+          for (int k = 0; k < 3; k++) {
+            SIMD_store(dulist_rp + jjup, -SIMD_load(dulist_rp + jju));
+            SIMD_store(dulist_ip + jjup, SIMD_load(dulist_ip + jju));
+            jju += vector_width();
+            jjup += vector_width();
+          }
+        }
+        mapar = -mapar;
+        jjup -= 6 * vector_width();
+      } // for ma
+      mbpar = -mbpar;
+    } // for mb
+  } // for j
+
+  SNA_DVEC dsfac;
+  SNA_DVEC sfac = compute_sfac_dsfac(r, rcut, sinnerij[jj], dinnerij[jj],
+                                      dsfac);
+  sfac = sfac * wj;
+  dsfac = dsfac * wj;
+
+  for (int j = 0; j <= twojmax; j++) {
+    int jju = idxu_block[j] * vector_width();
+    int jju3 = jju * 3;
+    for (int mb = 0; 2*mb <= j; mb++)
+      for (int ma = 0; ma <= j; ma++) {
+        const SNA_DVEC ur_dsfac = dsfac * SIMD_load(ulist_rp + jju);
+        const SNA_DVEC ui_dsfac = dsfac * SIMD_load(ulist_ip + jju);
+        jju += vector_width();
+        for (int k = 0; k < 3; k++) {
+          SNA_DVEC du_r = ur_dsfac * up[k] + sfac * SIMD_load(dulist_rp+jju3);
+          SIMD_store(dulist_rp + jju3, du_r);
+          SNA_DVEC du_i = ui_dsfac * up[k] + sfac * SIMD_load(dulist_ip+jju3);
+          SIMD_store(dulist_ip + jju3, du_i);
+          jju3 += vector_width();
+        }
+      }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of arrays
+------------------------------------------------------------------------- */
+
+double SNAIntel::memory_usage()
+{
+  int jdimpq = twojmax + 2;
+  int jdim = twojmax + 1;
+  double bytes;
+
+  bytes = 0;
+
+  bytes += (double)jdimpq*jdimpq * sizeof(double);               // pqarray
+  bytes += (double)idxcg_max * sizeof(double);                   // cglist
+
+  bytes += (double)nmax * idxu_max * sizeof(SNA_DVEC) * 2;       // ulist_ij
+  bytes += (double)idxu_max * nelements * sizeof(SNA_DVEC) * 2;  // ulisttot
+  bytes += (double)idxu_max * 3 * sizeof(SNA_DVEC) * 2;          // dulist
+
+  bytes += (double)idxz_max * ndoubles * sizeof(SNA_DVEC) * 2;   // zlist
+  bytes += (double)idxb_max * ntriples * sizeof(SNA_DVEC);       // blist
+  bytes += (double)idxb_max * ntriples * 3 * sizeof(double);     // dblist
+  bytes += (double)idxu_max * nelements * sizeof(SNA_DVEC) * 2;  // ylist
+
+  bytes += (double)jdim * jdim * jdim * sizeof(int);             // idxcg_block
+  bytes += (double)jdim * sizeof(int);                           // idxu_block
+  bytes += (double)jdim * jdim * jdim * sizeof(int);             // idxz_block
+  bytes += (double)jdim * jdim * jdim * sizeof(int);             // idxb_block
+
+  bytes += (double)idxz_max * sizeof(SNA_ZINDICES);              // idxz
+  bytes += (double)idxb_max * sizeof(SNA_BINDICES);              // idxb
+
+  if (bzero_flag)
+  bytes += (double)jdim * sizeof(double);                        // bzero
+
+  bytes += (double)nmax * 3 * sizeof(SNA_DVEC);                  // rij
+  bytes += (double)nmax * sizeof(SNA_IVEC);                      // inside
+  bytes += (double)nmax * sizeof(SNA_DVEC);                      // wj
+  bytes += (double)nmax * sizeof(SNA_DVEC);                      // rcutij
+  bytes += (double)nmax * sizeof(SNA_DVEC);                      // sinnerij
+  bytes += (double)nmax * sizeof(SNA_DVEC);                      // dinnerij
+  if (chem_flag) bytes += (double)nmax * sizeof(SNA_IVEC);       // element
+
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void SNAIntel::create_twojmax_arrays()
+{
+  int jdimpq = twojmax + 2;
+  memory->create(rootpqarray, jdimpq, jdimpq,
+                 "sna:rootpqarray");
+  memory->create(cglist, idxcg_max, "sna:cglist");
+  memory->create(ulisttot_r, idxu_max*nelements, "sna:ulisttot");
+  memory->create(ulisttot_i, idxu_max*nelements, "sna:ulisttot");
+  memory->create(dulist_r, idxu_max, 3, "sna:dulist");
+  memory->create(dulist_i, idxu_max, 3, "sna:dulist");
+  memory->create(zlist_r, idxz_max*ndoubles, "sna:zlist");
+  memory->create(zlist_i, idxz_max*ndoubles, "sna:zlist");
+  memory->create(blist, idxb_max*ntriples, "sna:blist");
+  memory->create(dblist, idxb_max*ntriples, 3, "sna:dblist");
+  memory->create(ylist_r, idxu_max*nelements, "sna:ylist");
+  memory->create(ylist_i, idxu_max*nelements, "sna:ylist");
+
+  if (bzero_flag)
+    memory->create(bzero, twojmax+1,"sna:bzero");
+  else
+    bzero = nullptr;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+void SNAIntel::destroy_twojmax_arrays()
+{
+  memory->destroy(rootpqarray);
+  memory->destroy(cglist);
+  memory->destroy(ulisttot_r);
+  memory->destroy(ulisttot_i);
+  memory->destroy(dulist_r);
+  memory->destroy(dulist_i);
+  memory->destroy(zlist_r);
+  memory->destroy(zlist_i);
+  memory->destroy(blist);
+  memory->destroy(dblist);
+  memory->destroy(ylist_r);
+  memory->destroy(ylist_i);
+
+  memory->destroy(idxcg_block);
+  memory->destroy(idxu_block);
+  memory->destroy(idxz_block);
+  memory->destroy(idxb_block);
+
+  if (bzero_flag)
+    memory->destroy(bzero);
+
+}
+
+/* ----------------------------------------------------------------------
+   the function delta given by VMK Eq. 8.2(1)
+------------------------------------------------------------------------- */
+
+double SNAIntel::deltacg(int j1, int j2, int j)
+{
+  double sfaccg = factorial((j1 + j2 + j) / 2 + 1);
+  return sqrt(factorial((j1 + j2 - j) / 2) *
+              factorial((j1 - j2 + j) / 2) *
+              factorial((-j1 + j2 + j) / 2) / sfaccg);
+}
+
+/* ----------------------------------------------------------------------
+   assign Clebsch-Gordan coefficients using
+   the quasi-binomial formula VMK 8.2.1(3)
+------------------------------------------------------------------------- */
+
+void SNAIntel::init_clebsch_gordan()
+{
+  double sum,dcg,sfaccg;
+  int m, aa2, bb2, cc2;
+  int ifac;
+
+  int idxcg_count = 0;
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
+        for (int m1 = 0; m1 <= j1; m1++) {
+          aa2 = 2 * m1 - j1;
+
+          for (int m2 = 0; m2 <= j2; m2++) {
+
+            // -c <= cc <= c
+
+            bb2 = 2 * m2 - j2;
+            m = (aa2 + bb2 + j) / 2;
+
+            if (m < 0 || m > j) {
+              cglist[idxcg_count] = 0.0;
+              idxcg_count++;
+              continue;
+            }
+
+            sum = 0.0;
+
+            for (int z = MAX(0, MAX(-(j - j2 + aa2)
+                                    / 2, -(j - j1 - bb2) / 2));
+                 z <= MIN((j1 + j2 - j) / 2,
+                          MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
+                 z++) {
+              ifac = z % 2 ? -1 : 1;
+              sum += ifac /
+                (factorial(z) *
+                 factorial((j1 + j2 - j) / 2 - z) *
+                 factorial((j1 - aa2) / 2 - z) *
+                 factorial((j2 + bb2) / 2 - z) *
+                 factorial((j - j2 + aa2) / 2 + z) *
+                 factorial((j - j1 - bb2) / 2 + z));
+            }
+
+            cc2 = 2 * m - j;
+            dcg = deltacg(j1, j2, j);
+            sfaccg = sqrt(factorial((j1 + aa2) / 2) *
+                          factorial((j1 - aa2) / 2) *
+                          factorial((j2 + bb2) / 2) *
+                          factorial((j2 - bb2) / 2) *
+                          factorial((j  + cc2) / 2) *
+                          factorial((j  - cc2) / 2) *
+                          (j + 1));
+
+            cglist[idxcg_count] = sum * dcg * sfaccg;
+            idxcg_count++;
+          }
+        }
+      }
+}
+
+/* ----------------------------------------------------------------------
+   print out values of Clebsch-Gordan coefficients
+   format and notation follows VMK Table 8.11
+------------------------------------------------------------------------- */
+
+void SNAIntel::print_clebsch_gordan()
+{
+  if (comm->me) return;
+
+  int aa2, bb2, cc2;
+  for (int j = 0; j <= twojmax; j += 1) {
+    printf("c = %g\n",j/2.0);
+    printf("a alpha b beta C_{a alpha b beta}^{c alpha+beta}\n");
+    for (int j1 = 0; j1 <= twojmax; j1++)
+      for (int j2 = 0; j2 <= j1; j2++)
+        if (j1-j2 <= j && j1+j2 >= j && (j1+j2+j)%2 == 0) {
+          int idxcg_count = idxcg_block[j1][j2][j];
+          for (int m1 = 0; m1 <= j1; m1++) {
+            aa2 = 2*m1-j1;
+            for (int m2 = 0; m2 <= j2; m2++) {
+              bb2 = 2*m2-j2;
+              double cgtmp = cglist[idxcg_count];
+              cc2 = aa2+bb2;
+              if (cc2 >= -j && cc2 <= j)
+                if (j1 != j2 || (aa2 > bb2 && aa2 >= -bb2) || (aa2 == bb2 && aa2 >= 0))
+                  printf("%4g %4g %4g %4g %10.6g\n",
+                         j1/2.0,aa2/2.0,j2/2.0,bb2/2.0,cgtmp);
+              idxcg_count++;
+            }
+          }
+        }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute table of sqrt[p/m2], p, q = 1,twojmax
+   the p = 0, q = 0 entries are allocated and skipped for convenience.
+------------------------------------------------------------------------- */
+
+void SNAIntel::init_rootpqarray()
+{
+  for (int p = 1; p <= twojmax; p++)
+    for (int q = 1; q <= twojmax; q++)
+      rootpqarray[p][q] = sqrt(static_cast<double>(p)/q);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void SNAIntel::compute_ncoeff()
+{
+  int ncount;
+
+  ncount = 0;
+
+  for (int j1 = 0; j1 <= twojmax; j1++)
+    for (int j2 = 0; j2 <= j1; j2++)
+      for (int j = j1 - j2;
+           j <= MIN(twojmax, j1 + j2); j += 2)
+        if (j >= j1) ncount++;
+
+  ndoubles = nelements*nelements;
+  ntriples = nelements*nelements*nelements;
+  if (chem_flag)
+    ncoeff = ncount*ntriples;
+  else
+    ncoeff = ncount;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double SNAIntel::compute_sfac(double r, double rcut, double sinner, double dinner)
+{
+  double sfac;
+
+  // calculate sfac = sfac_outer
+
+  if (switch_flag == 0) sfac = 1.0;
+  else if (r <= rmin0) sfac = 1.0;
+  else if (r > rcut) sfac = 0.0;
+  else {
+    double rcutfac = MY_PI / (rcut - rmin0);
+    sfac = 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
+  }
+
+  // calculate sfac *= sfac_inner, rarely visited
+
+  if (switch_inner_flag == 1 && r < sinner + dinner) {
+    if (r > sinner - dinner) {
+      double rcutfac = MY_PI2 / dinner;
+      sfac *= 0.5 * (1.0 - cos(MY_PI2 + (r - sinner) * rcutfac));
+    } else sfac = 0.0;
+  }
+
+  return sfac;
+}
+
+/* ---------------------------------------------------------------------- */
+
+SNA_DVEC SNAIntel::compute_sfac(const SNA_DVEC &r, const SNA_DVEC &rcut,
+                                const SNA_DVEC &sinner, const SNA_DVEC &dinner)
+{
+  // calculate sfac = sfac_outer
+
+  // if (switch_flag == 0 || r <= rmin0)
+  SNA_DVEC sfac = SIMD_set(1.0);
+  if (switch_flag != 0) {
+    // r <= rcut && r > rmin0
+    const SIMD_mask i(r > rmin0);
+    const SIMD_mask m(r <= rcut);
+    const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI;
+    const SNA_DVEC sfac_m = (SIMD_cos((r - rmin0) * rcutfac) + 1.0) * 0.5;
+    sfac = SIMD_set(sfac, m & i, sfac_m);
+    // (r > rcut) && (r> rmin0)
+    sfac = SIMD_zero_masked(m | i, sfac);
+  }
+
+  // calculate sfac *= sfac_inner, rarely visited
+
+  if (switch_inner_flag == 1) {
+    const SIMD_mask m(r < sinner + dinner);
+    // if any(m)
+    const SIMD_mask i(r > sinner - dinner);
+    const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2;
+    const SNA_DVEC sfac_m = (SIMD_set(1.0) - SIMD_cos((r-sinner) * rcutfac +
+                                                      MY_PI2)) * 0.5;
+    sfac = SIMD_set(sfac, m & i, sfac_m);
+    sfac = SIMD_zero_masked((~m) | i, sfac);
+  }
+
+  return sfac;
+}
+
+/* ---------------------------------------------------------------------- */
+
+SNA_DVEC SNAIntel::compute_sfac_dsfac(const SNA_DVEC & r,
+                                      const SNA_DVEC & rcut,
+                                      const SNA_DVEC & sinner,
+                                      const SNA_DVEC & dinner,
+                                      SNA_DVEC &dsfac)
+{
+  // calculate sfac = sfac_outer
+
+  // if (switch_flag == 0 || r <= rmin0)
+  SNA_DVEC sfac = SIMD_set(1.0);
+  dsfac = SIMD_set(0.0);
+  if (switch_flag != 0) {
+    // r <= rcut && r > rmin0
+    const SIMD_mask i(r > rmin0);
+    const SIMD_mask m(r <= rcut);
+    const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI;
+    const SNA_DVEC trig_arg = (r - rmin0) * rcutfac;
+    const SNA_DVEC sfac_m = (SIMD_cos(trig_arg) + 1.0) * 0.5;
+    const SNA_DVEC dsfac_m = SIMD_sin(trig_arg) * rcutfac * -0.5;
+    sfac = SIMD_set(sfac, m & i, sfac_m);
+    dsfac = SIMD_set(dsfac, m & i, dsfac_m);
+    // (r > rcut) && (r> rmin0)
+    sfac = SIMD_zero_masked(m | i, sfac);
+  }
+
+  // calculate sfac *= sfac_inner, rarely visited
+
+  if (switch_inner_flag == 1) {
+    const SIMD_mask m(r < sinner + dinner);
+    const SIMD_mask i(r > sinner - dinner);
+    if (any(m & i)) {
+      const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2;
+      const SNA_DVEC trig_arg = (r - sinner) * rcutfac + MY_PI2;
+      const SNA_DVEC sfac_inner = (SIMD_set(1.0) - SIMD_cos(trig_arg)) * 0.5;
+      const SNA_DVEC dsfac_inner = rcutfac * 0.5 * SIMD_sin(trig_arg);
+      dsfac = SIMD_set(dsfac, m & i, dsfac * sfac_inner +
+                       sfac * dsfac_inner);
+      sfac = SIMD_set(sfac, m & i, sfac_inner);
+    }
+    sfac = SIMD_zero_masked((~m) | i, sfac);
+    dsfac = SIMD_zero_masked((~m) | i, dsfac);
+  }
+
+  return sfac;
+}
+
+template void SNAIntel::compute_zi_or_yi<1>(const SNA_DVEC *);
+template void SNAIntel::compute_zi_or_yi<0>(const SNA_DVEC *);
+
+#endif
+#endif
diff --git a/src/INTEL/sna_intel.h b/src/INTEL/sna_intel.h
new file mode 100644
index 0000000000..7900dee51b
--- /dev/null
+++ b/src/INTEL/sna_intel.h
@@ -0,0 +1,187 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown, Intel
+------------------------------------------------------------------------- */
+
+#ifndef LMP_SNA_INTEL_H
+#define LMP_SNA_INTEL_H
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#include "pointers.h"
+#include "intel_buffers.h"
+#include "intel_simd.h"
+
+#define SVW 8
+
+#if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#define SV_for _Pragma("omp simd") _Pragma("vector aligned") for
+#else
+#define SV_for _Pragma("simd assert") _Pragma("vector aligned") for
+#endif
+#else
+#define SV_for for
+#endif
+
+namespace LAMMPS_NS {
+
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min;
+  int mb2max, na, nb, jju;
+};
+
+struct SNA_BINDICES {
+  int j1, j2, j;
+};
+
+#define SNA_DVEC ip_simd::SIMD_double
+#define SNA_IVEC ip_simd::SIMD256_int
+
+class SNAIntel : protected Pointers {
+
+ public:
+  SNAIntel(LAMMPS *, double, int, double, int, int, int, int, int, int, int);
+
+  SNAIntel(LAMMPS *lmp) : Pointers(lmp){};
+  ~SNAIntel() override;
+  void build_indexlist();
+  void init();
+  double memory_usage();
+
+  int ncoeff;
+
+  inline int vector_width() const { return SVW; }
+
+  // functions for bispectrum coefficients
+
+  void compute_ui(const SNA_IVEC &, const SNA_IVEC &, const int max_jnum);
+  template <int> void compute_zi_or_yi(const SNA_DVEC *);
+  void compute_yi_from_zi(const SNA_DVEC *);
+  void compute_yterm(int, int, int, const double *);
+  void compute_bi(const SNA_IVEC &);
+
+  // functions for derivatives
+
+  void compute_duidrj(const int, const SNA_IVEC &);
+  void compute_deidrj_e(const int, const SNA_IVEC &, SNA_DVEC *);
+  void compute_deidrj(const int, const SNA_IVEC &, SNA_DVEC *);
+  double compute_sfac(double, double, double, double);
+  SNA_DVEC compute_sfac(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                        const SNA_DVEC &);
+  inline SNA_DVEC compute_sfac_dsfac(const SNA_DVEC &, const SNA_DVEC &,
+                                     const SNA_DVEC &, const SNA_DVEC &,
+                                     SNA_DVEC &);
+
+  // public bispectrum data
+
+  int twojmax;
+  SNA_DVEC *blist;
+  double **dblist;
+
+  // short neighbor list data
+
+  void grow_rij(int);
+  int nmax;    // allocated size of short lists
+
+  SNA_DVEC **rij;      // short rij list
+  SNA_IVEC *inside;       // short neighbor list
+  SNA_DVEC *wj;        // short weight list
+  SNA_DVEC *rcutij;    // short cutoff list
+
+  // only allocated for switch_inner_flag=1
+
+  SNA_DVEC *sinnerij;    // short inner cutoff midpoint list
+  SNA_DVEC *dinnerij;    // short inner half-width list
+
+  // only allocated for chem_flag=1
+
+  SNA_IVEC *element;    // short element list [0,nelements)
+
+ private:
+  double rmin0, rfac0;
+
+  // data for bispectrum coefficients
+
+  SNA_ZINDICES *idxz;
+  SNA_BINDICES *idxb;
+
+  double **rootpqarray;
+  double *cglist;
+  int ***idxcg_block;
+
+  SNA_DVEC *ulisttot_r, *ulisttot_i;
+  SNA_DVEC **ulist_r_ij, **ulist_i_ij;    // short u list
+  int *idxu_block;
+
+  SNA_DVEC *zlist_r, *zlist_i;
+  int ***idxz_block;
+
+  int ***idxb_block;
+
+  SNA_DVEC **dulist_r, **dulist_i;
+
+  SNA_DVEC *ylist_r, *ylist_i;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
+  void create_twojmax_arrays();
+  void destroy_twojmax_arrays();
+  void init_clebsch_gordan();
+  void print_clebsch_gordan();
+  void init_rootpqarray();
+  void zero_uarraytot(const SNA_IVEC &);
+  void add_uarraytot(const SNA_DVEC &, const int, const SNA_IVEC &);
+  void compute_uarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                      const SNA_DVEC &, const SNA_DVEC &, const int,
+                      const SNA_IVEC &);
+  double deltacg(int, int, int);
+  void compute_ncoeff();
+  void compute_duarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                       const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                       const SNA_DVEC &, const SNA_DVEC &, int,
+                       const SNA_IVEC &);
+  inline double choose_beta(const int, const int, const int,
+                            const int, const int, const int,  int &);
+
+  // Sets the style for the switching function
+  // 0 = none
+  // 1 = cosine
+  int switch_flag;
+
+  // Sets the style for the inner switching function
+  // 0 = none
+  // 1 = cosine
+  int switch_inner_flag;
+
+  // Self-weight
+  double wself;
+
+  int bzero_flag;       // 1 if bzero subtracted from barray
+  double *bzero;        // array of B values for isolated atoms
+  int bnorm_flag;       // 1 if barray divided by j+1
+  int chem_flag;        // 1 for multi-element bispectrum components
+  int wselfall_flag;    // 1 for adding wself to all element labelings
+  int nelements;        // number of elements
+  int ndoubles;         // number of multi-element pairs
+  int ntriples;         // number of multi-element triplets
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+#endif