Merge pull request #3921 from wmbrownIntel/snap-intel

Adding intel variant of snap pair style.
2023-10-10 11:02:43 -04:00
parent d97d14745e 508dbb74c5
commit c0ace4aa4b
10 changed files with 2798 additions and 3 deletions
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@ -265,7 +265,7 @@ OPT.
   * :doc:`smd/tri_surface <pair_smd_triangulated_surface>`
   * :doc:`smd/ulsph <pair_smd_ulsph>`
   * :doc:`smtbq <pair_smtbq>`
-   * :doc:`snap (k) <pair_snap>`
+   * :doc:`snap (ik) <pair_snap>`
   * :doc:`soft (go) <pair_soft>`
   * :doc:`sph/heatconduction <pair_sph_heatconduction>`
   * :doc:`sph/idealgas <pair_sph_idealgas>`
--- a/doc/src/pair_snap.rst
+++ b/doc/src/pair_snap.rst
@ -1,10 +1,11 @@
 .. index:: pair_style snap
+.. index:: pair_style snap/intel
 .. index:: pair_style snap/kk

 pair_style snap command
 =======================

-Accelerator Variants: *snap/kk*
+Accelerator Variants: *snap/intel*, *snap/kk*

 Syntax
 """"""
@ -260,6 +261,14 @@ This style is part of the ML-SNAP package.  It is only enabled if LAMMPS
 was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.

+The *snap/intel* accelerator variant will *only* be available if LAMMPS
+is built with Intel *compilers* and for CPUs with AVX-512 support.
+While the INTEL package in general allows multiple floating point
+precision modes to be selected, *snap/intel* will currently always use
+full double precision regardless of the precision mode selected.
+Additionally, the *intel* variant of snap will **NOT** use multiple
+threads with OpenMP.
+
 Related commands
 """"""""""""""""

--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -185,6 +185,7 @@ fi
 if (test $1 = "ML-SNAP") then
  depend ML-IAP
  depend KOKKOS
+  depend INTEL
 fi

 if (test $1 = "CG-SPICA") then
--- a/src/INTEL/TEST/in.intel.snap
+++ b/src/INTEL/TEST/in.intel.snap
@ -0,0 +1,70 @@
+# Toy demonstration of SNAP "scale" parameter, using fix/adapt and hybrid/overlay
+# Mixing linear and quadratic SNAP Ni potentials by Zuo et al. JCPA 2020
+
+variable	w index 10	# Warmup Timesteps
+variable	t index 100	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+
+variable	x index 4
+variable	y index 2
+variable	z index 2
+
+variable	rr equal floor($t*$m)
+variable        root getenv LMP_ROOT
+
+if "$n > 0"	then "processors * * * grid numa"
+
+# mixing parameter
+
+variable lambda equal 0.2
+
+# Initialize simulation
+
+variable a equal 3.52
+units           metal
+
+# generate the box and atom positions using a FCC lattice
+variable nx equal 20*$x
+variable ny equal 20*$y
+variable nz equal 20*$z
+
+boundary        p p p
+
+lattice         fcc $a
+region          box block 0 ${nx} 0 ${ny} 0 ${nz}
+create_box      1 box
+create_atoms    1 box
+
+mass 1 34.
+
+# choose bundled SNAP Ni potential from Zuo et al. JCPA 2020
+pair_style hybrid/overlay snap snap
+pair_coeff * * snap 1 &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapcoeff &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapparam Ni
+pair_coeff * * snap 2 &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapcoeff &
+    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapparam Ni
+
+# scale according to mixing parameter
+variable l1 equal ${lambda}
+variable l2 equal 1.0-${lambda}
+fix scale1 all adapt 1 pair snap:1 scale * * v_l1
+fix scale2 all adapt 1 pair snap:2 scale * * v_l2
+
+# Setup output
+thermo          1
+thermo_modify norm yes
+
+# Set up NVE run
+timestep 0.5e-3
+neighbor 1.0 bin
+neigh_modify every 1 delay 0 check yes
+
+# Run MD
+velocity all create 300.0 4928459 loop geom
+fix 1 all nve
+
+if "$w > 0"	then "run $w"
+run		${rr}
--- a/src/INTEL/TEST/run_benchmarks.sh
+++ b/src/INTEL/TEST/run_benchmarks.sh
@ -35,7 +35,7 @@ export I_MPI_PIN_DOMAIN=core
 # End settings for your system
 #########################################################################

-export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff"
+export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff snap"
 export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
 export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"

--- a/src/INTEL/intel_simd.h
+++ b/src/INTEL/intel_simd.h
@ -46,13 +46,38 @@ namespace ip_simd {

  typedef __mmask16 SIMD_mask;

+  inline bool any(const SIMD_mask &m) { return m != 0; }
+
  struct SIMD_int {
    __m512i v;
    SIMD_int() {}
    SIMD_int(const __m512i in) : v(in) {}
+    inline int & operator[](const int i) { return ((int *)&(v))[i]; }
+    inline const int & operator[](const int i) const
+      { return ((int *)&(v))[i]; }
    operator __m512i() const { return v;}
  };

+  struct SIMD256_int {
+    __m256i v;
+    SIMD256_int() {}
+    SIMD256_int(const __m256i in) : v(in) {}
+    SIMD256_int(const int in) : v(_mm256_set1_epi32(in)) {}
+    inline int & operator[](const int i) { return ((int *)&(v))[i]; }
+    inline const int & operator[](const int i) const
+      { return ((int *)&(v))[i]; }
+#ifdef __INTEL_LLVM_COMPILER
+    inline SIMD256_int operator&=(const int i)
+      { v=_mm256_and_epi32(v, _mm256_set1_epi32(i)); return *this; };
+#else
+    inline SIMD256_int operator&=(const int i)
+      { v=_mm256_and_si256(v, _mm256_set1_epi32(i)); return *this; };
+#endif
+    inline SIMD256_int operator+=(const int i)
+      { v=_mm256_add_epi32(v, _mm256_set1_epi32(i)); return *this; };
+    operator __m256i() const { return v;}
+  };
+
  struct SIMD_float {
    __m512 v;
    SIMD_float() {}
@ -64,7 +89,24 @@ namespace ip_simd {
    __m512d v;
    SIMD_double() {}
    SIMD_double(const __m512d in) : v(in) {}
+    SIMD_double(const double in) { v=_mm512_set1_pd(in); }
+    inline double & operator[](const int i) { return ((double *)&(v))[i]; }
+    inline const double & operator[](const int i) const
+      { return ((double *)&(v))[i]; }
    operator __m512d() const { return v;}
+
+    SIMD_double & operator=(const double i)
+      { _mm512_set1_pd(i); return *this; }
+    SIMD_double &operator=(const SIMD_double &i)
+      { v = i.v; return *this; }
+
+    SIMD_double operator-() { return _mm512_xor_pd(v, _mm512_set1_pd(-0.0)); }
+    SIMD_double & operator+=(const SIMD_double & two)
+      { v = _mm512_add_pd(v, two.v); return *this; }
+    SIMD_double & operator-=(const SIMD_double & two)
+      { v = _mm512_sub_pd(v, two.v); return *this; }
+    SIMD_double & operator*=(const SIMD_double & two)
+      { v = _mm512_mul_pd(v, two.v); return *this; }
  };

  template<class flt_t>
@ -99,6 +141,12 @@ namespace ip_simd {

  // ------- Set Operations

+  inline SIMD256_int SIMD256_set(const int l0, const int l1, const int l2,
+                                 const int l3, const int l4, const int l5,
+                                 const int l6, const int l7) {
+    return _mm256_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7);
+  }
+
  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
                           const int l3, const int l4, const int l5,
                           const int l6, const int l7, const int l8,
@ -109,6 +157,10 @@ namespace ip_simd {
                             l8,l9,l10,l11,l12,l13,l14,l15);
  }

+  inline SIMD256_int SIMD256_set(const int l) {
+    return _mm256_set1_epi32(l);
+  }
+
  inline SIMD_int SIMD_set(const int l) {
    return _mm512_set1_epi32(l);
  }
@ -121,6 +173,10 @@ namespace ip_simd {
    return _mm512_set1_pd(l);
  }

+  inline SIMD256_int SIMD256_count() {
+    return SIMD256_set(0,1,2,3,4,5,6,7);
+  }
+
  inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
    return _mm512_maskz_mov_epi32(m, one);
  }
@ -147,6 +203,10 @@ namespace ip_simd {

  // -------- Load Operations

+  inline SIMD256_int SIMD_load(const SIMD256_int *p) {
+    return _mm256_load_epi32((int *)p);
+  }
+
  inline SIMD_int SIMD_load(const int *p) {
    return _mm512_load_epi32(p);
  }
@ -159,6 +219,10 @@ namespace ip_simd {
    return _mm512_load_pd(p);
  }

+  inline SIMD_double SIMD_load(const SIMD_double *p) {
+    return _mm512_load_pd((double *)p);
+  }
+
  inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
    return _mm512_maskz_load_epi32(m, p);
  }
@ -171,6 +235,10 @@ namespace ip_simd {
    return _mm512_maskz_load_pd(m, p);
  }

+  inline SIMD256_int SIMD_gather(const int *p, const SIMD256_int &i) {
+    return _mm256_i32gather_epi32(p, i, _MM_SCALE_4);
+  }
+
  inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) {
    return _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
  }
@ -179,6 +247,10 @@ namespace ip_simd {
    return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
  }

+  inline SIMD_double SIMD_gather(const double *p, const SIMD256_int &i) {
+    return _mm512_i32gather_pd(i, p, _MM_SCALE_8);
+  }
+
  inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
    return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }
@ -201,6 +273,12 @@ namespace ip_simd {
                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }

+  inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
+                                 const SIMD256_int &i) {
+    return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
+                                    i, p, _MM_SCALE_8);
+  }
+
  template <typename T>
  inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
                                      const SIMD_int &i) {
@ -252,6 +330,15 @@ namespace ip_simd {
    return _mm512_store_pd(p,one);
  }

+  inline void SIMD_store(SIMD_double *p, const SIMD_double &one) {
+    return _mm512_store_pd((double *)p,one);
+  }
+
+  inline void SIMD_scatter(const SIMD_mask &m, int *p,
+                           const SIMD256_int &i, const SIMD256_int &vec) {
+    _mm256_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
+  }
+
  inline void SIMD_scatter(const SIMD_mask &m, int *p,
                           const SIMD_int &i, const SIMD_int &vec) {
    _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
@ -268,8 +355,22 @@ namespace ip_simd {
                              _MM_SCALE_8);
  }

+  inline void SIMD_scatter(const SIMD_mask &m, double *p,
+                           const SIMD256_int &i, const SIMD_double &vec) {
+    _mm512_mask_i32scatter_pd(p, m, i, vec, _MM_SCALE_8);
+  }
+
+  inline void SIMD_scatter(double *p,
+                           const SIMD256_int &i, const SIMD_double &vec) {
+    _mm512_i32scatter_pd(p, i, vec, _MM_SCALE_8);
+  }
+
  // ------- Arithmetic Operations

+  inline SIMD256_int operator+(const SIMD256_int &one, const SIMD256_int &two) {
+    return _mm256_add_epi32(one,two);
+  }
+
  inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_add_epi32(one,two);
  }
@ -286,6 +387,10 @@ namespace ip_simd {
    return _mm512_add_epi32(one,SIMD_set(two));
  }

+  inline SIMD256_int operator+(const SIMD256_int &one, const int two) {
+    return _mm256_add_epi32(one,SIMD256_set(two));
+  }
+
  inline SIMD_float operator+(const SIMD_float &one, const float two) {
    return _mm512_add_ps(one,SIMD_set(two));
  }
@ -299,6 +404,11 @@ namespace ip_simd {
    return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
  }

+  inline SIMD256_int SIMD_add(const SIMD_mask &m,
+                           const SIMD256_int &one, const int two) {
+    return _mm256_mask_add_epi32(one,m,one,SIMD256_set(two));
+  }
+
  inline SIMD_float SIMD_add(const SIMD_mask &m,
                             const SIMD_float &one, const float two) {
    return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
@ -309,6 +419,11 @@ namespace ip_simd {
    return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
  }

+  inline SIMD_double SIMD_add(const SIMD_mask &m,
+                              const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_mask_add_pd(one,m,one,two);
+  }
+
  inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_add_epi32(s,m,one,two);
@ -387,6 +502,10 @@ namespace ip_simd {
    return _mm512_mul_pd(one,two);
  }

+  inline SIMD256_int operator*(const SIMD256_int &one, const int two) {
+    return _mm256_mullo_epi32(one,SIMD256_set(two));
+  }
+
  inline SIMD_int operator*(const SIMD_int &one, const int two) {
    return _mm512_mullo_epi32(one,SIMD_set(two));
  }
@ -417,6 +536,12 @@ namespace ip_simd {
    return _mm512_fmadd_pd(one,two,three);
  }

+  inline SIMD_double SIMD_fma(const SIMD_mask m, const SIMD_double &one,
+                              const SIMD_double &two,
+                              const SIMD_double &three) {
+    return _mm512_mask3_fmadd_pd(one,two,three,m);
+  }
+
  inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmsub_ps(one,two,three);
@ -493,6 +618,10 @@ namespace ip_simd {
    return _mm512_pow_pd(one, two);
  }

+  inline SIMD_double SIMD_pow(const SIMD_double &one, const double two) {
+    return _mm512_pow_pd(one, SIMD_set(two));
+  }
+
  inline SIMD_float SIMD_exp(const SIMD_float &one) {
    return _mm512_exp_ps(one);
  }
@ -501,6 +630,18 @@ namespace ip_simd {
    return _mm512_exp_pd(one);
  }

+  inline SIMD_double SIMD_cos(const SIMD_double &one) {
+    return _mm512_cos_pd(one);
+  }
+
+  inline SIMD_double SIMD_sin(const SIMD_double &one) {
+    return _mm512_sin_pd(one);
+  }
+
+  inline SIMD_double SIMD_tan(const SIMD_double &one) {
+    return _mm512_tan_pd(one);
+  }
+
  // ------- Comparison operations

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
@ -533,6 +674,14 @@ namespace ip_simd {
    return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
  }

+  inline SIMD_mask operator<(const SIMD256_int &one, const SIMD256_int &two) {
+    return _mm256_cmplt_epi32_mask(one,two);
+  }
+
+  inline SIMD_mask operator<(const int one, const SIMD256_int &two) {
+    return _mm256_cmplt_epi32_mask(SIMD256_set(one),two);
+  }
+
  inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmplt_epi32_mask(one,two);
  }
@ -577,6 +726,10 @@ namespace ip_simd {
    return _mm512_cmple_ps_mask(SIMD_set(one), two);
  }

+  inline SIMD_mask operator<=(const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_cmple_pd_mask(one, two);
+  }
+
  inline SIMD_mask operator<=(const double one, const SIMD_double &two) {
    return _mm512_cmple_pd_mask(SIMD_set(one), two);
  }
@ -593,6 +746,14 @@ namespace ip_simd {
    return _mm512_cmplt_pd_mask(two,one);
  }

+  inline SIMD_mask operator>(const SIMD_double &one, const double two) {
+    return _mm512_cmplt_pd_mask(SIMD_set(two),one);
+  }
+
+  inline SIMD_mask operator==(const SIMD256_int &one, const int two) {
+    return _mm256_cmpeq_epi32_mask(one,_mm256_set1_epi32(two));
+  }
+
  inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmpeq_epi32_mask(one,two);
  }
--- a/src/INTEL/pair_snap_intel.cpp
+++ b/src/INTEL/pair_snap_intel.cpp
@ -0,0 +1,779 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#include "pair_snap_intel.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+#include "sna_intel.h"
+#include "tokenizer.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace ip_simd;
+
+#define MAXLINE 1024
+#define MAXWORD 3
+
+/* ---------------------------------------------------------------------- */
+
+PairSNAPIntel::PairSNAPIntel(LAMMPS *lmp) : Pair(lmp)
+{
+  single_enable = 0;
+  restartinfo = 0;
+  one_coeff = 1;
+  manybody_flag = 1;
+  centroidstressflag = CENTROID_NOTAVAIL;
+
+  radelem = nullptr;
+  wjelem = nullptr;
+  coeffelem = nullptr;
+  sinnerelem = nullptr;
+  dinnerelem = nullptr;
+
+  beta = nullptr;
+  bispectrum = nullptr;
+  snaptr = nullptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairSNAPIntel::~PairSNAPIntel()
+{
+  if (copymode) return;
+
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(coeffelem);
+  memory->destroy(sinnerelem);
+  memory->destroy(dinnerelem);
+
+  memory->destroy(beta);
+  memory->destroy(bispectrum);
+
+  delete snaptr;
+
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+    memory->destroy(scale);
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   This version is a straightforward implementation
+   ---------------------------------------------------------------------- */
+
+void PairSNAPIntel::compute(int eflag, int vflag)
+{
+  SNA_DVEC fij[3];
+  int *jlist,*numneigh,**firstneigh;
+
+  ev_init(eflag,vflag);
+  int tally_xyz = 0;
+  if (vflag_atom || (vflag && !vflag_fdotr)) tally_xyz = 1;
+
+  double **x = atom->x;
+  double *_x = atom->x[0];
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+
+  // compute dE_i/dB_i = beta_i for all i in list
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  SNA_DVEC sevdwl(0);
+
+  const int vw = snaptr->vector_width();
+  for (int ii = 0; ii < list->inum; ii+=vw) {
+    SNA_IVEC i, jnum;
+    int max_jnum = 0;
+    for (int l = 0; l < vw; l++) {
+      if (ii + l < list->inum) {
+        i[l] = list->ilist[ii + l];
+        jnum[l] = numneigh[i[l]];
+      } else {
+        i[l] = list->ilist[0];
+        jnum[l] = 0;
+      }
+      if (jnum[l] > max_jnum) max_jnum = jnum[l];
+    }
+
+    // ensure rij, inside, wj, and rcutij are of size jnum
+
+    snaptr->grow_rij(max_jnum);
+
+    SNA_IVEC zero_vec(0);
+
+    const SNA_DVEC xtmp = SIMD_gather(_x, i * 3);
+    const SNA_DVEC ytmp = SIMD_gather(_x, i * 3 + 1);
+    const SNA_DVEC ztmp = SIMD_gather(_x, i * 3 + 2);
+    const SNA_IVEC itype = SIMD_gather(type, i);
+    const SNA_IVEC ielem = SIMD_gather(map, itype);
+    const SNA_DVEC radi = SIMD_gather(radelem, ielem);
+
+    // rij[][3] = displacements between atom I and those neighbors
+    // inside = indices of neighbors of I within cutoff
+    // wj = weights for neighbors of I within cutoff
+    // rcutij = cutoffs for neighbors of I within cutoff
+    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
+
+    SNA_IVEC ninside(0);
+    for (int jj = 0; jj < max_jnum; jj++) {
+      SIMD_mask m(SIMD256_set(jj) < jnum);
+
+      SNA_IVEC j;
+      SV_for (int l = 0; l < vw; l++) {
+        jlist = firstneigh[i[l]];
+        if (jj < jnum[l]) j[l] = jlist[jj];
+        else j[l] = 0;
+      }
+      j &= NEIGHMASK;
+
+      const SNA_DVEC delx = SIMD_gather(m, _x, j * 3) - xtmp;
+      const SNA_DVEC dely = SIMD_gather(m, _x, j * 3 + 1) - ytmp;
+      const SNA_DVEC delz = SIMD_gather(m, _x, j * 3 + 2) - ztmp;
+      const SNA_IVEC jtype = SIMD_gather(type, j);
+      const SNA_DVEC rsq = delx*delx + dely*dely + delz*delz;
+      const SNA_DVEC vcut = SIMD_gather(m, cutsq[0],
+                                        itype * (atom->ntypes + 1) + jtype);
+
+      m &= rsq < vcut;
+      m &= rsq > SIMD_set(1e-20);
+      const SNA_IVEC jelem = SIMD_gather(map, jtype);
+      const SNA_IVEC ni3 = ninside * vw * 3 + SIMD256_count();
+      SIMD_scatter(m, (double *)(snaptr->rij[0]), ni3, delx);
+      SIMD_scatter(m, (double *)(snaptr->rij[0] + 1), ni3, dely);
+      SIMD_scatter(m, (double *)(snaptr->rij[0] + 2), ni3, delz);
+      const SNA_IVEC ni = ninside * vw + SIMD256_count();
+      SIMD_scatter(m, (int *)(snaptr->inside), ni, j);
+      SIMD_scatter(m, (double *)(snaptr->wj), ni,
+                   SIMD_gather(m, wjelem, jelem));
+      SIMD_scatter(m, (double *)(snaptr->rcutij), ni,
+                   (radi + SIMD_gather(m, radelem, jelem)) * rcutfac);
+      if (switchinnerflag) {
+        SIMD_scatter(m, (double *)(snaptr->sinnerij), ni,
+                     (SIMD_gather(m, sinnerelem, ielem) +
+                      SIMD_gather(m, sinnerelem, jelem)) * 0.5);
+        SIMD_scatter(m, (double *)(snaptr->dinnerij), ni,
+                     (SIMD_gather(m, dinnerelem, ielem) +
+                      SIMD_gather(m, dinnerelem, jelem)) * 0.5);
+      }
+      if (chemflag)
+        SIMD_scatter(m, (int *)(snaptr->element), ni, jelem);
+      ninside = SIMD_add(m, ninside, 1);
+    } // for jj
+
+    // compute Ui, Yi for atom I
+
+    if (chemflag)
+      snaptr->compute_ui(ninside, ielem, max_jnum);
+    else
+      snaptr->compute_ui(ninside, zero_vec, max_jnum);
+
+    // Compute bispectrum
+    if (quadraticflag || eflag) {
+      snaptr->compute_zi_or_yi<0>(beta);
+      if (chemflag)
+        snaptr->compute_bi(ielem);
+      else
+        snaptr->compute_bi(zero_vec);
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        SIMD_store(bispectrum + icoeff, SIMD_load(snaptr->blist + icoeff));
+    }
+
+    // Compute beta
+    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      SIMD_store(beta + icoeff, SIMD_gather(coeffelem[0],
+                                            ielem * ncoeffall + icoeff + 1));
+
+    if (quadraticflag) {
+      int k = ncoeff+1;
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+        SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
+        SNA_DVEC beta_i = SIMD_load(beta + icoeff) +
+          SIMD_gather(coeffelem[0], ielem * ncoeffall + k) * bveci;
+        k++;
+        for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+          const SNA_DVEC ci = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+          beta_i = beta_i + ci * SIMD_load(bispectrum + jcoeff);
+          SIMD_store(beta + jcoeff, ci * bveci + SIMD_load(beta + jcoeff));
+          k++;
+        }
+        SIMD_store(beta + icoeff, beta_i);
+      }
+    }
+
+    // for neighbors of I within cutoff:
+    // compute Fij = dEi/dRj = -dEi/dRi
+    // add to Fi, subtract from Fj
+    // scaling is that for type I
+
+    if (quadraticflag || eflag)
+      snaptr->compute_yi_from_zi(beta);
+    else
+      snaptr->compute_zi_or_yi<1>(beta);
+
+    SNA_DVEC fi_x(0.0), fi_y(0.0), fi_z(0.0);
+    SNA_DVEC scalev = SIMD_gather(scale[0], itype * (atom->ntypes+1) + itype);
+    for (int jj = 0; jj < max_jnum; jj++) {
+      snaptr->compute_duidrj(jj, ninside);
+      if (chemflag && nelements > 1)
+        snaptr->compute_deidrj_e(jj, ninside, fij);
+      else
+        snaptr->compute_deidrj(jj, ninside, fij);
+
+      SNA_DVEC fijs_x = fij[0] * scalev;
+      SNA_DVEC fijs_y = fij[1] * scalev;
+      SNA_DVEC fijs_z = fij[2] * scalev;
+
+      fi_x += fijs_x;
+      fi_y += fijs_y;
+      fi_z += fijs_z;
+
+      for (int l = 0; l < vw; l++) {
+        if (jj < ninside[l]) {
+          int j = snaptr->inside[jj][l];
+          f[j][0] -= fijs_x[l];
+          f[j][1] -= fijs_y[l];
+          f[j][2] -= fijs_z[l];
+
+          if (tally_xyz)
+            ev_tally_xyz(i[l],j,nlocal,newton_pair,0.0,0.0,
+                         fij[0][l],fij[1][l],fij[2][l],
+                         -snaptr->rij[jj][0][l],-snaptr->rij[jj][1][l],
+                         -snaptr->rij[jj][2][l]);
+        }
+      } // for l
+    } // for jj
+    SIMD_mask m((SIMD256_count() + ii) < list->inum);
+    SNA_DVEC fix = SIMD_gather(m, f[0], i * 3) +  fi_x;
+    SIMD_scatter(m, f[0], i * 3, fix);
+    SNA_DVEC fiy = SIMD_gather(m, f[0], i * 3 + 1) +  fi_y;
+    SIMD_scatter(m, f[0], i * 3 + 1, fiy);
+    SNA_DVEC fiz = SIMD_gather(m, f[0], i * 3 + 2) +  fi_z;
+    SIMD_scatter(m, f[0], i * 3 + 2, fiz);
+
+    // tally energy contribution
+
+    if (eflag) {
+      SNA_DVEC evdwl = SIMD_gather(coeffelem[0], ielem * ncoeffall);
+      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+        evdwl += SIMD_gather(coeffelem[0], ielem * ncoeffall + icoeff +1) *
+          bispectrum[icoeff];
+
+      if (quadraticflag) {
+        int k = ncoeff+1;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
+          SNA_DVEC c = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+          k++;
+          evdwl += c * 0.5 * bveci * bveci;
+          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
+            SNA_DVEC bvecj = SIMD_load(bispectrum + jcoeff);
+            SNA_DVEC cj = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
+            k++;
+            evdwl += cj * bveci * bvecj;
+          }
+        }
+      }
+      sevdwl += scalev * evdwl;
+      if (eatom) {
+        SNA_DVEC ea = SIMD_gather(m, eatom, i) + scalev * evdwl;
+        SIMD_scatter(m, eatom, i, ea);
+      }
+    } // if (eflag)
+  } // for ii
+  if (eflag) eng_vdwl += SIMD_sum(sevdwl);
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+  memory->create(scale,n+1,n+1,"pair:scale");
+  map = new int[n+1];
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::settings(int narg, char ** /* arg */)
+{
+  if (narg > 0)
+    error->all(FLERR,"Illegal pair_style command");
+  if ((comm->me == 0) && (comm->nthreads > 1))
+    error->warning(FLERR, "Pair style snap/intel does not use OpenMP threads");
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+  if (narg != 4 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients");
+
+  map_element2type(narg-4,arg+4);
+
+  // read snapcoeff and snapparam files
+
+  read_files(arg[2],arg[3]);
+
+  if (!quadraticflag)
+    ncoeff = ncoeffall - 1;
+  else {
+
+    // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2
+    // so, ncoeff = floor(sqrt(2*ncoeffall))-1
+
+    ncoeff = sqrt(2.0*ncoeffall)-1;
+    ncoeffq = (ncoeff*(ncoeff+1))/2;
+    int ntmp = 1+ncoeff+ncoeffq;
+    if (ntmp != ncoeffall) {
+      error->all(FLERR,"Incorrect SNAP coeff file");
+    }
+  }
+
+  snaptr = new SNAIntel(lmp, rfac0, twojmax,
+                        rmin0, switchflag, bzeroflag,
+                        chemflag, bnormflag, wselfallflag,
+                        nelements, switchinnerflag);
+
+  if (ncoeff != snaptr->ncoeff) {
+    if (comm->me == 0)
+      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
+    error->all(FLERR,"Incorrect SNAP parameter file");
+  }
+
+  // Calculate maximum cutoff for all elements
+  rcutmax = 0.0;
+  for (int ielem = 0; ielem < nelements; ielem++)
+    rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax);
+
+  // set default scaling
+  int n = atom->ntypes;
+  for (int ii = 0; ii < n+1; ii++)
+    for (int jj = 0; jj < n+1; jj++)
+      scale[ii][jj] = 1.0;
+
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSNAPIntel::init_style()
+{
+  if (force->newton_pair == 0)
+    error->all(FLERR,"Pair style SNAP requires newton pair on");
+
+  // need a full neighbor list
+
+  neighbor->add_request(this, NeighConst::REQ_FULL);
+
+  snaptr->init();
+
+  fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
+  if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
+
+  fix->pair_init_check();
+
+  memory->create(bispectrum,ncoeff,"PairSNAP:bispectrum");
+  memory->create(beta,ncoeff,"PairSNAP:beta");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSNAPIntel::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  scale[j][i] = scale[i][j];
+  return (radelem[map[i]] +
+          radelem[map[j]])*rcutfac;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSNAPIntel::read_files(char *coefffilename, char *paramfilename)
+{
+
+  // open SNAP coefficient file on proc 0
+
+  FILE *fpcoeff;
+  if (comm->me == 0) {
+    fpcoeff = utils::open_potential(coefffilename,lmp,nullptr);
+    if (fpcoeff == nullptr)
+      error->one(FLERR,"Cannot open SNAP coefficient file {}: ",
+                                   coefffilename, utils::getsyserror());
+  }
+
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+  int nwords = 0;
+  while (nwords == 0) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpcoeff);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    nwords = utils::count_words(utils::trim_comment(line));
+  }
+  if (nwords != 2)
+    error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+  // strip single and double quotes from words
+
+  int nelemtmp = 0;
+  try {
+    ValueTokenizer words(utils::trim_comment(line),"\"' \t\n\r\f");
+    nelemtmp = words.next_int();
+    ncoeffall = words.next_int();
+  } catch (TokenizerException &e) {
+    error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
+  }
+
+  // clean out old arrays and set up element lists
+
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(coeffelem);
+  memory->destroy(sinnerelem);
+  memory->destroy(dinnerelem);
+  memory->create(radelem,nelements,"pair:radelem");
+  memory->create(wjelem,nelements,"pair:wjelem");
+  memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
+  memory->create(sinnerelem,nelements,"pair:sinnerelem");
+  memory->create(dinnerelem,nelements,"pair:dinnerelem");
+
+  // initialize checklist for all required nelements
+
+  int *elementflags = new int[nelements];
+  for (int jelem = 0; jelem < nelements; jelem++)
+      elementflags[jelem] = 0;
+
+  // loop over nelemtmp blocks in the SNAP coefficient file
+
+  for (int ielem = 0; ielem < nelemtmp; ielem++) {
+
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpcoeff);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpcoeff);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    std::vector<std::string> words;
+    try {
+      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
+    } catch (TokenizerException &) {
+      // ignore
+    }
+    if (words.size() != 3)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+    int jelem;
+    for (jelem = 0; jelem < nelements; jelem++)
+      if (words[0] == elements[jelem]) break;
+
+    // if this element not needed, skip this block
+
+    if (jelem == nelements) {
+      if (comm->me == 0) {
+        for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
+          ptr = fgets(line,MAXLINE,fpcoeff);
+          if (ptr == nullptr) {
+            eof = 1;
+            fclose(fpcoeff);
+          }
+        }
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+      continue;
+    }
+
+    if (elementflags[jelem] == 1)
+      error->all(FLERR,"Incorrect format in SNAP coefficient file");
+    else
+      elementflags[jelem] = 1;
+
+    radelem[jelem] = utils::numeric(FLERR,words[1],false,lmp);
+    wjelem[jelem] = utils::numeric(FLERR,words[2],false,lmp);
+
+    if (comm->me == 0)
+      utils::logmesg(lmp,"SNAP Element = {}, Radius {}, Weight {}\n",
+                     elements[jelem], radelem[jelem], wjelem[jelem]);
+
+    for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
+      if (comm->me == 0) {
+        ptr = fgets(line,MAXLINE,fpcoeff);
+        if (ptr == nullptr) {
+          eof = 1;
+          fclose(fpcoeff);
+        }
+      }
+
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof)
+        error->all(FLERR,"Incorrect format in SNAP coefficient file");
+      MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+      try {
+        ValueTokenizer coeff(utils::trim_comment(line));
+        if (coeff.count() != 1)
+          error->all(FLERR,"Incorrect format in SNAP coefficient file");
+
+        coeffelem[jelem][icoeff] = coeff.next_double();
+      } catch (TokenizerException &e) {
+        error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
+      }
+    }
+  }
+
+  if (comm->me == 0) fclose(fpcoeff);
+
+  for (int jelem = 0; jelem < nelements; jelem++) {
+    if (elementflags[jelem] == 0)
+      error->all(FLERR,"Element {} not found in SNAP coefficient file", elements[jelem]);
+  }
+  delete[] elementflags;
+
+  // set flags for required keywords
+
+  rcutfacflag = 0;
+  twojmaxflag = 0;
+
+  // Set defaults for optional keywords
+
+  rfac0 = 0.99363;
+  rmin0 = 0.0;
+  switchflag = 1;
+  bzeroflag = 1;
+  quadraticflag = 0;
+  chemflag = 0;
+  bnormflag = 0;
+  wselfallflag = 0;
+  switchinnerflag = 0;
+  chunksize = 32768;
+  parallel_thresh = 8192;
+
+  // set local input checks
+
+  int sinnerflag = 0;
+  int dinnerflag = 0;
+
+  // open SNAP parameter file on proc 0
+
+  FILE *fpparam;
+  if (comm->me == 0) {
+    fpparam = utils::open_potential(paramfilename,lmp,nullptr);
+    if (fpparam == nullptr)
+      error->one(FLERR,"Cannot open SNAP parameter file {}: {}",
+                                   paramfilename, utils::getsyserror());
+  }
+
+  eof = 0;
+  while (true) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fpparam);
+      if (ptr == nullptr) {
+        eof = 1;
+        fclose(fpparam);
+      }
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
+
+    // words = ptrs to all words in line
+    // strip single and double quotes from words
+
+    std::vector<std::string> words;
+    try {
+      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
+    } catch (TokenizerException &) {
+      // ignore
+    }
+
+    if (words.size() == 0) continue;
+
+    if (words.size() < 2)
+      error->all(FLERR,"Incorrect format in SNAP parameter file");
+
+    auto keywd = words[0];
+    auto keyval = words[1];
+
+    // check for keywords with more than one value per element
+
+    if (keywd == "sinner" || keywd == "dinner") {
+
+      if ((int)words.size() != nelements+1)
+        error->all(FLERR,"Incorrect SNAP parameter file");
+
+      // innerlogstr collects all values of sinner or dinner for log output below
+
+      std::string innerlogstr;
+
+      int iword = 1;
+
+      if (keywd == "sinner") {
+        for (int ielem = 0; ielem < nelements; ielem++) {
+          keyval = words[iword];
+          sinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
+          iword++;
+          innerlogstr += keyval + " ";
+        }
+        sinnerflag = 1;
+      } else if (keywd == "dinner") {
+        for (int ielem = 0; ielem < nelements; ielem++) {
+          keyval = words[iword];
+          dinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
+          iword++;
+          innerlogstr += keyval + " ";
+        }
+        dinnerflag = 1;
+      }
+
+      if (comm->me == 0)
+        utils::logmesg(lmp,"SNAP keyword {} {} ... \n", keywd, innerlogstr);
+
+    } else {
+
+      // all other keywords take one value
+
+      if (nwords != 2)
+        error->all(FLERR,"Incorrect SNAP parameter file");
+
+      if (comm->me == 0)
+        utils::logmesg(lmp,"SNAP keyword {} {}\n",keywd,keyval);
+
+      if (keywd == "rcutfac") {
+        rcutfac = utils::numeric(FLERR,keyval,false,lmp);
+        rcutfacflag = 1;
+      } else if (keywd == "twojmax") {
+        twojmax = utils::inumeric(FLERR,keyval,false,lmp);
+        twojmaxflag = 1;
+      } else if (keywd == "rfac0")
+        rfac0 = utils::numeric(FLERR,keyval,false,lmp);
+      else if (keywd == "rmin0")
+        rmin0 = utils::numeric(FLERR,keyval,false,lmp);
+      else if (keywd == "switchflag")
+        switchflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "bzeroflag")
+        bzeroflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "quadraticflag")
+        quadraticflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "chemflag")
+        chemflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "bnormflag")
+        bnormflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "wselfallflag")
+        wselfallflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "switchinnerflag")
+        switchinnerflag = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "chunksize")
+        chunksize = utils::inumeric(FLERR,keyval,false,lmp);
+      else if (keywd == "parallelthresh")
+        parallel_thresh = utils::inumeric(FLERR,keyval,false,lmp);
+      else
+        error->all(FLERR,"Unknown parameter '{}' in SNAP parameter file", keywd);
+    }
+  }
+
+  if (rcutfacflag == 0 || twojmaxflag == 0)
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (chemflag && nelemtmp != nelements)
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (switchinnerflag && !(sinnerflag && dinnerflag))
+    error->all(FLERR,"Incorrect SNAP parameter file");
+
+  if (!switchinnerflag && (sinnerflag || dinnerflag))
+    error->all(FLERR,"Incorrect SNAP parameter file");
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double PairSNAPIntel::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+
+  int n = atom->ntypes+1;
+  bytes += (double)n*n*sizeof(int);         // setflag
+  bytes += (double)n*n*sizeof(double);      // cutsq
+  bytes += (double)n*n*sizeof(double);      // scale
+  bytes += (double)n*sizeof(int);           // map
+  bytes += (double)ncoeff*sizeof(SNA_DVEC); // bispectrum
+  bytes += (double)ncoeff*sizeof(SNA_DVEC); // beta
+
+  bytes += snaptr->memory_usage(); // SNA object
+
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *PairSNAPIntel::extract(const char *str, int &dim)
+{
+  dim = 2;
+  if (strcmp(str,"scale") == 0) return (void *) scale;
+  return nullptr;
+}
+
+#endif
+#endif
--- a/src/INTEL/pair_snap_intel.h
+++ b/src/INTEL/pair_snap_intel.h
@ -0,0 +1,83 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(snap/intel,PairSNAPIntel);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_SNAP_INTEL_H
+#define LMP_PAIR_SNAP_INTEL_H
+
+#include "fix_intel.h"
+#include "pair.h"
+
+namespace ip_simd { class SIMD_double; class SIMD_int; };
+#define SNA_DVEC ip_simd::SIMD_double
+#define SNA_IVEC ip_simd::SIMD256_int
+
+namespace LAMMPS_NS {
+
+class PairSNAPIntel : public Pair {
+ public:
+  PairSNAPIntel(class LAMMPS *);
+  ~PairSNAPIntel() override;
+  void compute(int, int) override;
+  void settings(int, char **) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+  double memory_usage() override;
+  void *extract(const char *, int &) override;
+
+  double rcutfac, quadraticflag;    // declared public to workaround gcc 4.9
+  int ncoeff;                       //  compiler bug, manifest in KOKKOS package
+
+ protected:
+  FixIntel *fix;
+
+  int ncoeffq, ncoeffall;
+  class SNAIntel *snaptr;
+  virtual void allocate();
+  void read_files(char *, char *);
+  inline int equal(double *x, double *y);
+  inline double dist2(double *x, double *y);
+
+  double rcutmax;         // max cutoff for all elements
+  double *radelem;        // element radii
+  double *wjelem;         // elements weights
+  double **coeffelem;     // element bispectrum coefficients
+  SNA_DVEC *beta;          // betas for all atoms in list
+  SNA_DVEC *bispectrum;    // bispectrum components for all atoms in list
+  double **scale;         // for thermodynamic integration
+  int twojmax, switchflag, bzeroflag, bnormflag;
+  int chemflag, wselfallflag;
+  int switchinnerflag;    // inner cutoff switch
+  double *sinnerelem;     // element inner cutoff midpoint
+  double *dinnerelem;     // element inner cutoff half-width
+  int chunksize, parallel_thresh;
+  double rfac0, rmin0, wj1, wj2;
+  int rcutfacflag, twojmaxflag;    // flags for required parameters
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+#endif
+#endif
--- a/src/INTEL/sna_intel.cpp
+++ b/src/INTEL/sna_intel.cpp
--- a/src/INTEL/sna_intel.h
+++ b/src/INTEL/sna_intel.h
@ -0,0 +1,187 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown, Intel
+------------------------------------------------------------------------- */
+
+#ifndef LMP_SNA_INTEL_H
+#define LMP_SNA_INTEL_H
+
+#if defined(__AVX512F__)
+#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
+
+#include "pointers.h"
+#include "intel_buffers.h"
+#include "intel_simd.h"
+
+#define SVW 8
+
+#if defined(LMP_SIMD_COMPILER)
+#if defined(USE_OMP_SIMD)
+#define SV_for _Pragma("omp simd") _Pragma("vector aligned") for
+#else
+#define SV_for _Pragma("simd assert") _Pragma("vector aligned") for
+#endif
+#else
+#define SV_for for
+#endif
+
+namespace LAMMPS_NS {
+
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min;
+  int mb2max, na, nb, jju;
+};
+
+struct SNA_BINDICES {
+  int j1, j2, j;
+};
+
+#define SNA_DVEC ip_simd::SIMD_double
+#define SNA_IVEC ip_simd::SIMD256_int
+
+class SNAIntel : protected Pointers {
+
+ public:
+  SNAIntel(LAMMPS *, double, int, double, int, int, int, int, int, int, int);
+
+  SNAIntel(LAMMPS *lmp) : Pointers(lmp){};
+  ~SNAIntel() override;
+  void build_indexlist();
+  void init();
+  double memory_usage();
+
+  int ncoeff;
+
+  inline int vector_width() const { return SVW; }
+
+  // functions for bispectrum coefficients
+
+  void compute_ui(const SNA_IVEC &, const SNA_IVEC &, const int max_jnum);
+  template <int> void compute_zi_or_yi(const SNA_DVEC *);
+  void compute_yi_from_zi(const SNA_DVEC *);
+  void compute_yterm(int, int, int, const double *);
+  void compute_bi(const SNA_IVEC &);
+
+  // functions for derivatives
+
+  void compute_duidrj(const int, const SNA_IVEC &);
+  void compute_deidrj_e(const int, const SNA_IVEC &, SNA_DVEC *);
+  void compute_deidrj(const int, const SNA_IVEC &, SNA_DVEC *);
+  double compute_sfac(double, double, double, double);
+  SNA_DVEC compute_sfac(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                        const SNA_DVEC &);
+  inline SNA_DVEC compute_sfac_dsfac(const SNA_DVEC &, const SNA_DVEC &,
+                                     const SNA_DVEC &, const SNA_DVEC &,
+                                     SNA_DVEC &);
+
+  // public bispectrum data
+
+  int twojmax;
+  SNA_DVEC *blist;
+  double **dblist;
+
+  // short neighbor list data
+
+  void grow_rij(int);
+  int nmax;    // allocated size of short lists
+
+  SNA_DVEC **rij;      // short rij list
+  SNA_IVEC *inside;       // short neighbor list
+  SNA_DVEC *wj;        // short weight list
+  SNA_DVEC *rcutij;    // short cutoff list
+
+  // only allocated for switch_inner_flag=1
+
+  SNA_DVEC *sinnerij;    // short inner cutoff midpoint list
+  SNA_DVEC *dinnerij;    // short inner half-width list
+
+  // only allocated for chem_flag=1
+
+  SNA_IVEC *element;    // short element list [0,nelements)
+
+ private:
+  double rmin0, rfac0;
+
+  // data for bispectrum coefficients
+
+  SNA_ZINDICES *idxz;
+  SNA_BINDICES *idxb;
+
+  double **rootpqarray;
+  double *cglist;
+  int ***idxcg_block;
+
+  SNA_DVEC *ulisttot_r, *ulisttot_i;
+  SNA_DVEC **ulist_r_ij, **ulist_i_ij;    // short u list
+  int *idxu_block;
+
+  SNA_DVEC *zlist_r, *zlist_i;
+  int ***idxz_block;
+
+  int ***idxb_block;
+
+  SNA_DVEC **dulist_r, **dulist_i;
+
+  SNA_DVEC *ylist_r, *ylist_i;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
+  void create_twojmax_arrays();
+  void destroy_twojmax_arrays();
+  void init_clebsch_gordan();
+  void print_clebsch_gordan();
+  void init_rootpqarray();
+  void zero_uarraytot(const SNA_IVEC &);
+  void add_uarraytot(const SNA_DVEC &, const int, const SNA_IVEC &);
+  void compute_uarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                      const SNA_DVEC &, const SNA_DVEC &, const int,
+                      const SNA_IVEC &);
+  double deltacg(int, int, int);
+  void compute_ncoeff();
+  void compute_duarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                       const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
+                       const SNA_DVEC &, const SNA_DVEC &, int,
+                       const SNA_IVEC &);
+  inline double choose_beta(const int, const int, const int,
+                            const int, const int, const int,  int &);
+
+  // Sets the style for the switching function
+  // 0 = none
+  // 1 = cosine
+  int switch_flag;
+
+  // Sets the style for the inner switching function
+  // 0 = none
+  // 1 = cosine
+  int switch_inner_flag;
+
+  // Self-weight
+  double wself;
+
+  int bzero_flag;       // 1 if bzero subtracted from barray
+  double *bzero;        // array of B values for isolated atoms
+  int bnorm_flag;       // 1 if barray divided by j+1
+  int chem_flag;        // 1 for multi-element bispectrum components
+  int wselfall_flag;    // 1 for adding wself to all element labelings
+  int nelements;        // number of elements
+  int ndoubles;         // number of multi-element pairs
+  int ntriples;         // number of multi-element triplets
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+#endif