Merge pull request #3921 from wmbrownIntel/snap-intel

Adding intel variant of snap pair style.
2023-10-10 11:02:43 -04:00
parent d97d14745e 508dbb74c5
commit c0ace4aa4b
10 changed files with 2798 additions and 3 deletions
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@ -265,7 +265,7 @@ OPT.
   * :doc:`smd/tri_surface <pair_smd_triangulated_surface>`
   * :doc:`smd/ulsph <pair_smd_ulsph>`
   * :doc:`smtbq <pair_smtbq>`
-   * :doc:`snap (k) <pair_snap>`
+   * :doc:`snap (ik) <pair_snap>`
   * :doc:`soft (go) <pair_soft>`
   * :doc:`sph/heatconduction <pair_sph_heatconduction>`
   * :doc:`sph/idealgas <pair_sph_idealgas>`
--- a/doc/src/pair_snap.rst
+++ b/doc/src/pair_snap.rst
@ -1,10 +1,11 @@
 .. index:: pair_style snap
 .. index:: pair_style snap/intel
 .. index:: pair_style snap/kk
 pair_style snap command
 =======================
-Accelerator Variants: *snap/kk*
+Accelerator Variants: *snap/intel*, *snap/kk*
 Syntax
 """"""
@ -260,6 +261,14 @@ This style is part of the ML-SNAP package.  It is only enabled if LAMMPS
 was built with that package.  See the :doc:`Build package
 <Build_package>` page for more info.
 The *snap/intel* accelerator variant will *only* be available if LAMMPS
 is built with Intel *compilers* and for CPUs with AVX-512 support.
 While the INTEL package in general allows multiple floating point
 precision modes to be selected, *snap/intel* will currently always use
 full double precision regardless of the precision mode selected.
 Additionally, the *intel* variant of snap will **NOT** use multiple
 threads with OpenMP.
 Related commands
 """"""""""""""""
--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -185,6 +185,7 @@ fi
 if (test $1 = "ML-SNAP") then
  depend ML-IAP
  depend KOKKOS
  depend INTEL
 fi
 if (test $1 = "CG-SPICA") then
--- a/src/INTEL/TEST/in.intel.snap
+++ b/src/INTEL/TEST/in.intel.snap
@ -0,0 +1,70 @@
 # Toy demonstration of SNAP "scale" parameter, using fix/adapt and hybrid/overlay
 # Mixing linear and quadratic SNAP Ni potentials by Zuo et al. JPCA 2020
 variable	w index 10	# Warmup Timesteps
 variable	t index 100	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
 variable	x index 4
 variable	y index 2
 variable	z index 2
 variable	rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 if "$n > 0"	then "processors * * * grid numa"
 # mixing parameter
 variable lambda equal 0.2
 # Initialize simulation
 variable a equal 3.52
 units           metal
 # generate the box and atom positions using a FCC lattice
 variable nx equal 20*$x
 variable ny equal 20*$y
 variable nz equal 20*$z
 boundary        p p p
 lattice         fcc $a
 region          box block 0 ${nx} 0 ${ny} 0 ${nz}
 create_box      1 box
 create_atoms    1 box
 mass 1 34.
 # choose bundled SNAP Ni potential from Zuo et al. JPCA 2020
 pair_style hybrid/overlay snap snap
 pair_coeff * * snap 1 &
    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapcoeff &
    ${root}/examples/snap/Ni_Zuo_JPCA2020.snapparam Ni
 pair_coeff * * snap 2 &
    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapcoeff &
    ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapparam Ni
 # scale according to mixing parameter
 variable l1 equal ${lambda}
 variable l2 equal 1.0-${lambda}
 fix scale1 all adapt 1 pair snap:1 scale * * v_l1
 fix scale2 all adapt 1 pair snap:2 scale * * v_l2
 # Setup output
 thermo          1
 thermo_modify norm yes
 # Set up NVE run
 timestep 0.5e-3
 neighbor 1.0 bin
 neigh_modify every 1 delay 0 check yes
 # Run MD
 velocity all create 300.0 4928459 loop geom
 fix 1 all nve
 if "$w > 0"	then "run $w"
 run		${rr}
--- a/src/INTEL/TEST/run_benchmarks.sh
+++ b/src/INTEL/TEST/run_benchmarks.sh
@ -35,7 +35,7 @@ export I_MPI_PIN_DOMAIN=core
 # End settings for your system
 #########################################################################
-export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff"
+export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff snap"
 export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
 export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"
--- a/src/INTEL/intel_simd.h
+++ b/src/INTEL/intel_simd.h
@ -46,13 +46,38 @@ namespace ip_simd {
  typedef __mmask16 SIMD_mask;
  // True when at least one bit of the lane mask is set.
  inline bool any(const SIMD_mask &m) { return !(m == 0); }
  // Wrapper around one __m512i register with per-lane int access.
  struct SIMD_int {
    __m512i v;
    SIMD_int() {}
    SIMD_int(const __m512i in) : v(in) {}
    // Lane access: view the register storage as an array of int.
    inline int & operator[](const int i) {
      int *lanes = (int *)&(v);
      return lanes[i];
    }
    inline const int & operator[](const int i) const {
      int *lanes = (int *)&(v);
      return lanes[i];
    }
    // Implicit conversion so the wrapper can be passed straight to intrinsics.
    operator __m512i() const { return v; }
  };
  // Wrapper around one __m256i register with per-lane int access.
  struct SIMD256_int {
    __m256i v;
    SIMD256_int() {}
    SIMD256_int(const __m256i in) : v(in) {}
    // Broadcast a scalar to every lane.
    SIMD256_int(const int in) : v(_mm256_set1_epi32(in)) {}
    // Lane access: view the register storage as an array of int.
    inline int & operator[](const int i) { return ((int *)&(v))[i]; }
    inline const int & operator[](const int i) const
      { return ((int *)&(v))[i]; }
    // Compound assignments return *this by reference (as SIMD_double does)
    // instead of a copy, so chained use acts on this object.
 #ifdef __INTEL_LLVM_COMPILER
    inline SIMD256_int & operator&=(const int i)
      { v=_mm256_and_epi32(v, _mm256_set1_epi32(i)); return *this; }
 #else
    inline SIMD256_int & operator&=(const int i)
      { v=_mm256_and_si256(v, _mm256_set1_epi32(i)); return *this; }
 #endif
    inline SIMD256_int & operator+=(const int i)
      { v=_mm256_add_epi32(v, _mm256_set1_epi32(i)); return *this; }
    // Implicit conversion so the wrapper can be passed straight to intrinsics.
    operator __m256i() const { return v;}
  };
  struct SIMD_float {
    __m512 v;
    SIMD_float() {}
@ -64,7 +89,24 @@ namespace ip_simd {
    __m512d v;
    // Default construction leaves the register uninitialized.
    SIMD_double() {}
    // Wrap an existing register value.
    SIMD_double(const __m512d in) : v(in) {}
    // Broadcast a scalar to every lane.
    SIMD_double(const double in) : v(_mm512_set1_pd(in)) {}
    // Lane access: view the register storage as an array of double.
    inline double & operator[](const int i) {
      double *lanes = (double *)&(v);
      return lanes[i];
    }
    inline const double & operator[](const int i) const {
      double *lanes = (double *)&(v);
      return lanes[i];
    }
    // Implicit conversion so the wrapper can be passed straight to intrinsics.
    operator __m512d() const { return v; }
    // Assign the scalar i to every lane; the broadcast result has to be
    // stored in v for the assignment to take effect.
    SIMD_double & operator=(const double i)
      { v = _mm512_set1_pd(i); return *this; }
    // Copy the register of another vector into this one.
    SIMD_double &operator=(const SIMD_double &i) {
      this->v = i.v;
      return *this;
    }
    // Negate every lane by flipping its sign bit (XOR with -0.0). Declared
    // const because it does not modify this object, so it also works on the
    // const references the free operator functions take.
    SIMD_double operator-() const
      { return _mm512_xor_pd(v, _mm512_set1_pd(-0.0)); }
    // Lane-wise compound arithmetic; each returns *this for chaining.
    SIMD_double & operator+=(const SIMD_double & two) {
      const __m512d sum = _mm512_add_pd(v, two.v);
      v = sum;
      return *this;
    }
    SIMD_double & operator-=(const SIMD_double & two) {
      const __m512d diff = _mm512_sub_pd(v, two.v);
      v = diff;
      return *this;
    }
    SIMD_double & operator*=(const SIMD_double & two) {
      const __m512d prod = _mm512_mul_pd(v, two.v);
      v = prod;
      return *this;
    }
  };
  template<class flt_t>
@ -99,6 +141,12 @@ namespace ip_simd {
  // ------- Set Operations
  // Build a 256-bit integer vector from eight values; l0 goes to lane 0
  // and l7 to lane 7 (setr ordering).
  inline SIMD256_int SIMD256_set(const int l0, const int l1, const int l2,
                                 const int l3, const int l4, const int l5,
                                 const int l6, const int l7) {
    const __m256i lanes = _mm256_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7);
    return lanes;
  }
  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
                           const int l3, const int l4, const int l5,
                           const int l6, const int l7, const int l8,
@ -109,6 +157,10 @@ namespace ip_simd {
                             l8,l9,l10,l11,l12,l13,l14,l15);
  }
  // Broadcast one integer to every lane of a 256-bit vector.
  inline SIMD256_int SIMD256_set(const int l) {
    const __m256i splat = _mm256_set1_epi32(l);
    return splat;
  }
  // Broadcast one integer to every lane of a 512-bit vector.
  inline SIMD_int SIMD_set(const int l) {
    const __m512i splat = _mm512_set1_epi32(l);
    return splat;
  }
@ -121,6 +173,10 @@ namespace ip_simd {
    return _mm512_set1_pd(l);
  }
  // Vector holding the lane numbers 0..7.
  inline SIMD256_int SIMD256_count() {
    const SIMD256_int lane_ids = SIMD256_set(0,1,2,3,4,5,6,7);
    return lane_ids;
  }
  // Copy the lanes of `one` selected by m; all other lanes become zero.
  inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
    const __m512i kept = _mm512_maskz_mov_epi32(m, one);
    return kept;
  }
@ -147,6 +203,10 @@ namespace ip_simd {
  // -------- Load Operations
  // Aligned load of one 256-bit integer vector from a SIMD256_int object.
  inline SIMD256_int SIMD_load(const SIMD256_int *p) {
    const __m256i loaded = _mm256_load_epi32((int *)p);
    return loaded;
  }
  // Aligned load of one 512-bit integer vector from an int array.
  inline SIMD_int SIMD_load(const int *p) {
    const __m512i loaded = _mm512_load_epi32(p);
    return loaded;
  }
@ -159,6 +219,10 @@ namespace ip_simd {
    return _mm512_load_pd(p);
  }
  // Aligned load of one double vector from a SIMD_double object.
  inline SIMD_double SIMD_load(const SIMD_double *p) {
    const __m512d loaded = _mm512_load_pd((double *)p);
    return loaded;
  }
  // Masked aligned load: lanes not selected by m are set to zero.
  inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
    const __m512i loaded = _mm512_maskz_load_epi32(m, p);
    return loaded;
  }
@ -171,6 +235,10 @@ namespace ip_simd {
    return _mm512_maskz_load_pd(m, p);
  }
  // Gather p[i[lane]] for each lane of a 256-bit index vector.
  // Note the AVX2 gather takes (base, index), the AVX-512 one (index, base).
  inline SIMD256_int SIMD_gather(const int *p, const SIMD256_int &i) {
    const __m256i picked = _mm256_i32gather_epi32(p, i, _MM_SCALE_4);
    return picked;
  }
  // Gather p[i[lane]] for each lane of a 512-bit index vector.
  inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) {
    const __m512i picked = _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
    return picked;
  }
@ -179,6 +247,10 @@ namespace ip_simd {
    return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
  }
  // Gather eight doubles p[i[lane]] using a 256-bit index vector.
  inline SIMD_double SIMD_gather(const double *p, const SIMD256_int &i) {
    const __m512d picked = _mm512_i32gather_pd(i, p, _MM_SCALE_8);
    return picked;
  }
  // Gather eight doubles using the lower 256 bits of a 512-bit index vector.
  inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
    const __m256i low_idx = _mm512_castsi512_si256(i);
    return _mm512_i32gather_pd(low_idx, p, _MM_SCALE_8);
  }
@ -201,6 +273,12 @@ namespace ip_simd {
                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }
  // Masked gather of doubles p[i[lane]]; lanes not selected by m are taken
  // from an undefined source vector and must not be relied upon.
  inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
                                 const SIMD256_int &i) {
    const __m512d filler = _mm512_undefined_pd();
    return _mm512_mask_i32gather_pd(filler, m, i, p, _MM_SCALE_8);
  }
  template <typename T>
  inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
                                      const SIMD_int &i) {
@ -252,6 +330,15 @@ namespace ip_simd {
    return _mm512_store_pd(p,one);
  }
  inline void SIMD_store(SIMD_double *p, const SIMD_double &one) {
    return _mm512_store_pd((double *)p,one);
  }
  inline void SIMD_scatter(const SIMD_mask &m, int *p,
                           const SIMD256_int &i, const SIMD256_int &vec) {
    _mm256_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
  }
  inline void SIMD_scatter(const SIMD_mask &m, int *p,
                           const SIMD_int &i, const SIMD_int &vec) {
    _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
@ -268,8 +355,22 @@ namespace ip_simd {
                              _MM_SCALE_8);
  }
  inline void SIMD_scatter(const SIMD_mask &m, double *p,
                           const SIMD256_int &i, const SIMD_double &vec) {
    _mm512_mask_i32scatter_pd(p, m, i, vec, _MM_SCALE_8);
  }
  inline void SIMD_scatter(double *p,
                           const SIMD256_int &i, const SIMD_double &vec) {
    _mm512_i32scatter_pd(p, i, vec, _MM_SCALE_8);
  }
  // ------- Arithmetic Operations
  inline SIMD256_int operator+(const SIMD256_int &one, const SIMD256_int &two) {
    return _mm256_add_epi32(one,two);
  }
  inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_add_epi32(one,two);
  }
@ -286,6 +387,10 @@ namespace ip_simd {
    return _mm512_add_epi32(one,SIMD_set(two));
  }
  // Add the scalar `two` to every lane of a 256-bit integer vector.
  inline SIMD256_int operator+(const SIMD256_int &one, const int two) {
    const SIMD256_int addend = SIMD256_set(two);
    return _mm256_add_epi32(one,addend);
  }
  // Add the scalar `two` to every lane of a float vector.
  inline SIMD_float operator+(const SIMD_float &one, const float two) {
    const __m512 sum = _mm512_add_ps(one,SIMD_set(two));
    return sum;
  }
@ -299,6 +404,11 @@ namespace ip_simd {
    return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
  }
  // Masked add: lanes selected by m become one + two, the others keep `one`.
  inline SIMD256_int SIMD_add(const SIMD_mask &m,
                           const SIMD256_int &one, const int two) {
    const SIMD256_int addend = SIMD256_set(two);
    return _mm256_mask_add_epi32(one,m,one,addend);
  }
  inline SIMD_float SIMD_add(const SIMD_mask &m,
                             const SIMD_float &one, const float two) {
    return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
@ -309,6 +419,11 @@ namespace ip_simd {
    return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
  }
  // Masked add: lanes selected by m become one + two, the others keep `one`.
  inline SIMD_double SIMD_add(const SIMD_mask &m,
                              const SIMD_double &one, const SIMD_double &two) {
    const __m512d sum = _mm512_mask_add_pd(one,m,one,two);
    return sum;
  }
  inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_add_epi32(s,m,one,two);
@ -387,6 +502,10 @@ namespace ip_simd {
    return _mm512_mul_pd(one,two);
  }
  // Multiply every lane by the scalar `two`, keeping the low 32 bits
  // of each product (256-bit vectors).
  inline SIMD256_int operator*(const SIMD256_int &one, const int two) {
    const SIMD256_int factor = SIMD256_set(two);
    return _mm256_mullo_epi32(one,factor);
  }
  // Same operation for 512-bit integer vectors.
  inline SIMD_int operator*(const SIMD_int &one, const int two) {
    const SIMD_int factor = SIMD_set(two);
    return _mm512_mullo_epi32(one,factor);
  }
@ -417,6 +536,12 @@ namespace ip_simd {
    return _mm512_fmadd_pd(one,two,three);
  }
  // Masked fused multiply-add: lanes selected by m become one*two + three,
  // the remaining lanes keep the value of `three`.
  inline SIMD_double SIMD_fma(const SIMD_mask m, const SIMD_double &one,
                              const SIMD_double &two,
                              const SIMD_double &three) {
    const __m512d fused = _mm512_mask3_fmadd_pd(one,two,three,m);
    return fused;
  }
  inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmsub_ps(one,two,three);
@ -493,6 +618,10 @@ namespace ip_simd {
    return _mm512_pow_pd(one, two);
  }
  // Raise every lane of `one` to the scalar power `two`.
  inline SIMD_double SIMD_pow(const SIMD_double &one, const double two) {
    const __m512d powered = _mm512_pow_pd(one, SIMD_set(two));
    return powered;
  }
  // Lane-wise exponential of a float vector.
  inline SIMD_float SIMD_exp(const SIMD_float &one) {
    const __m512 result = _mm512_exp_ps(one);
    return result;
  }
@ -501,6 +630,18 @@ namespace ip_simd {
    return _mm512_exp_pd(one);
  }
  // Lane-wise cosine.
  inline SIMD_double SIMD_cos(const SIMD_double &one) {
    const __m512d result = _mm512_cos_pd(one);
    return result;
  }
  // Lane-wise sine.
  inline SIMD_double SIMD_sin(const SIMD_double &one) {
    const __m512d result = _mm512_sin_pd(one);
    return result;
  }
  // Lane-wise tangent.
  inline SIMD_double SIMD_tan(const SIMD_double &one) {
    const __m512d result = _mm512_tan_pd(one);
    return result;
  }
  // ------- Comparison operations
  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
@ -533,6 +674,14 @@ namespace ip_simd {
    return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
  }
  // Lane-wise signed "less than" on 256-bit integer vectors; bit l of the
  // result is set when one[l] < two[l].
  inline SIMD_mask operator<(const SIMD256_int &one, const SIMD256_int &two) {
    const SIMD_mask lt = _mm256_cmplt_epi32_mask(one,two);
    return lt;
  }
  // Scalar-vs-vector form: the scalar is broadcast before comparing.
  inline SIMD_mask operator<(const int one, const SIMD256_int &two) {
    const SIMD256_int lhs = SIMD256_set(one);
    return _mm256_cmplt_epi32_mask(lhs,two);
  }
  // Lane-wise signed "less than" on 512-bit integer vectors.
  inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) {
    const SIMD_mask lt = _mm512_cmplt_epi32_mask(one,two);
    return lt;
  }
@ -577,6 +726,10 @@ namespace ip_simd {
    return _mm512_cmple_ps_mask(SIMD_set(one), two);
  }
  // Lane-wise "less than or equal" on double vectors.
  inline SIMD_mask operator<=(const SIMD_double &one, const SIMD_double &two) {
    const SIMD_mask le = _mm512_cmple_pd_mask(one, two);
    return le;
  }
  // Scalar-vs-vector form: the scalar is broadcast before comparing.
  inline SIMD_mask operator<=(const double one, const SIMD_double &two) {
    const SIMD_mask le = _mm512_cmple_pd_mask(SIMD_set(one), two);
    return le;
  }
@ -593,6 +746,14 @@ namespace ip_simd {
    return _mm512_cmplt_pd_mask(two,one);
  }
  // one > two, evaluated as (broadcast two) < one.
  inline SIMD_mask operator>(const SIMD_double &one, const double two) {
    const SIMD_mask gt = _mm512_cmplt_pd_mask(SIMD_set(two),one);
    return gt;
  }
  // Lane-wise equality of a 256-bit integer vector with a scalar.
  inline SIMD_mask operator==(const SIMD256_int &one, const int two) {
    const __m256i rhs = _mm256_set1_epi32(two);
    return _mm256_cmpeq_epi32_mask(one,rhs);
  }
  // Lane-wise equality of two 512-bit integer vectors.
  inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
    const SIMD_mask eq = _mm512_cmpeq_epi32_mask(one,two);
    return eq;
  }
--- a/src/INTEL/pair_snap_intel.cpp
+++ b/src/INTEL/pair_snap_intel.cpp
@ -0,0 +1,779 @@
 // clang-format off
 /* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #if defined(__AVX512F__)
 #if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #include "pair_snap_intel.h"
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
 #include "force.h"
 #include "memory.h"
 #include "modify.h"
 #include "neigh_list.h"
 #include "neighbor.h"
 #include "sna_intel.h"
 #include "tokenizer.h"
 #include <cmath>
 #include <cstring>
 using namespace LAMMPS_NS;
 using namespace ip_simd;
 #define MAXLINE 1024
 #define MAXWORD 3
 /* ---------------------------------------------------------------------- */
 PairSNAPIntel::PairSNAPIntel(LAMMPS *lmp) : Pair(lmp)
 {
  // every owned pointer starts out null so that the destructor can
  // release all of them unconditionally
  snaptr = nullptr;
  bispectrum = nullptr;
  beta = nullptr;
  dinnerelem = nullptr;
  sinnerelem = nullptr;
  coeffelem = nullptr;
  wjelem = nullptr;
  radelem = nullptr;
  // pair-style flags
  centroidstressflag = CENTROID_NOTAVAIL;
  manybody_flag = 1;
  one_coeff = 1;
  restartinfo = 0;
  single_enable = 0;
 }
 /* ---------------------------------------------------------------------- */
 PairSNAPIntel::~PairSNAPIntel()
 {
  // nothing is released when copymode is set
  if (!copymode) {
    // per-element tables created in read_files()
    memory->destroy(radelem);
    memory->destroy(wjelem);
    memory->destroy(coeffelem);
    memory->destroy(sinnerelem);
    memory->destroy(dinnerelem);
    // work arrays created in init_style()
    memory->destroy(beta);
    memory->destroy(bispectrum);
    delete snaptr;
    // per-type tables exist only after allocate() has run
    if (allocated) {
      memory->destroy(setflag);
      memory->destroy(cutsq);
      memory->destroy(scale);
    }
  }
 }
 /* ----------------------------------------------------------------------
   Compute SNAP forces and, when requested through eflag/vflag, energies
   and virial contributions.
   Atoms of the neighbor list are processed vw = snaptr->vector_width()
   at a time, one atom per vector lane. For each group the routine
     1. gathers the neighbors within the cutoff into the snaptr work arrays,
     2. has snaptr evaluate Ui and, if needed, the bispectrum components,
     3. builds beta from the element coefficients (linear and optional
        quadratic terms),
     4. accumulates the pair forces on atoms i and j,
     5. tallies the energy when eflag is set.
   ---------------------------------------------------------------------- */
 void PairSNAPIntel::compute(int eflag, int vflag)
 {
  SNA_DVEC fij[3];
  int *jlist,*numneigh,**firstneigh;
  ev_init(eflag,vflag);
  // per-pair virial tallies are needed for per-atom virials, or for a global
  // virial that is not obtained through virial_fdotr_compute()
  int tally_xyz = 0;
  if (vflag_atom || (vflag && !vflag_fdotr)) tally_xyz = 1;
  double **x = atom->x;
  // flat view of the coordinates: component k of atom a is _x[3*a + k]
  double *_x = atom->x[0];
  double **f = atom->f;
  int *type = atom->type;
  int nlocal = atom->nlocal;
  int newton_pair = force->newton_pair;
  // compute dE_i/dB_i = beta_i for all i in list
  numneigh = list->numneigh;
  firstneigh = list->firstneigh;
  // per-lane running sum of the scaled site energies
  SNA_DVEC sevdwl(0);
  const int vw = snaptr->vector_width();
  // walk the list vw atoms at a time; lane l handles list entry ii + l
  for (int ii = 0; ii < list->inum; ii+=vw) {
    SNA_IVEC i, jnum;
    int max_jnum = 0;
    for (int l = 0; l < vw; l++) {
      if (ii + l < list->inum) {
        i[l] = list->ilist[ii + l];
        jnum[l] = numneigh[i[l]];
      } else {
        // padding lane past the end of the list: reuse ilist[0] as a valid
        // index and give it zero neighbors
        i[l] = list->ilist[0];
        jnum[l] = 0;
      }
      if (jnum[l] > max_jnum) max_jnum = jnum[l];
    }
    // ensure rij, inside, wj, and rcutij are of size jnum
    snaptr->grow_rij(max_jnum);
    SNA_IVEC zero_vec(0);
    const SNA_DVEC xtmp = SIMD_gather(_x, i * 3);
    const SNA_DVEC ytmp = SIMD_gather(_x, i * 3 + 1);
    const SNA_DVEC ztmp = SIMD_gather(_x, i * 3 + 2);
    const SNA_IVEC itype = SIMD_gather(type, i);
    const SNA_IVEC ielem = SIMD_gather(map, itype);
    const SNA_DVEC radi = SIMD_gather(radelem, ielem);
    // rij[][3] = displacements between atom I and those neighbors
    // inside = indices of neighbors of I within cutoff
    // wj = weights for neighbors of I within cutoff
    // rcutij = cutoffs for neighbors of I within cutoff
    // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi
    // per-lane count of neighbors accepted so far
    SNA_IVEC ninside(0);
    for (int jj = 0; jj < max_jnum; jj++) {
      // lanes whose atom still has a jj-th neighbor
      SIMD_mask m(SIMD256_set(jj) < jnum);
      SNA_IVEC j;
      SV_for (int l = 0; l < vw; l++) {
        jlist = firstneigh[i[l]];
        if (jj < jnum[l]) j[l] = jlist[jj];
        else j[l] = 0;
      }
      // keep only the NEIGHMASK bits of each neighbor entry
      j &= NEIGHMASK;
      const SNA_DVEC delx = SIMD_gather(m, _x, j * 3) - xtmp;
      const SNA_DVEC dely = SIMD_gather(m, _x, j * 3 + 1) - ytmp;
      const SNA_DVEC delz = SIMD_gather(m, _x, j * 3 + 2) - ztmp;
      const SNA_IVEC jtype = SIMD_gather(type, j);
      const SNA_DVEC rsq = delx*delx + dely*dely + delz*delz;
      const SNA_DVEC vcut = SIMD_gather(m, cutsq[0],
                                        itype * (atom->ntypes + 1) + jtype);
      // accept only pairs inside the cutoff and not (numerically) coincident
      m &= rsq < vcut;
      m &= rsq > SIMD_set(1e-20);
      const SNA_IVEC jelem = SIMD_gather(map, jtype);
      // each lane appends to its own slot ninside[l]; the offsets below match
      // the rij[slot][component][lane] indexing used when forces are tallied
      const SNA_IVEC ni3 = ninside * vw * 3 + SIMD256_count();
      SIMD_scatter(m, (double *)(snaptr->rij[0]), ni3, delx);
      SIMD_scatter(m, (double *)(snaptr->rij[0] + 1), ni3, dely);
      SIMD_scatter(m, (double *)(snaptr->rij[0] + 2), ni3, delz);
      const SNA_IVEC ni = ninside * vw + SIMD256_count();
      SIMD_scatter(m, (int *)(snaptr->inside), ni, j);
      SIMD_scatter(m, (double *)(snaptr->wj), ni,
                   SIMD_gather(m, wjelem, jelem));
      SIMD_scatter(m, (double *)(snaptr->rcutij), ni,
                   (radi + SIMD_gather(m, radelem, jelem)) * rcutfac);
      if (switchinnerflag) {
        SIMD_scatter(m, (double *)(snaptr->sinnerij), ni,
                     (SIMD_gather(m, sinnerelem, ielem) +
                      SIMD_gather(m, sinnerelem, jelem)) * 0.5);
        SIMD_scatter(m, (double *)(snaptr->dinnerij), ni,
                     (SIMD_gather(m, dinnerelem, ielem) +
                      SIMD_gather(m, dinnerelem, jelem)) * 0.5);
      }
      if (chemflag)
        SIMD_scatter(m, (int *)(snaptr->element), ni, jelem);
      // advance the slot counter only in lanes that accepted a neighbor
      ninside = SIMD_add(m, ninside, 1);
    } // for jj
    // compute Ui, Yi for atom I
    if (chemflag)
      snaptr->compute_ui(ninside, ielem, max_jnum);
    else
      snaptr->compute_ui(ninside, zero_vec, max_jnum);
    // Compute bispectrum
    if (quadraticflag || eflag) {
      snaptr->compute_zi_or_yi<0>(beta);
      if (chemflag)
        snaptr->compute_bi(ielem);
      else
        snaptr->compute_bi(zero_vec);
      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
        SIMD_store(bispectrum + icoeff, SIMD_load(snaptr->blist + icoeff));
    }
    // Compute beta
    // linear part: coefficient icoeff+1 of each lane's element
    // (coefficient 0 is added to the energy separately below)
    for (int icoeff = 0; icoeff < ncoeff; icoeff++)
      SIMD_store(beta + icoeff, SIMD_gather(coeffelem[0],
                                            ielem * ncoeffall + icoeff + 1));
    if (quadraticflag) {
      // quadratic coefficients follow the linear ones, stored as the upper
      // triangle (diagonal first in each row); k walks through them
      int k = ncoeff+1;
      for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
        SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
        SNA_DVEC beta_i = SIMD_load(beta + icoeff) +
          SIMD_gather(coeffelem[0], ielem * ncoeffall + k) * bveci;
        k++;
        for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
          const SNA_DVEC ci = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
          beta_i = beta_i + ci * SIMD_load(bispectrum + jcoeff);
          SIMD_store(beta + jcoeff, ci * bveci + SIMD_load(beta + jcoeff));
          k++;
        }
        SIMD_store(beta + icoeff, beta_i);
      }
    }
    // for neighbors of I within cutoff:
    // compute Fij = dEi/dRj = -dEi/dRi
    // add to Fi, subtract from Fj
    // scaling is that for type I
    if (quadraticflag || eflag)
      snaptr->compute_yi_from_zi(beta);
    else
      snaptr->compute_zi_or_yi<1>(beta);
    SNA_DVEC fi_x(0.0), fi_y(0.0), fi_z(0.0);
    SNA_DVEC scalev = SIMD_gather(scale[0], itype * (atom->ntypes+1) + itype);
    for (int jj = 0; jj < max_jnum; jj++) {
      snaptr->compute_duidrj(jj, ninside);
      if (chemflag && nelements > 1)
        snaptr->compute_deidrj_e(jj, ninside, fij);
      else
        snaptr->compute_deidrj(jj, ninside, fij);
      SNA_DVEC fijs_x = fij[0] * scalev;
      SNA_DVEC fijs_y = fij[1] * scalev;
      SNA_DVEC fijs_z = fij[2] * scalev;
      fi_x += fijs_x;
      fi_y += fijs_y;
      fi_z += fijs_z;
      // the update of neighbor j is done lane by lane with scalar code
      for (int l = 0; l < vw; l++) {
        if (jj < ninside[l]) {
          int j = snaptr->inside[jj][l];
          f[j][0] -= fijs_x[l];
          f[j][1] -= fijs_y[l];
          f[j][2] -= fijs_z[l];
          // NOTE(review): the virial tally receives the unscaled fij while
          // the force update uses fijs - confirm this is intended
          if (tally_xyz)
            ev_tally_xyz(i[l],j,nlocal,newton_pair,0.0,0.0,
                         fij[0][l],fij[1][l],fij[2][l],
                         -snaptr->rij[jj][0][l],-snaptr->rij[jj][1][l],
                         -snaptr->rij[jj][2][l]);
        }
      } // for l
    } // for jj
    // only lanes that map to a real list entry may update f (and eatom)
    SIMD_mask m((SIMD256_count() + ii) < list->inum);
    SNA_DVEC fix = SIMD_gather(m, f[0], i * 3) +  fi_x;
    SIMD_scatter(m, f[0], i * 3, fix);
    SNA_DVEC fiy = SIMD_gather(m, f[0], i * 3 + 1) +  fi_y;
    SIMD_scatter(m, f[0], i * 3 + 1, fiy);
    SNA_DVEC fiz = SIMD_gather(m, f[0], i * 3 + 2) +  fi_z;
    SIMD_scatter(m, f[0], i * 3 + 2, fiz);
    // tally energy contribution
    if (eflag) {
      // start from coefficient 0 of each lane's element
      SNA_DVEC evdwl = SIMD_gather(coeffelem[0], ielem * ncoeffall);
      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
        evdwl += SIMD_gather(coeffelem[0], ielem * ncoeffall + icoeff +1) *
          bispectrum[icoeff];
      if (quadraticflag) {
        int k = ncoeff+1;
        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
          SNA_DVEC bveci = SIMD_load(bispectrum + icoeff);
          SNA_DVEC c = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
          k++;
          evdwl += c * 0.5 * bveci * bveci;
          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
            SNA_DVEC bvecj = SIMD_load(bispectrum + jcoeff);
            SNA_DVEC cj = SIMD_gather(coeffelem[0], ielem * ncoeffall + k);
            k++;
            evdwl += cj * bveci * bvecj;
          }
        }
      }
      sevdwl += scalev * evdwl;
      if (eatom) {
        SNA_DVEC ea = SIMD_gather(m, eatom, i) + scalev * evdwl;
        SIMD_scatter(m, eatom, i, ea);
      }
    } // if (eflag)
  } // for ii
  // reduce the per-lane energy sums into the global accumulator
  if (eflag) eng_vdwl += SIMD_sum(sevdwl);
  if (vflag_fdotr) virial_fdotr_compute();
 }
 /* ----------------------------------------------------------------------
   allocate the per-type tables (setflag, cutsq, scale) and the
   type-to-element map; all of them are indexed from 0 to ntypes
 ------------------------------------------------------------------------- */
 void PairSNAPIntel::allocate()
 {
  const int np1 = atom->ntypes + 1;
  memory->create(setflag,np1,np1,"pair:setflag");
  memory->create(cutsq,np1,np1,"pair:cutsq");
  memory->create(scale,np1,np1,"pair:scale");
  map = new int[np1];
  allocated = 1;
 }
 /* ----------------------------------------------------------------------
   global settings: the pair style takes no arguments; rank 0 warns
   when more than one thread is configured
 ------------------------------------------------------------------------- */
 void PairSNAPIntel::settings(int narg, char ** /* arg */)
 {
  if (narg > 0) {
    error->all(FLERR,"Illegal pair_style command");
  }
  const bool is_root = (comm->me == 0);
  if (is_root && (comm->nthreads > 1)) {
    error->warning(FLERR, "Pair style snap/intel does not use OpenMP threads");
  }
 }
 /* ----------------------------------------------------------------------
   set coeffs for one or more type pairs
   arg[2] and arg[3] name the SNAP coefficient and parameter files; the
   remaining atom->ntypes arguments are handed to map_element2type().
   Reads both files, derives ncoeff from ncoeffall, (re)creates the
   SNAIntel engine, computes rcutmax and resets scale[][] to 1.0.
 ------------------------------------------------------------------------- */
 void PairSNAPIntel::coeff(int narg, char **arg)
 {
  if (!allocated) allocate();
  if (narg != 4 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients");
  map_element2type(narg-4,arg+4);
  // read snapcoeff and snapparam files
  read_files(arg[2],arg[3]);
  if (!quadraticflag)
    ncoeff = ncoeffall - 1;
  else {
    // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2
    // so, ncoeff = floor(sqrt(2*ncoeffall))-1
    ncoeff = sqrt(2.0*ncoeffall)-1;
    ncoeffq = (ncoeff*(ncoeff+1))/2;
    int ntmp = 1+ncoeff+ncoeffq;
    if (ntmp != ncoeffall) {
      error->all(FLERR,"Incorrect SNAP coeff file");
    }
  }
  // a repeated pair_coeff command must not leak the previous engine;
  // snaptr is nullptr on the first call, so this delete is then a no-op
  delete snaptr;
  snaptr = new SNAIntel(lmp, rfac0, twojmax,
                        rmin0, switchflag, bzeroflag,
                        chemflag, bnormflag, wselfallflag,
                        nelements, switchinnerflag);
  // the coefficient file and the engine must agree on the number of
  // bispectrum components
  if (ncoeff != snaptr->ncoeff) {
    if (comm->me == 0)
      printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff);
    error->all(FLERR,"Incorrect SNAP parameter file");
  }
  // Calculate maximum cutoff for all elements
  rcutmax = 0.0;
  for (int ielem = 0; ielem < nelements; ielem++)
    rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax);
  // set default scaling
  int n = atom->ntypes;
  for (int ii = 0; ii < n+1; ii++)
    for (int jj = 0; jj < n+1; jj++)
      scale[ii][jj] = 1.0;
 }
 /* ----------------------------------------------------------------------
   init specific to this pair style
   requires newton pair on and the 'package intel' fix, requests a full
   neighbor list, initializes the SNA engine and (re)allocates the
   bispectrum and beta work arrays (ncoeff entries each)
 ------------------------------------------------------------------------- */
 void PairSNAPIntel::init_style()
 {
  if (force->newton_pair == 0)
    error->all(FLERR,"Pair style SNAP requires newton pair on");
  // need a full neighbor list
  neighbor->add_request(this, NeighConst::REQ_FULL);
  snaptr->init();
  fix = static_cast<FixIntel *>(modify->get_fix_by_id("package_intel"));
  if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles");
  fix->pair_init_check();
  // this routine may run more than once (e.g. once per run command);
  // release the arrays of a previous call first so they are not leaked.
  // Both pointers start out as nullptr, so the first destroy frees nothing.
  memory->destroy(bispectrum);
  memory->destroy(beta);
  memory->create(bispectrum,ncoeff,"PairSNAP:bispectrum");
  memory->create(beta,ncoeff,"PairSNAP:beta");
 }
 /* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 double PairSNAPIntel::init_one(int i, int j)
 {
  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
  scale[j][i] = scale[i][j];
  return (radelem[map[i]] +
          radelem[map[j]])*rcutfac;
 }
 /* ---------------------------------------------------------------------- */
 void PairSNAPIntel::read_files(char *coefffilename, char *paramfilename)
 {
  // open SNAP coefficient file on proc 0
  FILE *fpcoeff;
  if (comm->me == 0) {
    fpcoeff = utils::open_potential(coefffilename,lmp,nullptr);
    if (fpcoeff == nullptr)
      error->one(FLERR,"Cannot open SNAP coefficient file {}: ",
                                   coefffilename, utils::getsyserror());
  }
  char line[MAXLINE],*ptr;
  int eof = 0;
  int nwords = 0;
  while (nwords == 0) {
    if (comm->me == 0) {
      ptr = fgets(line,MAXLINE,fpcoeff);
      if (ptr == nullptr) {
        eof = 1;
        fclose(fpcoeff);
      }
    }
    MPI_Bcast(&eof,1,MPI_INT,0,world);
    if (eof) break;
    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
    // strip comment, skip line if blank
    nwords = utils::count_words(utils::trim_comment(line));
  }
  if (nwords != 2)
    error->all(FLERR,"Incorrect format in SNAP coefficient file");
  // strip single and double quotes from words
  int nelemtmp = 0;
  try {
    ValueTokenizer words(utils::trim_comment(line),"\"' \t\n\r\f");
    nelemtmp = words.next_int();
    ncoeffall = words.next_int();
  } catch (TokenizerException &e) {
    error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
  }
  // clean out old arrays and set up element lists
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(coeffelem);
  memory->destroy(sinnerelem);
  memory->destroy(dinnerelem);
  memory->create(radelem,nelements,"pair:radelem");
  memory->create(wjelem,nelements,"pair:wjelem");
  memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem");
  memory->create(sinnerelem,nelements,"pair:sinnerelem");
  memory->create(dinnerelem,nelements,"pair:dinnerelem");
  // initialize checklist for all required nelements
  int *elementflags = new int[nelements];
  for (int jelem = 0; jelem < nelements; jelem++)
      elementflags[jelem] = 0;
  // loop over nelemtmp blocks in the SNAP coefficient file
  for (int ielem = 0; ielem < nelemtmp; ielem++) {
    if (comm->me == 0) {
      ptr = fgets(line,MAXLINE,fpcoeff);
      if (ptr == nullptr) {
        eof = 1;
        fclose(fpcoeff);
      }
    }
    MPI_Bcast(&eof,1,MPI_INT,0,world);
    if (eof)
      error->all(FLERR,"Incorrect format in SNAP coefficient file");
    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
    std::vector<std::string> words;
    try {
      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
    } catch (TokenizerException &) {
      // ignore
    }
    if (words.size() != 3)
      error->all(FLERR,"Incorrect format in SNAP coefficient file");
    int jelem;
    for (jelem = 0; jelem < nelements; jelem++)
      if (words[0] == elements[jelem]) break;
    // if this element not needed, skip this block
    if (jelem == nelements) {
      if (comm->me == 0) {
        for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
          ptr = fgets(line,MAXLINE,fpcoeff);
          if (ptr == nullptr) {
            eof = 1;
            fclose(fpcoeff);
          }
        }
      }
      MPI_Bcast(&eof,1,MPI_INT,0,world);
      if (eof)
        error->all(FLERR,"Incorrect format in SNAP coefficient file");
      continue;
    }
    if (elementflags[jelem] == 1)
      error->all(FLERR,"Incorrect format in SNAP coefficient file");
    else
      elementflags[jelem] = 1;
    radelem[jelem] = utils::numeric(FLERR,words[1],false,lmp);
    wjelem[jelem] = utils::numeric(FLERR,words[2],false,lmp);
    if (comm->me == 0)
      utils::logmesg(lmp,"SNAP Element = {}, Radius {}, Weight {}\n",
                     elements[jelem], radelem[jelem], wjelem[jelem]);
    for (int icoeff = 0; icoeff < ncoeffall; icoeff++) {
      if (comm->me == 0) {
        ptr = fgets(line,MAXLINE,fpcoeff);
        if (ptr == nullptr) {
          eof = 1;
          fclose(fpcoeff);
        }
      }
      MPI_Bcast(&eof,1,MPI_INT,0,world);
      if (eof)
        error->all(FLERR,"Incorrect format in SNAP coefficient file");
      MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
      try {
        ValueTokenizer coeff(utils::trim_comment(line));
        if (coeff.count() != 1)
          error->all(FLERR,"Incorrect format in SNAP coefficient file");
        coeffelem[jelem][icoeff] = coeff.next_double();
      } catch (TokenizerException &e) {
        error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what());
      }
    }
  }
  if (comm->me == 0) fclose(fpcoeff);
  for (int jelem = 0; jelem < nelements; jelem++) {
    if (elementflags[jelem] == 0)
      error->all(FLERR,"Element {} not found in SNAP coefficient file", elements[jelem]);
  }
  delete[] elementflags;
  // set flags for required keywords
  rcutfacflag = 0;
  twojmaxflag = 0;
  // Set defaults for optional keywords
  rfac0 = 0.99363;
  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
  quadraticflag = 0;
  chemflag = 0;
  bnormflag = 0;
  wselfallflag = 0;
  switchinnerflag = 0;
  chunksize = 32768;
  parallel_thresh = 8192;
  // set local input checks
  int sinnerflag = 0;
  int dinnerflag = 0;
  // open SNAP parameter file on proc 0
  FILE *fpparam;
  if (comm->me == 0) {
    fpparam = utils::open_potential(paramfilename,lmp,nullptr);
    if (fpparam == nullptr)
      error->one(FLERR,"Cannot open SNAP parameter file {}: {}",
                                   paramfilename, utils::getsyserror());
  }
  eof = 0;
  while (true) {
    if (comm->me == 0) {
      ptr = fgets(line,MAXLINE,fpparam);
      if (ptr == nullptr) {
        eof = 1;
        fclose(fpparam);
      }
    }
    MPI_Bcast(&eof,1,MPI_INT,0,world);
    if (eof) break;
    MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world);
    // words = ptrs to all words in line
    // strip single and double quotes from words
    std::vector<std::string> words;
    try {
      words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector();
    } catch (TokenizerException &) {
      // ignore
    }
    if (words.size() == 0) continue;
    if (words.size() < 2)
      error->all(FLERR,"Incorrect format in SNAP parameter file");
    auto keywd = words[0];
    auto keyval = words[1];
    // check for keywords with more than one value per element
    if (keywd == "sinner" || keywd == "dinner") {
      if ((int)words.size() != nelements+1)
        error->all(FLERR,"Incorrect SNAP parameter file");
      // innerlogstr collects all values of sinner or dinner for log output below
      std::string innerlogstr;
      int iword = 1;
      if (keywd == "sinner") {
        for (int ielem = 0; ielem < nelements; ielem++) {
          keyval = words[iword];
          sinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
          iword++;
          innerlogstr += keyval + " ";
        }
        sinnerflag = 1;
      } else if (keywd == "dinner") {
        for (int ielem = 0; ielem < nelements; ielem++) {
          keyval = words[iword];
          dinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp);
          iword++;
          innerlogstr += keyval + " ";
        }
        dinnerflag = 1;
      }
      if (comm->me == 0)
        utils::logmesg(lmp,"SNAP keyword {} {} ... \n", keywd, innerlogstr);
    } else {
      // all other keywords take one value
      if (nwords != 2)
        error->all(FLERR,"Incorrect SNAP parameter file");
      if (comm->me == 0)
        utils::logmesg(lmp,"SNAP keyword {} {}\n",keywd,keyval);
      if (keywd == "rcutfac") {
        rcutfac = utils::numeric(FLERR,keyval,false,lmp);
        rcutfacflag = 1;
      } else if (keywd == "twojmax") {
        twojmax = utils::inumeric(FLERR,keyval,false,lmp);
        twojmaxflag = 1;
      } else if (keywd == "rfac0")
        rfac0 = utils::numeric(FLERR,keyval,false,lmp);
      else if (keywd == "rmin0")
        rmin0 = utils::numeric(FLERR,keyval,false,lmp);
      else if (keywd == "switchflag")
        switchflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "bzeroflag")
        bzeroflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "quadraticflag")
        quadraticflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "chemflag")
        chemflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "bnormflag")
        bnormflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "wselfallflag")
        wselfallflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "switchinnerflag")
        switchinnerflag = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "chunksize")
        chunksize = utils::inumeric(FLERR,keyval,false,lmp);
      else if (keywd == "parallelthresh")
        parallel_thresh = utils::inumeric(FLERR,keyval,false,lmp);
      else
        error->all(FLERR,"Unknown parameter '{}' in SNAP parameter file", keywd);
    }
  }
  if (rcutfacflag == 0 || twojmaxflag == 0)
    error->all(FLERR,"Incorrect SNAP parameter file");
  if (chemflag && nelemtmp != nelements)
    error->all(FLERR,"Incorrect SNAP parameter file");
  if (switchinnerflag && !(sinnerflag && dinnerflag))
    error->all(FLERR,"Incorrect SNAP parameter file");
  if (!switchinnerflag && (sinnerflag || dinnerflag))
    error->all(FLERR,"Incorrect SNAP parameter file");
 }
 /* ----------------------------------------------------------------------
   memory usage
 ------------------------------------------------------------------------- */
 double PairSNAPIntel::memory_usage()
 {
  // begin with the base-class estimate, then add this style's own tables
  double total = Pair::memory_usage();
  const int ntp1 = atom->ntypes+1;
  const double pair_entries = (double)ntp1*ntp1;    // entries in one (ntypes+1)^2 table
  const double coeff_entries = (double)ncoeff;      // entries in one ncoeff-long vector list
  total += pair_entries*sizeof(int);         // setflag
  total += pair_entries*sizeof(double);      // cutsq
  total += pair_entries*sizeof(double);      // scale
  total += (double)ntp1*sizeof(int);         // map
  total += coeff_entries*sizeof(SNA_DVEC);   // bispectrum
  total += coeff_entries*sizeof(SNA_DVEC);   // beta
  total += snaptr->memory_usage();           // SNA object
  return total;
 }
 /* ---------------------------------------------------------------------- */
 void *PairSNAPIntel::extract(const char *str, int &dim)
 {
  // dim is always reported as 2, whether or not the name is recognized
  dim = 2;
  // "scale" is the only name handled here; anything else yields nullptr
  const bool wants_scale = (strcmp(str,"scale") == 0);
  return wants_scale ? (void *) scale : nullptr;
 }
 #endif
 #endif
--- a/src/INTEL/pair_snap_intel.h
+++ b/src/INTEL/pair_snap_intel.h
@ -0,0 +1,83 @@
 /* -*- c++ -*- ----------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 #if defined(__AVX512F__)
 #if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #ifdef PAIR_CLASS
 // clang-format off
 PairStyle(snap/intel,PairSNAPIntel);
 // clang-format on
 #else
 #ifndef LMP_PAIR_SNAP_INTEL_H
 #define LMP_PAIR_SNAP_INTEL_H
 #include "fix_intel.h"
 #include "pair.h"
 // Forward declarations of ip_simd classes, followed by the short macro names
 // this header uses for the SIMD vector types (SNA_DVEC is used for the beta
 // and bispectrum members of PairSNAPIntel).
 // NOTE(review): SNA_IVEC expands to ip_simd::SIMD256_int, but the forward
 // declaration names SIMD_int; SNA_IVEC is not used in this header, so the
 // mismatch is harmless here -- confirm which type was intended.
 namespace ip_simd { class SIMD_double; class SIMD_int; };
 #define SNA_DVEC ip_simd::SIMD_double
 #define SNA_IVEC ip_simd::SIMD256_int
 namespace LAMMPS_NS {
 // Pair style class registered as "snap/intel" by the PairStyle() line in
 // this header.  It stores the per-element data and keyword settings that
 // read_files() loads from the SNAP coefficient and parameter files, and
 // holds a pointer to a SNAIntel object (snaptr).
 class PairSNAPIntel : public Pair {
 public:
  PairSNAPIntel(class LAMMPS *);
  ~PairSNAPIntel() override;
  void compute(int, int) override;
  void settings(int, char **) override;
  void coeff(int, char **) override;
  void init_style() override;
  double init_one(int, int) override;
  // returns the Pair base estimate plus the sizes of setflag, cutsq, scale,
  // map, bispectrum, beta and the SNAIntel object
  double memory_usage() override;
  // sets dim to 2; returns the scale array for the name "scale", else nullptr
  void *extract(const char *, int &) override;
  // NOTE(review): quadraticflag is declared double although read_files()
  // assigns it an integer flag (0 by default, utils::inumeric() from file)
  double rcutfac, quadraticflag;    // declared public to workaround gcc 4.9
  int ncoeff;                       //  compiler bug, manifest in KOKKOS package
 protected:
  FixIntel *fix;
  int ncoeffq, ncoeffall;    // ncoeffall = number of coefficient lines read per element
  class SNAIntel *snaptr;
  virtual void allocate();
  // reads the SNAP coefficient file (per-element radius, weight and
  // coefficients) and the SNAP parameter file (keyword/value settings);
  // presumably the two arguments are those file names -- order not visible here
  void read_files(char *, char *);
  // NOTE(review): declared inline, but no definition is visible in this header
  inline int equal(double *x, double *y);
  inline double dist2(double *x, double *y);
  double rcutmax;         // max cutoff for all elements
  double *radelem;        // element radii
  double *wjelem;         // elements weights
  double **coeffelem;     // element bispectrum coefficients
  SNA_DVEC *beta;          // betas for all atoms in list
  SNA_DVEC *bispectrum;    // bispectrum components for all atoms in list
  double **scale;         // for thermodynamic integration
  // keyword flags from the parameter file; read_files() defaults are
  // switchflag = 1, bzeroflag = 1, bnormflag = 0, chemflag = 0,
  // wselfallflag = 0 (twojmax has no default and is required)
  int twojmax, switchflag, bzeroflag, bnormflag;
  int chemflag, wselfallflag;
  int switchinnerflag;    // inner cutoff switch
  double *sinnerelem;     // element inner cutoff midpoint
  double *dinnerelem;     // element inner cutoff half-width
  // keywords "chunksize" and "parallelthresh"; defaults 32768 and 8192
  int chunksize, parallel_thresh;
  // rfac0 and rmin0 default to 0.99363 and 0.0 in read_files()
  double rfac0, rmin0, wj1, wj2;
  int rcutfacflag, twojmaxflag;    // flags for required parameters
 };
 }    // namespace LAMMPS_NS
 #endif
 #endif
 #endif
 #endif
--- a/src/INTEL/sna_intel.cpp
+++ b/src/INTEL/sna_intel.cpp
--- a/src/INTEL/sna_intel.h
+++ b/src/INTEL/sna_intel.h
@ -0,0 +1,187 @@
 /* -*- c++ -*- -------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
   Contributing authors: W. Michael Brown, Intel
 ------------------------------------------------------------------------- */
 #ifndef LMP_SNA_INTEL_H
 #define LMP_SNA_INTEL_H
 #if defined(__AVX512F__)
 #if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)
 #include "pointers.h"
 #include "intel_buffers.h"
 #include "intel_simd.h"
 // SVW is the vector width reported by SNAIntel::vector_width().
 #define SVW 8
 // SV_for expands to a `for` keyword, prefixed with vectorization pragmas
 // when LMP_SIMD_COMPILER is defined ("omp simd" if USE_OMP_SIMD is also
 // defined, "simd assert" otherwise, each followed by "vector aligned").
 // Without LMP_SIMD_COMPILER it is a plain `for`.
 #if defined(LMP_SIMD_COMPILER)
 #if defined(USE_OMP_SIMD)
 #define SV_for _Pragma("omp simd") _Pragma("vector aligned") for
 #else
 #define SV_for _Pragma("simd assert") _Pragma("vector aligned") for
 #endif
 #else
 #define SV_for for
 #endif
 namespace LAMMPS_NS {
 // Record of ten integer indices; SNAIntel keeps an array of these in idxz.
 // Field order is part of the layout and must not change.
 struct SNA_ZINDICES {
  int j1;
  int j2;
  int j;
  int ma1min;
  int ma2max;
  int mb1min;
  int mb2max;
  int na;
  int nb;
  int jju;
 };
 // Record of three integer indices; SNAIntel keeps an array of these in idxb.
 // Field order is part of the layout and must not change.
 struct SNA_BINDICES {
  int j1;
  int j2;
  int j;
 };
 // Short names for the SIMD vector types used in the SNAIntel declarations
 // below: SNA_DVEC for double-valued vectors, SNA_IVEC for integer-valued
 // vectors.  pair_snap_intel.h defines both macros with the same expansions.
 #define SNA_DVEC ip_simd::SIMD_double
 #define SNA_IVEC ip_simd::SIMD256_int
 // Declaration of the SIMD SNAP helper class used by PairSNAPIntel (through
 // its snaptr member).  Most per-neighbor and per-coefficient data are stored
 // as arrays of SNA_DVEC / SNA_IVEC vectors; vector_width() reports SVW.
 class SNAIntel : protected Pointers {
 public:
  SNAIntel(LAMMPS *, double, int, double, int, int, int, int, int, int, int);
  // NOTE(review): this constructor only forwards lmp to Pointers; none of
  // the data members declared below are initialized by it
  SNAIntel(LAMMPS *lmp) : Pointers(lmp){};
  ~SNAIntel() override;
  void build_indexlist();
  void init();
  double memory_usage();
  int ncoeff;
  // vector width used by this class (the SVW macro)
  inline int vector_width() const { return SVW; }
  // functions for bispectrum coefficients
  void compute_ui(const SNA_IVEC &, const SNA_IVEC &, const int max_jnum);
  template <int> void compute_zi_or_yi(const SNA_DVEC *);
  void compute_yi_from_zi(const SNA_DVEC *);
  void compute_yterm(int, int, int, const double *);
  void compute_bi(const SNA_IVEC &);
  // functions for derivatives
  void compute_duidrj(const int, const SNA_IVEC &);
  void compute_deidrj_e(const int, const SNA_IVEC &, SNA_DVEC *);
  void compute_deidrj(const int, const SNA_IVEC &, SNA_DVEC *);
  // compute_sfac is overloaded: one scalar form and one SIMD-vector form
  double compute_sfac(double, double, double, double);
  SNA_DVEC compute_sfac(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
                        const SNA_DVEC &);
  // NOTE(review): declared inline, but no definition is visible in this header
  inline SNA_DVEC compute_sfac_dsfac(const SNA_DVEC &, const SNA_DVEC &,
                                     const SNA_DVEC &, const SNA_DVEC &,
                                     SNA_DVEC &);
  // public bispectrum data
  int twojmax;
  SNA_DVEC *blist;
  double **dblist;
  // short neighbor list data
  void grow_rij(int);
  int nmax;    // allocated size of short lists
  SNA_DVEC **rij;      // short rij list
  SNA_IVEC *inside;       // short neighbor list
  SNA_DVEC *wj;        // short weight list
  SNA_DVEC *rcutij;    // short cutoff list
  // only allocated for switch_inner_flag=1
  SNA_DVEC *sinnerij;    // short inner cutoff midpoint list
  SNA_DVEC *dinnerij;    // short inner half-width list
  // only allocated for chem_flag=1
  SNA_IVEC *element;    // short element list [0,nelements)
 private:
  double rmin0, rfac0;
  // data for bispectrum coefficients
  SNA_ZINDICES *idxz;
  SNA_BINDICES *idxb;
  double **rootpqarray;
  double *cglist;
  int ***idxcg_block;
  SNA_DVEC *ulisttot_r, *ulisttot_i;
  SNA_DVEC **ulist_r_ij, **ulist_i_ij;    // short u list
  int *idxu_block;
  SNA_DVEC *zlist_r, *zlist_i;
  int ***idxz_block;
  int ***idxb_block;
  SNA_DVEC **dulist_r, **dulist_i;
  SNA_DVEC *ylist_r, *ylist_i;
  int idxcg_max, idxu_max, idxz_max, idxb_max;
  // private helpers (definitions are not in this header)
  void create_twojmax_arrays();
  void destroy_twojmax_arrays();
  void init_clebsch_gordan();
  void print_clebsch_gordan();
  void init_rootpqarray();
  void zero_uarraytot(const SNA_IVEC &);
  void add_uarraytot(const SNA_DVEC &, const int, const SNA_IVEC &);
  void compute_uarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
                      const SNA_DVEC &, const SNA_DVEC &, const int,
                      const SNA_IVEC &);
  double deltacg(int, int, int);
  void compute_ncoeff();
  void compute_duarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
                       const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &,
                       const SNA_DVEC &, const SNA_DVEC &, int,
                       const SNA_IVEC &);
  // NOTE(review): declared inline, but no definition is visible in this header
  inline double choose_beta(const int, const int, const int,
                            const int, const int, const int,  int &);
  // Sets the style for the switching function
  // 0 = none
  // 1 = cosine
  int switch_flag;
  // Sets the style for the inner switching function
  // 0 = none
  // 1 = cosine
  int switch_inner_flag;
  // Self-weight
  double wself;
  int bzero_flag;       // 1 if bzero subtracted from barray
  double *bzero;        // array of B values for isolated atoms
  int bnorm_flag;       // 1 if barray divided by j+1
  int chem_flag;        // 1 for multi-element bispectrum components
  int wselfall_flag;    // 1 for adding wself to all element labelings
  int nelements;        // number of elements
  int ndoubles;         // number of multi-element pairs
  int ntriples;         // number of multi-element triplets
 };
 }    // namespace LAMMPS_NS
 #endif
 #endif
 #endif