diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index c45a1d778c..0d54913bd7 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -265,7 +265,7 @@ OPT. * :doc:`smd/tri_surface ` * :doc:`smd/ulsph ` * :doc:`smtbq ` - * :doc:`snap (k) ` + * :doc:`snap (ik) ` * :doc:`soft (go) ` * :doc:`sph/heatconduction ` * :doc:`sph/idealgas ` diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst index ebedb288c1..ffc43c712a 100644 --- a/doc/src/pair_snap.rst +++ b/doc/src/pair_snap.rst @@ -1,10 +1,11 @@ .. index:: pair_style snap +.. index:: pair_style snap/intel .. index:: pair_style snap/kk pair_style snap command ======================= -Accelerator Variants: *snap/kk* +Accelerator Variants: *snap/intel*, *snap/kk* Syntax """""" @@ -260,6 +261,14 @@ This style is part of the ML-SNAP package. It is only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. +The *snap/intel* accelerator variant will *only* be available if LAMMPS +is built with Intel *compilers* and for CPUs with AVX-512 support. +While the INTEL package in general allows multiple floating point +precision modes to be selected, *snap/intel* will currently always use +full double precision regardless of the precision mode selected. +Additionally, the *intel* variant of snap will **NOT** use multiple +threads with OpenMP. + Related commands """""""""""""""" diff --git a/src/Depend.sh b/src/Depend.sh index 4004f12686..b88c527b55 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -185,6 +185,7 @@ fi if (test $1 = "ML-SNAP") then depend ML-IAP depend KOKKOS + depend INTEL fi if (test $1 = "CG-SPICA") then diff --git a/src/INTEL/TEST/in.intel.snap b/src/INTEL/TEST/in.intel.snap new file mode 100644 index 0000000000..4e45fe01f5 --- /dev/null +++ b/src/INTEL/TEST/in.intel.snap @@ -0,0 +1,70 @@ +# Toy demonstration of SNAP "scale" parameter, using fix/adapt and hybrid/overlay +# Mixing linear and quadratic SNAP Ni potentials by Zuo et al. JCPA 2020 + +variable w index 10 # Warmup Timesteps +variable t index 100 # Main Run Timesteps +variable m index 1 # Main Run Timestep Multiplier +variable n index 0 # Use NUMA Mapping for Multi-Node + +variable x index 4 +variable y index 2 +variable z index 2 + +variable rr equal floor($t*$m) +variable root getenv LMP_ROOT + +if "$n > 0" then "processors * * * grid numa" + +# mixing parameter + +variable lambda equal 0.2 + +# Initialize simulation + +variable a equal 3.52 +units metal + +# generate the box and atom positions using a FCC lattice +variable nx equal 20*$x +variable ny equal 20*$y +variable nz equal 20*$z + +boundary p p p + +lattice fcc $a +region box block 0 ${nx} 0 ${ny} 0 ${nz} +create_box 1 box +create_atoms 1 box + +mass 1 34. + +# choose bundled SNAP Ni potential from Zuo et al. JCPA 2020 +pair_style hybrid/overlay snap snap +pair_coeff * * snap 1 & + ${root}/examples/snap/Ni_Zuo_JPCA2020.snapcoeff & + ${root}/examples/snap/Ni_Zuo_JPCA2020.snapparam Ni +pair_coeff * * snap 2 & + ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapcoeff & + ${root}/examples/snap/Ni_Zuo_JPCA2020.quadratic.snapparam Ni + +# scale according to mixing parameter +variable l1 equal ${lambda} +variable l2 equal 1.0-${lambda} +fix scale1 all adapt 1 pair snap:1 scale * * v_l1 +fix scale2 all adapt 1 pair snap:2 scale * * v_l2 + +# Setup output +thermo 1 +thermo_modify norm yes + +# Set up NVE run +timestep 0.5e-3 +neighbor 1.0 bin +neigh_modify every 1 delay 0 check yes + +# Run MD +velocity all create 300.0 4928459 loop geom +fix 1 all nve + +if "$w > 0" then "run $w" +run ${rr} diff --git a/src/INTEL/TEST/run_benchmarks.sh b/src/INTEL/TEST/run_benchmarks.sh index 82eb51c928..eeb9f07a11 100755 --- a/src/INTEL/TEST/run_benchmarks.sh +++ b/src/INTEL/TEST/run_benchmarks.sh @@ -35,7 +35,7 @@ export I_MPI_PIN_DOMAIN=core # End settings for your system ######################################################################### -export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff" +export WORKLOADS="lj rhodo lc sw water eam airebo dpd tersoff snap" export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1" export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1" diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 37842621dc..9f31580dd2 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -46,13 +46,38 @@ namespace ip_simd { typedef __mmask16 SIMD_mask; + inline bool any(const SIMD_mask &m) { return m != 0; } + struct SIMD_int { __m512i v; SIMD_int() {} SIMD_int(const __m512i in) : v(in) {} + inline int & operator[](const int i) { return ((int *)&(v))[i]; } + inline const int & operator[](const int i) const + { return ((int *)&(v))[i]; } operator __m512i() const { return v;} }; + struct SIMD256_int { + __m256i v; + SIMD256_int() {} + SIMD256_int(const __m256i in) : v(in) {} + SIMD256_int(const int in) : v(_mm256_set1_epi32(in)) {} + inline int & operator[](const int i) { return ((int *)&(v))[i]; } + inline const int & operator[](const int i) const + { return ((int *)&(v))[i]; } +#ifdef __INTEL_LLVM_COMPILER + inline SIMD256_int operator&=(const int i) + { v=_mm256_and_epi32(v, _mm256_set1_epi32(i)); return *this; }; +#else + inline SIMD256_int operator&=(const int i) + { v=_mm256_and_si256(v, _mm256_set1_epi32(i)); return *this; }; +#endif + inline SIMD256_int operator+=(const int i) + { v=_mm256_add_epi32(v, _mm256_set1_epi32(i)); return *this; }; + operator __m256i() const { return v;} + }; + struct SIMD_float { __m512 v; SIMD_float() {} @@ -64,7 +89,24 @@ namespace ip_simd { __m512d v; SIMD_double() {} SIMD_double(const __m512d in) : v(in) {} + SIMD_double(const double in) { v=_mm512_set1_pd(in); } + inline double & operator[](const int i) { return ((double *)&(v))[i]; } + inline const double & operator[](const int i) const + { return ((double *)&(v))[i]; } operator __m512d() const { return v;} + + SIMD_double & operator=(const double i) + { _mm512_set1_pd(i); return *this; } + SIMD_double &operator=(const SIMD_double &i) + { v = i.v; return *this; } + + SIMD_double operator-() { return _mm512_xor_pd(v, _mm512_set1_pd(-0.0)); } + SIMD_double & operator+=(const SIMD_double & two) + { v = _mm512_add_pd(v, two.v); return *this; } + SIMD_double & operator-=(const SIMD_double & two) + { v = _mm512_sub_pd(v, two.v); return *this; } + SIMD_double & operator*=(const SIMD_double & two) + { v = _mm512_mul_pd(v, two.v); return *this; } }; template @@ -99,6 +141,12 @@ namespace ip_simd { // ------- Set Operations + inline SIMD256_int SIMD256_set(const int l0, const int l1, const int l2, + const int l3, const int l4, const int l5, + const int l6, const int l7) { + return _mm256_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7); + } + inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, const int l3, const int l4, const int l5, const int l6, const int l7, const int l8, @@ -109,6 +157,10 @@ namespace ip_simd { l8,l9,l10,l11,l12,l13,l14,l15); } + inline SIMD256_int SIMD256_set(const int l) { + return _mm256_set1_epi32(l); + } + inline SIMD_int SIMD_set(const int l) { return _mm512_set1_epi32(l); } @@ -121,6 +173,10 @@ namespace ip_simd { return _mm512_set1_pd(l); } + inline SIMD256_int SIMD256_count() { + return SIMD256_set(0,1,2,3,4,5,6,7); + } + inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) { return _mm512_maskz_mov_epi32(m, one); } @@ -147,6 +203,10 @@ namespace ip_simd { // -------- Load Operations + inline SIMD256_int SIMD_load(const SIMD256_int *p) { + return _mm256_load_epi32((int *)p); + } + inline SIMD_int SIMD_load(const int *p) { return _mm512_load_epi32(p); } @@ -159,6 +219,10 @@ namespace ip_simd { return _mm512_load_pd(p); } + inline SIMD_double SIMD_load(const SIMD_double *p) { + return _mm512_load_pd((double *)p); + } + inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) { return _mm512_maskz_load_epi32(m, p); } @@ -171,6 +235,10 @@ namespace ip_simd { return _mm512_maskz_load_pd(m, p); } + inline SIMD256_int SIMD_gather(const int *p, const SIMD256_int &i) { + return _mm256_i32gather_epi32(p, i, _MM_SCALE_4); + } + inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) { return _mm512_i32gather_epi32(i, p, _MM_SCALE_4); } @@ -179,6 +247,10 @@ namespace ip_simd { return _mm512_i32gather_ps(i, p, _MM_SCALE_4); } + inline SIMD_double SIMD_gather(const double *p, const SIMD256_int &i) { + return _mm512_i32gather_pd(i, p, _MM_SCALE_8); + } + inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) { return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8); } @@ -201,6 +273,12 @@ namespace ip_simd { _mm512_castsi512_si256(i), p, _MM_SCALE_8); } + inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, + const SIMD256_int &i) { + return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, + i, p, _MM_SCALE_8); + } + template inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p, const SIMD_int &i) { @@ -252,6 +330,15 @@ namespace ip_simd { return _mm512_store_pd(p,one); } + inline void SIMD_store(SIMD_double *p, const SIMD_double &one) { + return _mm512_store_pd((double *)p,one); + } + + inline void SIMD_scatter(const SIMD_mask &m, int *p, + const SIMD256_int &i, const SIMD256_int &vec) { + _mm256_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4); + } + inline void SIMD_scatter(const SIMD_mask &m, int *p, const SIMD_int &i, const SIMD_int &vec) { _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4); @@ -268,8 +355,22 @@ namespace ip_simd { _MM_SCALE_8); } + inline void SIMD_scatter(const SIMD_mask &m, double *p, + const SIMD256_int &i, const SIMD_double &vec) { + _mm512_mask_i32scatter_pd(p, m, i, vec, _MM_SCALE_8); + } + + inline void SIMD_scatter(double *p, + const SIMD256_int &i, const SIMD_double &vec) { + _mm512_i32scatter_pd(p, i, vec, _MM_SCALE_8); + } + // ------- Arithmetic Operations + inline SIMD256_int operator+(const SIMD256_int &one, const SIMD256_int &two) { + return _mm256_add_epi32(one,two); + } + inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) { return _mm512_add_epi32(one,two); } @@ -286,6 +387,10 @@ namespace ip_simd { return _mm512_add_epi32(one,SIMD_set(two)); } + inline SIMD256_int operator+(const SIMD256_int &one, const int two) { + return _mm256_add_epi32(one,SIMD256_set(two)); + } + inline SIMD_float operator+(const SIMD_float &one, const float two) { return _mm512_add_ps(one,SIMD_set(two)); } @@ -299,6 +404,11 @@ namespace ip_simd { return _mm512_mask_add_epi32(one,m,one,SIMD_set(two)); } + inline SIMD256_int SIMD_add(const SIMD_mask &m, + const SIMD256_int &one, const int two) { + return _mm256_mask_add_epi32(one,m,one,SIMD256_set(two)); + } + inline SIMD_float SIMD_add(const SIMD_mask &m, const SIMD_float &one, const float two) { return _mm512_mask_add_ps(one,m,one,SIMD_set(two)); @@ -309,6 +419,11 @@ namespace ip_simd { return _mm512_mask_add_pd(one,m,one,SIMD_set(two)); } + inline SIMD_double SIMD_add(const SIMD_mask &m, + const SIMD_double &one, const SIMD_double &two) { + return _mm512_mask_add_pd(one,m,one,two); + } + inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m, const SIMD_int &one, const SIMD_int &two) { return _mm512_mask_add_epi32(s,m,one,two); @@ -387,6 +502,10 @@ namespace ip_simd { return _mm512_mul_pd(one,two); } + inline SIMD256_int operator*(const SIMD256_int &one, const int two) { + return _mm256_mullo_epi32(one,SIMD256_set(two)); + } + inline SIMD_int operator*(const SIMD_int &one, const int two) { return _mm512_mullo_epi32(one,SIMD_set(two)); } @@ -417,6 +536,12 @@ namespace ip_simd { return _mm512_fmadd_pd(one,two,three); } + inline SIMD_double SIMD_fma(const SIMD_mask m, const SIMD_double &one, + const SIMD_double &two, + const SIMD_double &three) { + return _mm512_mask3_fmadd_pd(one,two,three,m); + } + inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two, const SIMD_float &three) { return _mm512_fmsub_ps(one,two,three); @@ -493,6 +618,10 @@ namespace ip_simd { return _mm512_pow_pd(one, two); } + inline SIMD_double SIMD_pow(const SIMD_double &one, const double two) { + return _mm512_pow_pd(one, SIMD_set(two)); + } + inline SIMD_float SIMD_exp(const SIMD_float &one) { return _mm512_exp_ps(one); } @@ -501,6 +630,18 @@ namespace ip_simd { return _mm512_exp_pd(one); } + inline SIMD_double SIMD_cos(const SIMD_double &one) { + return _mm512_cos_pd(one); + } + + inline SIMD_double SIMD_sin(const SIMD_double &one) { + return _mm512_sin_pd(one); + } + + inline SIMD_double SIMD_tan(const SIMD_double &one) { + return _mm512_tan_pd(one); + } + // ------- Comparison operations inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, @@ -533,6 +674,14 @@ namespace ip_simd { return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two); } + inline SIMD_mask operator<(const SIMD256_int &one, const SIMD256_int &two) { + return _mm256_cmplt_epi32_mask(one,two); + } + + inline SIMD_mask operator<(const int one, const SIMD256_int &two) { + return _mm256_cmplt_epi32_mask(SIMD256_set(one),two); + } + inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) { return _mm512_cmplt_epi32_mask(one,two); } @@ -577,6 +726,10 @@ namespace ip_simd { return _mm512_cmple_ps_mask(SIMD_set(one), two); } + inline SIMD_mask operator<=(const SIMD_double &one, const SIMD_double &two) { + return _mm512_cmple_pd_mask(one, two); + } + inline SIMD_mask operator<=(const double one, const SIMD_double &two) { return _mm512_cmple_pd_mask(SIMD_set(one), two); } @@ -593,6 +746,14 @@ namespace ip_simd { return _mm512_cmplt_pd_mask(two,one); } + inline SIMD_mask operator>(const SIMD_double &one, const double two) { + return _mm512_cmplt_pd_mask(SIMD_set(two),one); + } + + inline SIMD_mask operator==(const SIMD256_int &one, const int two) { + return _mm256_cmpeq_epi32_mask(one,_mm256_set1_epi32(two)); + } + inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) { return _mm512_cmpeq_epi32_mask(one,two); } diff --git a/src/INTEL/pair_snap_intel.cpp b/src/INTEL/pair_snap_intel.cpp new file mode 100644 index 0000000000..d91f0adc36 --- /dev/null +++ b/src/INTEL/pair_snap_intel.cpp @@ -0,0 +1,779 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) + +#include "pair_snap_intel.h" + +#include "atom.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "memory.h" +#include "modify.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "sna_intel.h" +#include "tokenizer.h" + +#include +#include + +using namespace LAMMPS_NS; +using namespace ip_simd; + +#define MAXLINE 1024 +#define MAXWORD 3 + +/* ---------------------------------------------------------------------- */ + +PairSNAPIntel::PairSNAPIntel(LAMMPS *lmp) : Pair(lmp) +{ + single_enable = 0; + restartinfo = 0; + one_coeff = 1; + manybody_flag = 1; + centroidstressflag = CENTROID_NOTAVAIL; + + radelem = nullptr; + wjelem = nullptr; + coeffelem = nullptr; + sinnerelem = nullptr; + dinnerelem = nullptr; + + beta = nullptr; + bispectrum = nullptr; + snaptr = nullptr; +} + +/* ---------------------------------------------------------------------- */ + +PairSNAPIntel::~PairSNAPIntel() +{ + if (copymode) return; + + memory->destroy(radelem); + memory->destroy(wjelem); + memory->destroy(coeffelem); + memory->destroy(sinnerelem); + memory->destroy(dinnerelem); + + memory->destroy(beta); + memory->destroy(bispectrum); + + delete snaptr; + + if (allocated) { + memory->destroy(setflag); + memory->destroy(cutsq); + memory->destroy(scale); + } + +} + +/* ---------------------------------------------------------------------- + This version is a straightforward implementation + ---------------------------------------------------------------------- */ + +void PairSNAPIntel::compute(int eflag, int vflag) +{ + SNA_DVEC fij[3]; + int *jlist,*numneigh,**firstneigh; + + ev_init(eflag,vflag); + int tally_xyz = 0; + if (vflag_atom || (vflag && !vflag_fdotr)) tally_xyz = 1; + + double **x = atom->x; + double *_x = atom->x[0]; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int newton_pair = force->newton_pair; + + // compute dE_i/dB_i = beta_i for all i in list + + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + SNA_DVEC sevdwl(0); + + const int vw = snaptr->vector_width(); + for (int ii = 0; ii < list->inum; ii+=vw) { + SNA_IVEC i, jnum; + int max_jnum = 0; + for (int l = 0; l < vw; l++) { + if (ii + l < list->inum) { + i[l] = list->ilist[ii + l]; + jnum[l] = numneigh[i[l]]; + } else { + i[l] = list->ilist[0]; + jnum[l] = 0; + } + if (jnum[l] > max_jnum) max_jnum = jnum[l]; + } + + // ensure rij, inside, wj, and rcutij are of size jnum + + snaptr->grow_rij(max_jnum); + + SNA_IVEC zero_vec(0); + + const SNA_DVEC xtmp = SIMD_gather(_x, i * 3); + const SNA_DVEC ytmp = SIMD_gather(_x, i * 3 + 1); + const SNA_DVEC ztmp = SIMD_gather(_x, i * 3 + 2); + const SNA_IVEC itype = SIMD_gather(type, i); + const SNA_IVEC ielem = SIMD_gather(map, itype); + const SNA_DVEC radi = SIMD_gather(radelem, ielem); + + // rij[][3] = displacements between atom I and those neighbors + // inside = indices of neighbors of I within cutoff + // wj = weights for neighbors of I within cutoff + // rcutij = cutoffs for neighbors of I within cutoff + // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi + + SNA_IVEC ninside(0); + for (int jj = 0; jj < max_jnum; jj++) { + SIMD_mask m(SIMD256_set(jj) < jnum); + + SNA_IVEC j; + SV_for (int l = 0; l < vw; l++) { + jlist = firstneigh[i[l]]; + if (jj < jnum[l]) j[l] = jlist[jj]; + else j[l] = 0; + } + j &= NEIGHMASK; + + const SNA_DVEC delx = SIMD_gather(m, _x, j * 3) - xtmp; + const SNA_DVEC dely = SIMD_gather(m, _x, j * 3 + 1) - ytmp; + const SNA_DVEC delz = SIMD_gather(m, _x, j * 3 + 2) - ztmp; + const SNA_IVEC jtype = SIMD_gather(type, j); + const SNA_DVEC rsq = delx*delx + dely*dely + delz*delz; + const SNA_DVEC vcut = SIMD_gather(m, cutsq[0], + itype * (atom->ntypes + 1) + jtype); + + m &= rsq < vcut; + m &= rsq > SIMD_set(1e-20); + const SNA_IVEC jelem = SIMD_gather(map, jtype); + const SNA_IVEC ni3 = ninside * vw * 3 + SIMD256_count(); + SIMD_scatter(m, (double *)(snaptr->rij[0]), ni3, delx); + SIMD_scatter(m, (double *)(snaptr->rij[0] + 1), ni3, dely); + SIMD_scatter(m, (double *)(snaptr->rij[0] + 2), ni3, delz); + const SNA_IVEC ni = ninside * vw + SIMD256_count(); + SIMD_scatter(m, (int *)(snaptr->inside), ni, j); + SIMD_scatter(m, (double *)(snaptr->wj), ni, + SIMD_gather(m, wjelem, jelem)); + SIMD_scatter(m, (double *)(snaptr->rcutij), ni, + (radi + SIMD_gather(m, radelem, jelem)) * rcutfac); + if (switchinnerflag) { + SIMD_scatter(m, (double *)(snaptr->sinnerij), ni, + (SIMD_gather(m, sinnerelem, ielem) + + SIMD_gather(m, sinnerelem, jelem)) * 0.5); + SIMD_scatter(m, (double *)(snaptr->dinnerij), ni, + (SIMD_gather(m, dinnerelem, ielem) + + SIMD_gather(m, dinnerelem, jelem)) * 0.5); + } + if (chemflag) + SIMD_scatter(m, (int *)(snaptr->element), ni, jelem); + ninside = SIMD_add(m, ninside, 1); + } // for jj + + // compute Ui, Yi for atom I + + if (chemflag) + snaptr->compute_ui(ninside, ielem, max_jnum); + else + snaptr->compute_ui(ninside, zero_vec, max_jnum); + + // Compute bispectrum + if (quadraticflag || eflag) { + snaptr->compute_zi_or_yi<0>(beta); + if (chemflag) + snaptr->compute_bi(ielem); + else + snaptr->compute_bi(zero_vec); + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + SIMD_store(bispectrum + icoeff, SIMD_load(snaptr->blist + icoeff)); + } + + // Compute beta + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + SIMD_store(beta + icoeff, SIMD_gather(coeffelem[0], + ielem * ncoeffall + icoeff + 1)); + + if (quadraticflag) { + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + SNA_DVEC bveci = SIMD_load(bispectrum + icoeff); + SNA_DVEC beta_i = SIMD_load(beta + icoeff) + + SIMD_gather(coeffelem[0], ielem * ncoeffall + k) * bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const SNA_DVEC ci = SIMD_gather(coeffelem[0], ielem * ncoeffall + k); + beta_i = beta_i + ci * SIMD_load(bispectrum + jcoeff); + SIMD_store(beta + jcoeff, ci * bveci + SIMD_load(beta + jcoeff)); + k++; + } + SIMD_store(beta + icoeff, beta_i); + } + } + + // for neighbors of I within cutoff: + // compute Fij = dEi/dRj = -dEi/dRi + // add to Fi, subtract from Fj + // scaling is that for type I + + if (quadraticflag || eflag) + snaptr->compute_yi_from_zi(beta); + else + snaptr->compute_zi_or_yi<1>(beta); + + SNA_DVEC fi_x(0.0), fi_y(0.0), fi_z(0.0); + SNA_DVEC scalev = SIMD_gather(scale[0], itype * (atom->ntypes+1) + itype); + for (int jj = 0; jj < max_jnum; jj++) { + snaptr->compute_duidrj(jj, ninside); + if (chemflag && nelements > 1) + snaptr->compute_deidrj_e(jj, ninside, fij); + else + snaptr->compute_deidrj(jj, ninside, fij); + + SNA_DVEC fijs_x = fij[0] * scalev; + SNA_DVEC fijs_y = fij[1] * scalev; + SNA_DVEC fijs_z = fij[2] * scalev; + + fi_x += fijs_x; + fi_y += fijs_y; + fi_z += fijs_z; + + for (int l = 0; l < vw; l++) { + if (jj < ninside[l]) { + int j = snaptr->inside[jj][l]; + f[j][0] -= fijs_x[l]; + f[j][1] -= fijs_y[l]; + f[j][2] -= fijs_z[l]; + + if (tally_xyz) + ev_tally_xyz(i[l],j,nlocal,newton_pair,0.0,0.0, + fij[0][l],fij[1][l],fij[2][l], + -snaptr->rij[jj][0][l],-snaptr->rij[jj][1][l], + -snaptr->rij[jj][2][l]); + } + } // for l + } // for jj + SIMD_mask m((SIMD256_count() + ii) < list->inum); + SNA_DVEC fix = SIMD_gather(m, f[0], i * 3) + fi_x; + SIMD_scatter(m, f[0], i * 3, fix); + SNA_DVEC fiy = SIMD_gather(m, f[0], i * 3 + 1) + fi_y; + SIMD_scatter(m, f[0], i * 3 + 1, fiy); + SNA_DVEC fiz = SIMD_gather(m, f[0], i * 3 + 2) + fi_z; + SIMD_scatter(m, f[0], i * 3 + 2, fiz); + + // tally energy contribution + + if (eflag) { + SNA_DVEC evdwl = SIMD_gather(coeffelem[0], ielem * ncoeffall); + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + evdwl += SIMD_gather(coeffelem[0], ielem * ncoeffall + icoeff +1) * + bispectrum[icoeff]; + + if (quadraticflag) { + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + SNA_DVEC bveci = SIMD_load(bispectrum + icoeff); + SNA_DVEC c = SIMD_gather(coeffelem[0], ielem * ncoeffall + k); + k++; + evdwl += c * 0.5 * bveci * bveci; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + SNA_DVEC bvecj = SIMD_load(bispectrum + jcoeff); + SNA_DVEC cj = SIMD_gather(coeffelem[0], ielem * ncoeffall + k); + k++; + evdwl += cj * bveci * bvecj; + } + } + } + sevdwl += scalev * evdwl; + if (eatom) { + SNA_DVEC ea = SIMD_gather(m, eatom, i) + scalev * evdwl; + SIMD_scatter(m, eatom, i, ea); + } + } // if (eflag) + } // for ii + if (eflag) eng_vdwl += SIMD_sum(sevdwl); + if (vflag_fdotr) virial_fdotr_compute(); +} + +/* ---------------------------------------------------------------------- + allocate all arrays +------------------------------------------------------------------------- */ + +void PairSNAPIntel::allocate() +{ + allocated = 1; + int n = atom->ntypes; + memory->create(setflag,n+1,n+1,"pair:setflag"); + memory->create(cutsq,n+1,n+1,"pair:cutsq"); + memory->create(scale,n+1,n+1,"pair:scale"); + map = new int[n+1]; +} + +/* ---------------------------------------------------------------------- + global settings +------------------------------------------------------------------------- */ + +void PairSNAPIntel::settings(int narg, char ** /* arg */) +{ + if (narg > 0) + error->all(FLERR,"Illegal pair_style command"); + if ((comm->me == 0) && (comm->nthreads > 1)) + error->warning(FLERR, "Pair style snap/intel does not use OpenMP threads"); +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs +------------------------------------------------------------------------- */ + +void PairSNAPIntel::coeff(int narg, char **arg) +{ + if (!allocated) allocate(); + if (narg != 4 + atom->ntypes) error->all(FLERR,"Incorrect args for pair coefficients"); + + map_element2type(narg-4,arg+4); + + // read snapcoeff and snapparam files + + read_files(arg[2],arg[3]); + + if (!quadraticflag) + ncoeff = ncoeffall - 1; + else { + + // ncoeffall should be (ncoeff+2)*(ncoeff+1)/2 + // so, ncoeff = floor(sqrt(2*ncoeffall))-1 + + ncoeff = sqrt(2.0*ncoeffall)-1; + ncoeffq = (ncoeff*(ncoeff+1))/2; + int ntmp = 1+ncoeff+ncoeffq; + if (ntmp != ncoeffall) { + error->all(FLERR,"Incorrect SNAP coeff file"); + } + } + + snaptr = new SNAIntel(lmp, rfac0, twojmax, + rmin0, switchflag, bzeroflag, + chemflag, bnormflag, wselfallflag, + nelements, switchinnerflag); + + if (ncoeff != snaptr->ncoeff) { + if (comm->me == 0) + printf("ncoeff = %d snancoeff = %d \n",ncoeff,snaptr->ncoeff); + error->all(FLERR,"Incorrect SNAP parameter file"); + } + + // Calculate maximum cutoff for all elements + rcutmax = 0.0; + for (int ielem = 0; ielem < nelements; ielem++) + rcutmax = MAX(2.0*radelem[ielem]*rcutfac,rcutmax); + + // set default scaling + int n = atom->ntypes; + for (int ii = 0; ii < n+1; ii++) + for (int jj = 0; jj < n+1; jj++) + scale[ii][jj] = 1.0; + +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSNAPIntel::init_style() +{ + if (force->newton_pair == 0) + error->all(FLERR,"Pair style SNAP requires newton pair on"); + + // need a full neighbor list + + neighbor->add_request(this, NeighConst::REQ_FULL); + + snaptr->init(); + + fix = static_cast(modify->get_fix_by_id("package_intel")); + if (!fix) error->all(FLERR, "The 'package intel' command is required for /intel styles"); + + fix->pair_init_check(); + + memory->create(bispectrum,ncoeff,"PairSNAP:bispectrum"); + memory->create(beta,ncoeff,"PairSNAP:beta"); +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +double PairSNAPIntel::init_one(int i, int j) +{ + if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set"); + scale[j][i] = scale[i][j]; + return (radelem[map[i]] + + radelem[map[j]])*rcutfac; +} + +/* ---------------------------------------------------------------------- */ + +void PairSNAPIntel::read_files(char *coefffilename, char *paramfilename) +{ + + // open SNAP coefficient file on proc 0 + + FILE *fpcoeff; + if (comm->me == 0) { + fpcoeff = utils::open_potential(coefffilename,lmp,nullptr); + if (fpcoeff == nullptr) + error->one(FLERR,"Cannot open SNAP coefficient file {}: ", + coefffilename, utils::getsyserror()); + } + + char line[MAXLINE],*ptr; + int eof = 0; + int nwords = 0; + while (nwords == 0) { + if (comm->me == 0) { + ptr = fgets(line,MAXLINE,fpcoeff); + if (ptr == nullptr) { + eof = 1; + fclose(fpcoeff); + } + } + MPI_Bcast(&eof,1,MPI_INT,0,world); + if (eof) break; + MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world); + + // strip comment, skip line if blank + + nwords = utils::count_words(utils::trim_comment(line)); + } + if (nwords != 2) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + + // strip single and double quotes from words + + int nelemtmp = 0; + try { + ValueTokenizer words(utils::trim_comment(line),"\"' \t\n\r\f"); + nelemtmp = words.next_int(); + ncoeffall = words.next_int(); + } catch (TokenizerException &e) { + error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what()); + } + + // clean out old arrays and set up element lists + + memory->destroy(radelem); + memory->destroy(wjelem); + memory->destroy(coeffelem); + memory->destroy(sinnerelem); + memory->destroy(dinnerelem); + memory->create(radelem,nelements,"pair:radelem"); + memory->create(wjelem,nelements,"pair:wjelem"); + memory->create(coeffelem,nelements,ncoeffall,"pair:coeffelem"); + memory->create(sinnerelem,nelements,"pair:sinnerelem"); + memory->create(dinnerelem,nelements,"pair:dinnerelem"); + + // initialize checklist for all required nelements + + int *elementflags = new int[nelements]; + for (int jelem = 0; jelem < nelements; jelem++) + elementflags[jelem] = 0; + + // loop over nelemtmp blocks in the SNAP coefficient file + + for (int ielem = 0; ielem < nelemtmp; ielem++) { + + if (comm->me == 0) { + ptr = fgets(line,MAXLINE,fpcoeff); + if (ptr == nullptr) { + eof = 1; + fclose(fpcoeff); + } + } + MPI_Bcast(&eof,1,MPI_INT,0,world); + if (eof) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world); + + std::vector words; + try { + words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector(); + } catch (TokenizerException &) { + // ignore + } + if (words.size() != 3) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + + int jelem; + for (jelem = 0; jelem < nelements; jelem++) + if (words[0] == elements[jelem]) break; + + // if this element not needed, skip this block + + if (jelem == nelements) { + if (comm->me == 0) { + for (int icoeff = 0; icoeff < ncoeffall; icoeff++) { + ptr = fgets(line,MAXLINE,fpcoeff); + if (ptr == nullptr) { + eof = 1; + fclose(fpcoeff); + } + } + } + MPI_Bcast(&eof,1,MPI_INT,0,world); + if (eof) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + continue; + } + + if (elementflags[jelem] == 1) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + else + elementflags[jelem] = 1; + + radelem[jelem] = utils::numeric(FLERR,words[1],false,lmp); + wjelem[jelem] = utils::numeric(FLERR,words[2],false,lmp); + + if (comm->me == 0) + utils::logmesg(lmp,"SNAP Element = {}, Radius {}, Weight {}\n", + elements[jelem], radelem[jelem], wjelem[jelem]); + + for (int icoeff = 0; icoeff < ncoeffall; icoeff++) { + if (comm->me == 0) { + ptr = fgets(line,MAXLINE,fpcoeff); + if (ptr == nullptr) { + eof = 1; + fclose(fpcoeff); + } + } + + MPI_Bcast(&eof,1,MPI_INT,0,world); + if (eof) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world); + + try { + ValueTokenizer coeff(utils::trim_comment(line)); + if (coeff.count() != 1) + error->all(FLERR,"Incorrect format in SNAP coefficient file"); + + coeffelem[jelem][icoeff] = coeff.next_double(); + } catch (TokenizerException &e) { + error->all(FLERR,"Incorrect format in SNAP coefficient file: {}", e.what()); + } + } + } + + if (comm->me == 0) fclose(fpcoeff); + + for (int jelem = 0; jelem < nelements; jelem++) { + if (elementflags[jelem] == 0) + error->all(FLERR,"Element {} not found in SNAP coefficient file", elements[jelem]); + } + delete[] elementflags; + + // set flags for required keywords + + rcutfacflag = 0; + twojmaxflag = 0; + + // Set defaults for optional keywords + + rfac0 = 0.99363; + rmin0 = 0.0; + switchflag = 1; + bzeroflag = 1; + quadraticflag = 0; + chemflag = 0; + bnormflag = 0; + wselfallflag = 0; + switchinnerflag = 0; + chunksize = 32768; + parallel_thresh = 8192; + + // set local input checks + + int sinnerflag = 0; + int dinnerflag = 0; + + // open SNAP parameter file on proc 0 + + FILE *fpparam; + if (comm->me == 0) { + fpparam = utils::open_potential(paramfilename,lmp,nullptr); + if (fpparam == nullptr) + error->one(FLERR,"Cannot open SNAP parameter file {}: {}", + paramfilename, utils::getsyserror()); + } + + eof = 0; + while (true) { + if (comm->me == 0) { + ptr = fgets(line,MAXLINE,fpparam); + if (ptr == nullptr) { + eof = 1; + fclose(fpparam); + } + } + MPI_Bcast(&eof,1,MPI_INT,0,world); + if (eof) break; + MPI_Bcast(line,MAXLINE,MPI_CHAR,0,world); + + // words = ptrs to all words in line + // strip single and double quotes from words + + std::vector words; + try { + words = Tokenizer(utils::trim_comment(line),"\"' \t\n\r\f").as_vector(); + } catch (TokenizerException &) { + // ignore + } + + if (words.size() == 0) continue; + + if (words.size() < 2) + error->all(FLERR,"Incorrect format in SNAP parameter file"); + + auto keywd = words[0]; + auto keyval = words[1]; + + // check for keywords with more than one value per element + + if (keywd == "sinner" || keywd == "dinner") { + + if ((int)words.size() != nelements+1) + error->all(FLERR,"Incorrect SNAP parameter file"); + + // innerlogstr collects all values of sinner or dinner for log output below + + std::string innerlogstr; + + int iword = 1; + + if (keywd == "sinner") { + for (int ielem = 0; ielem < nelements; ielem++) { + keyval = words[iword]; + sinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp); + iword++; + innerlogstr += keyval + " "; + } + sinnerflag = 1; + } else if (keywd == "dinner") { + for (int ielem = 0; ielem < nelements; ielem++) { + keyval = words[iword]; + dinnerelem[ielem] = utils::numeric(FLERR,keyval,false,lmp); + iword++; + innerlogstr += keyval + " "; + } + dinnerflag = 1; + } + + if (comm->me == 0) + utils::logmesg(lmp,"SNAP keyword {} {} ... \n", keywd, innerlogstr); + + } else { + + // all other keywords take one value + + if (nwords != 2) + error->all(FLERR,"Incorrect SNAP parameter file"); + + if (comm->me == 0) + utils::logmesg(lmp,"SNAP keyword {} {}\n",keywd,keyval); + + if (keywd == "rcutfac") { + rcutfac = utils::numeric(FLERR,keyval,false,lmp); + rcutfacflag = 1; + } else if (keywd == "twojmax") { + twojmax = utils::inumeric(FLERR,keyval,false,lmp); + twojmaxflag = 1; + } else if (keywd == "rfac0") + rfac0 = utils::numeric(FLERR,keyval,false,lmp); + else if (keywd == "rmin0") + rmin0 = utils::numeric(FLERR,keyval,false,lmp); + else if (keywd == "switchflag") + switchflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "bzeroflag") + bzeroflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "quadraticflag") + quadraticflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "chemflag") + chemflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "bnormflag") + bnormflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "wselfallflag") + wselfallflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "switchinnerflag") + switchinnerflag = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "chunksize") + chunksize = utils::inumeric(FLERR,keyval,false,lmp); + else if (keywd == "parallelthresh") + parallel_thresh = utils::inumeric(FLERR,keyval,false,lmp); + else + error->all(FLERR,"Unknown parameter '{}' in SNAP parameter file", keywd); + } + } + + if (rcutfacflag == 0 || twojmaxflag == 0) + error->all(FLERR,"Incorrect SNAP parameter file"); + + if (chemflag && nelemtmp != nelements) + error->all(FLERR,"Incorrect SNAP parameter file"); + + if (switchinnerflag && !(sinnerflag && dinnerflag)) + error->all(FLERR,"Incorrect SNAP parameter file"); + + if (!switchinnerflag && (sinnerflag || dinnerflag)) + error->all(FLERR,"Incorrect SNAP parameter file"); +} + +/* ---------------------------------------------------------------------- + memory usage +------------------------------------------------------------------------- */ + +double PairSNAPIntel::memory_usage() +{ + double bytes = Pair::memory_usage(); + + int n = atom->ntypes+1; + bytes += (double)n*n*sizeof(int); // setflag + bytes += (double)n*n*sizeof(double); // cutsq + bytes += (double)n*n*sizeof(double); // scale + bytes += (double)n*sizeof(int); // map + bytes += (double)ncoeff*sizeof(SNA_DVEC); // bispectrum + bytes += (double)ncoeff*sizeof(SNA_DVEC); // beta + + bytes += snaptr->memory_usage(); // SNA object + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +void *PairSNAPIntel::extract(const char *str, int &dim) +{ + dim = 2; + if (strcmp(str,"scale") == 0) return (void *) scale; + return nullptr; +} + +#endif +#endif diff --git a/src/INTEL/pair_snap_intel.h b/src/INTEL/pair_snap_intel.h new file mode 100644 index 0000000000..2dc758f244 --- /dev/null +++ b/src/INTEL/pair_snap_intel.h @@ -0,0 +1,83 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(snap/intel,PairSNAPIntel); +// clang-format on +#else + +#ifndef LMP_PAIR_SNAP_INTEL_H +#define LMP_PAIR_SNAP_INTEL_H + +#include "fix_intel.h" +#include "pair.h" + +namespace ip_simd { class SIMD_double; class SIMD_int; }; +#define SNA_DVEC ip_simd::SIMD_double +#define SNA_IVEC ip_simd::SIMD256_int + +namespace LAMMPS_NS { + +class PairSNAPIntel : public Pair { + public: + PairSNAPIntel(class LAMMPS *); + ~PairSNAPIntel() override; + void compute(int, int) override; + void settings(int, char **) override; + void coeff(int, char **) override; + void init_style() override; + double init_one(int, int) override; + double memory_usage() override; + void *extract(const char *, int &) override; + + double rcutfac, quadraticflag; // declared public to workaround gcc 4.9 + int ncoeff; // compiler bug, manifest in KOKKOS package + + protected: + FixIntel *fix; + + int ncoeffq, ncoeffall; + class SNAIntel *snaptr; + virtual void allocate(); + void read_files(char *, char *); + inline int equal(double *x, double *y); + inline double dist2(double *x, double *y); + + double rcutmax; // max cutoff for all elements + double *radelem; // element radii + double *wjelem; // elements weights + double **coeffelem; // element bispectrum coefficients + SNA_DVEC *beta; // betas for all atoms in list + SNA_DVEC *bispectrum; // bispectrum components for all atoms in list + double **scale; // for thermodynamic integration + int twojmax, switchflag, bzeroflag, bnormflag; + int chemflag, wselfallflag; + int switchinnerflag; // inner cutoff switch + double *sinnerelem; // element inner cutoff midpoint + double *dinnerelem; // element inner cutoff half-width + int chunksize, parallel_thresh; + double rfac0, rmin0, wj1, wj2; + int rcutfacflag, twojmaxflag; // flags for required parameters +}; + +} // namespace LAMMPS_NS + +#endif +#endif + +#endif +#endif diff --git a/src/INTEL/sna_intel.cpp b/src/INTEL/sna_intel.cpp new file mode 100644 index 0000000000..b83c90688d --- /dev/null +++ b/src/INTEL/sna_intel.cpp @@ -0,0 +1,1505 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W. Michael Brown, Intel +------------------------------------------------------------------------- */ + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) + +#include "sna_intel.h" + +#include "comm.h" +#include "error.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" + +#include + +using namespace std; +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; +using namespace ip_simd; + +/* ---------------------------------------------------------------------- + + this implementation is based on the method outlined + in Bartok[1], using formulae from VMK[2]. + + for the Clebsch-Gordan coefficients, we + convert the VMK half-integral labels + a, b, c, alpha, beta, gamma + to array offsets j1, j2, j, m1, m2, m + using the following relations: + + j1 = 2*a + j2 = 2*b + j = 2*c + + m1 = alpha+a 2*alpha = 2*m1 - j1 + m2 = beta+b or 2*beta = 2*m2 - j2 + m = gamma+c 2*gamma = 2*m - j + + in this way: + + -a <= alpha <= a + -b <= beta <= b + -c <= gamma <= c + + becomes: + + 0 <= m1 <= j1 + 0 <= m2 <= j2 + 0 <= m <= j + + and the requirement that + a+b+c be integral implies that + j1+j2+j must be even. + The requirement that: + + gamma = alpha+beta + + becomes: + + 2*m - j = 2*m1 - j1 + 2*m2 - j2 + + Similarly, for the Wigner U-functions U(J,m,m') we + convert the half-integral labels J,m,m' to + array offsets j,ma,mb: + + j = 2*J + ma = J+m + mb = J+m' + + so that: + + 0 <= j <= 2*Jmax + 0 <= ma, mb <= j. + + For the bispectrum components B(J1,J2,J) we convert to: + + j1 = 2*J1 + j2 = 2*J2 + j = 2*J + + and the requirement: + + |J1-J2| <= J <= J1+J2, for j1+j2+j integral + + becomes: + + |j1-j2| <= j <= j1+j2, for j1+j2+j even integer + + or + + j = |j1-j2|, |j1-j2|+2,...,j1+j2-2,j1+j2 + + [1] Albert Bartok-Partay, "Gaussian Approximation..." + Doctoral Thesis, Cambridge University, (2009) + + [2] D. A. Varshalovich, A. N. Moskalev, and V. K. Khersonskii, + "Quantum Theory of Angular Momentum," World Scientific (1988) + +------------------------------------------------------------------------- */ + +SNAIntel::SNAIntel(LAMMPS* lmp, double rfac0_in, int twojmax_in, + double rmin0_in, int switch_flag_in, int bzero_flag_in, + int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, + int nelements_in, int switch_inner_flag_in) : Pointers(lmp) +{ + wself = 1.0; + + rfac0 = rfac0_in; + rmin0 = rmin0_in; + switch_flag = switch_flag_in; + switch_inner_flag = switch_inner_flag_in; + bzero_flag = bzero_flag_in; + chem_flag = chem_flag_in; + bnorm_flag = bnorm_flag_in; + wselfall_flag = wselfall_flag_in; + + if (bnorm_flag != chem_flag) + lmp->error->warning(FLERR, "bnormflag and chemflag are not equal." + "This is probably not what you intended"); + + if (chem_flag) + nelements = nelements_in; + else + nelements = 1; + + twojmax = twojmax_in; + + compute_ncoeff(); + + rij = nullptr; + inside = nullptr; + wj = nullptr; + rcutij = nullptr; + sinnerij = nullptr; + dinnerij = nullptr; + element = nullptr; + nmax = 0; + idxz = nullptr; + idxb = nullptr; + ulist_r_ij = nullptr; + ulist_i_ij = nullptr; + + build_indexlist(); + create_twojmax_arrays(); + + if (bzero_flag) { + double www = wself*wself*wself; + for (int j = 0; j <= twojmax; j++) + if (bnorm_flag) + bzero[j] = www; + else + bzero[j] = www*(j+1); + } + +} + +/* ---------------------------------------------------------------------- */ + +SNAIntel::~SNAIntel() +{ + memory->destroy(rij); + memory->destroy(inside); + memory->destroy(wj); + memory->destroy(rcutij); + memory->destroy(sinnerij); + memory->destroy(dinnerij); + if (chem_flag) memory->destroy(element); + memory->destroy(ulist_r_ij); + memory->destroy(ulist_i_ij); + delete[] idxz; + delete[] idxb; + destroy_twojmax_arrays(); +} + +void SNAIntel::build_indexlist() +{ + + // index list for cglist + + int jdim = twojmax + 1; + memory->create(idxcg_block, jdim, jdim, jdim, + "sna:idxcg_block"); + + int idxcg_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + idxcg_block[j1][j2][j] = idxcg_count; + for (int m1 = 0; m1 <= j1; m1++) + for (int m2 = 0; m2 <= j2; m2++) + idxcg_count++; + } + idxcg_max = idxcg_count; + + // index list for uarray + // need to include both halves + + memory->create(idxu_block, jdim, + "sna:idxu_block"); + + int idxu_count = 0; + + for (int j = 0; j <= twojmax; j++) { + idxu_block[j] = idxu_count; + for (int mb = 0; mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxu_count++; + } + idxu_max = idxu_count; + + // index list for beta and B + + int idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) idxb_count++; + + idxb_max = idxb_count; + idxb = new SNA_BINDICES[idxb_max]; + + idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) { + idxb[idxb_count].j1 = j1; + idxb[idxb_count].j2 = j2; + idxb[idxb_count].j = j; + idxb_count++; + } + + // reverse index list for beta and b + + memory->create(idxb_block, jdim, jdim, jdim, + "sna:idxb_block"); + idxb_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + if (j >= j1) { + idxb_block[j1][j2][j] = idxb_count; + idxb_count++; + } + } + + // index list for zlist + + int idxz_count = 0; + + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) + idxz_count++; + + idxz_max = idxz_count; + idxz = new SNA_ZINDICES[idxz_max]; + + memory->create(idxz_block, jdim, jdim, jdim, + "sna:idxz_block"); + + idxz_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + idxz_block[j1][j2][j] = idxz_count; + + // find right beta[jjb] entry + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + idxz[idxz_count].j1 = j1; + idxz[idxz_count].j2 = j2; + idxz[idxz_count].j = j; + idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2); + idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2; + idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1; + idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2); + idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2; + idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1; + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + + const int jju = idxu_block[j] + (j+1)*mb + ma; + idxz[idxz_count].jju = jju; + + idxz_count++; + } + } +} + +/* ---------------------------------------------------------------------- */ + +void SNAIntel::init() +{ + init_clebsch_gordan(); + // print_clebsch_gordan(); + init_rootpqarray(); +} + +void SNAIntel::grow_rij(int newnmax) +{ + if (newnmax <= nmax) return; + + nmax = newnmax; + + memory->destroy(rij); + memory->destroy(inside); + memory->destroy(wj); + memory->destroy(rcutij); + memory->destroy(sinnerij); + memory->destroy(dinnerij); + if (chem_flag) memory->destroy(element); + memory->destroy(ulist_r_ij); + memory->destroy(ulist_i_ij); + memory->create(rij, nmax, 3, "pair:rij"); + memory->create(inside, nmax, "pair:inside"); + memory->create(wj, nmax, "pair:wj"); + memory->create(rcutij, nmax, "pair:rcutij"); + memory->create(sinnerij, nmax, "pair:sinnerij"); + memory->create(dinnerij, nmax, "pair:dinnerij"); + if (chem_flag) memory->create(element, nmax, "sna:element"); + memory->create(ulist_r_ij, nmax, idxu_max, "sna:ulist_ij"); + memory->create(ulist_i_ij, nmax, idxu_max, "sna:ulist_ij"); +} + +/* ---------------------------------------------------------------------- + compute Ui by summing over neighbors j +------------------------------------------------------------------------- */ + +void SNAIntel::compute_ui(const SNA_IVEC &jnum, const SNA_IVEC &ielem, + const int max_jnum) +{ + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + zero_uarraytot(ielem); + + for (int j = 0; j < max_jnum; j++) { + const SNA_DVEC x = rij[j][0]; + const SNA_DVEC y = rij[j][1]; + const SNA_DVEC z = rij[j][2]; + const SNA_DVEC rcut = rcutij[j]; + const SNA_DVEC rsq = x * x + y * y + z * z; + const SNA_DVEC r = SIMD_sqrt(rsq); + const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI; + const SNA_DVEC theta0 = (r - rmin0) * rscale0; + const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0)); + + compute_uarray(x, y, z, z0, r, j, jnum); + add_uarraytot(r, j, jnum); + } + +} + +/* ---------------------------------------------------------------------- + pick out right beta value +------------------------------------------------------------------------- */ + +double SNAIntel::choose_beta(const int j, const int j1, const int j2, + const int elem1, const int elem2, const int elem3, + int &itriple) +{ + double bfactor; + if (j >= j1) { + const int jjb = idxb_block[j1][j2][j]; + itriple = ((elem1 * nelements + elem2) * nelements + elem3) * + idxb_max + jjb; + if (j1 == j) { + if (j2 == j) + bfactor = 3.0; + else + bfactor = 2.0; + } else + bfactor = 1.0; + } else if (j >= j2) { + const int jjb = idxb_block[j][j2][j1]; + itriple = ((elem3 * nelements + elem2) * nelements + elem1) * + idxb_max + jjb; + if (j2 == j) + bfactor = 2.0; + else + bfactor = 1.0; + } else { + const int jjb = idxb_block[j2][j][j1]; + itriple = ((elem2 * nelements + elem3) * nelements + elem1) * + idxb_max + jjb; + bfactor = 1.0; + } + + if (!bnorm_flag && j1 > j) + bfactor *= (1.0 + j1) / (1.0 + j); + + return bfactor; +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui without storing Zi, looping over zlist indices +------------------------------------------------------------------------- */ + +template +void SNAIntel::compute_zi_or_yi(const SNA_DVEC* beta) +{ + if (COMPUTE_YI) { + memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC)); + memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC)); + } + + double *zlist_rp = (double *)zlist_r; + double *zlist_ip = (double *)zlist_i; + + int zlist_i = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int jjz = 0; jjz < idxz_max; jjz++) { + const int j1 = idxz[jjz].j1; + const int j2 = idxz[jjz].j2; + const int j = idxz[jjz].j; + const int ma1min = idxz[jjz].ma1min; + const int ma2max = idxz[jjz].ma2max; + const int na = idxz[jjz].na; + const int mb1min = idxz[jjz].mb1min; + const int mb2max = idxz[jjz].mb2max; + const int nb = idxz[jjz].nb; + + const double *cgblock = cglist + idxcg_block[j1][j2][j]; + + SNA_DVEC ztmp_r = 0.0; + SNA_DVEC ztmp_i = 0.0; + + const double *u_r = (double *)ulisttot_r; + const double *u_i = (double *)ulisttot_i; + + int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min; + int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max; + jju1 *= vector_width(); + jju2 *= vector_width(); + int icgb = mb1min * (j2 + 1) + mb2max; + for (int ib = 0; ib < nb; ib++) { + + SNA_DVEC suma1_r = 0.0; + SNA_DVEC suma1_i = 0.0; + + int ma1 = ma1min * vector_width(); + int ma2 = ma2max * vector_width(); + int icga = ma1min * (j2 + 1) + ma2max; + + for (int ia = 0; ia < na; ia++) { + const SNA_DVEC u1_r = SIMD_load(u_r + jju1 + ma1); + const SNA_DVEC u2_r = SIMD_load(u_r + jju2 + ma2); + const SNA_DVEC u1_i = SIMD_load(u_i + jju1 + ma1); + const SNA_DVEC u2_i = SIMD_load(u_i + jju2 + ma2); + suma1_r += (u1_r*u2_r - u1_i*u2_i) * cgblock[icga]; + suma1_i += (u1_r*u2_i + u1_i*u2_r) * cgblock[icga]; + ma1+= vector_width(); + ma2-= vector_width(); + icga += j2; + } // end loop over ia + + ztmp_r += suma1_r * cgblock[icgb]; + ztmp_i += suma1_i * cgblock[icgb]; + + jju1 += (j1 + 1) * vector_width(); + jju2 -= (j2 + 1) * vector_width(); + icgb += j2; + } // end loop over ib + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta[jjb] entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + if (bnorm_flag) { + ztmp_i *= SIMD_rcp(SIMD_set(static_cast(j+1))); + ztmp_r *= SIMD_rcp(SIMD_set(static_cast(j+1))); + } + + if (COMPUTE_YI) { + int jju = idxz[jjz].jju; + for (int elem3 = 0; elem3 < nelements; elem3++) { + int itriple; + double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3, + itriple); + const SNA_DVEC betaj = beta[itriple] * bfactor; + const int i = elem3 * idxu_max + jju; + SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r); + SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i); + } + } else { + SIMD_store(zlist_rp + zlist_i, ztmp_r); + SIMD_store(zlist_ip + zlist_i, ztmp_i); + zlist_i += vector_width(); + } + }// end loop over jjz + } +} + +/* ---------------------------------------------------------------------- + compute Yi from Zi +------------------------------------------------------------------------- */ + +void SNAIntel::compute_yi_from_zi(const SNA_DVEC* beta) +{ + memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC)); + memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC)); + + double *zlist_rp = (double *)zlist_r; + double *zlist_ip = (double *)zlist_i; + + int zlist_i = 0; + + for (int elem1 = 0; elem1 < nelements; elem1++) + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int jjz = 0; jjz < idxz_max; jjz++) { + const int j1 = idxz[jjz].j1; + const int j2 = idxz[jjz].j2; + const int j = idxz[jjz].j; + + const SNA_DVEC ztmp_r = SIMD_load(zlist_rp + zlist_i); + const SNA_DVEC ztmp_i = SIMD_load(zlist_ip + zlist_i); + zlist_i += vector_width(); + + int jju = idxz[jjz].jju; + for (int elem3 = 0; elem3 < nelements; elem3++) { + int itriple; + double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3, + itriple); + const SNA_DVEC betaj = beta[itriple] * bfactor; + const int i = elem3 * idxu_max + jju; + SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r); + SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i); + } + } // end loop over jjz + } +} + +/* ---------------------------------------------------------------------- + compute dEidRj +------------------------------------------------------------------------- */ + +void SNAIntel::compute_deidrj_e(const int jj, const SNA_IVEC &jnum, + SNA_DVEC* dedr) +{ + double *ylist_rp = (double *)ylist_r; + double *ylist_ip = (double *)ylist_i; + double *dulist_rp = (double *)(dulist_r[0]); + double *dulist_ip = (double *)(dulist_i[0]); + + for (int k = 0; k < 3; k++) + dedr[k] = SIMD_set(0.0); + + SNA_IVEC jelem; + if (chem_flag) jelem = SIMD_load(element + jj); + else jelem = SIMD256_set(0); + + SIMD_mask m(jj < jnum); + + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + int jju3 = jju * 3; + SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count(); + + for (int mb = 0; 2*mb < j; mb++) + for (int ma = 0; ma <= j; ma++) { + SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i); + SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_add(m, dedr[k], du); + jju3 += vector_width(); + } + i = i + vector_width(); + } + + if (j%2 == 0) { + int mb = j / 2; + for (int ma = 0; ma < mb; ma++) { + SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i); + SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_add(m, dedr[k], du); + jju3 += vector_width(); + } + i = i + vector_width(); + } + + SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i); + SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]); + jju3 += vector_width(); + } + } // if j%2 + } // for j + + for (int k = 0; k < 3; k++) + dedr[k] = dedr[k] * 2.0; +} + +/* ---------------------------------------------------------------------- + compute dEidRj +------------------------------------------------------------------------- */ + +void SNAIntel::compute_deidrj(const int jj, const SNA_IVEC &jnum, + SNA_DVEC* dedr) +{ + double *ylist_rp = (double *)ylist_r; + double *ylist_ip = (double *)ylist_i; + double *dulist_rp = (double *)(dulist_r[0]); + double *dulist_ip = (double *)(dulist_i[0]); + + for (int k = 0; k < 3; k++) + dedr[k] = SIMD_set(0.0); + + SIMD_mask m(jj < jnum); + + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + int jju3 = jju * 3; + + for (int mb = 0; 2*mb < j; mb++) + for (int ma = 0; ma <= j; ma++) { + SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju); + SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_add(m, dedr[k], du); + jju3 += vector_width(); + } + jju += vector_width(); + } + + if (j%2 == 0) { + int mb = j / 2; + for (int ma = 0; ma < mb; ma++) { + SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju); + SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_add(m, dedr[k], du); + jju3 += vector_width(); + } + jju += vector_width(); + } + + SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju); + SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i; + dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]); + jju3 += vector_width(); + } + } // if j%2 + } // for j + + for (int k = 0; k < 3; k++) + dedr[k] = dedr[k] * 2.0; +} + +/* ---------------------------------------------------------------------- + compute Bi by summing conj(Ui)*Zi +------------------------------------------------------------------------- */ + +void SNAIntel::compute_bi(const SNA_IVEC &ielem) { + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + + double *ulisttot_rp = (double *)ulisttot_r; + double *ulisttot_ip = (double *)ulisttot_i; + double *blistp = (double *)blist; + + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) + for (int elem2 = 0; elem2 < nelements; elem2++) { + + double *zlist_rp = (double *)(zlist_r + idouble*idxz_max); + double *zlist_ip = (double *)(zlist_i + idouble*idxz_max); + + for (int elem3 = 0; elem3 < nelements; elem3++) { + for (int jjb = 0; jjb < idxb_max; jjb++) { + const int j1 = idxb[jjb].j1; + const int j2 = idxb[jjb].j2; + const int j = idxb[jjb].j; + + int jjz = idxz_block[j1][j2][j] * vector_width(); + int jju = (elem3 * idxu_max + idxu_block[j]) * vector_width(); + SNA_DVEC sumzu(0.0); + for (int mb = 0; 2 * mb < j; mb++) + for (int ma = 0; ma <= j; ma++) { + const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju); + const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju); + const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz); + const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz); + sumzu = sumzu + utot_r * z_r + utot_i * z_i; + jjz += vector_width(); + jju += vector_width(); + } // end loop over ma, mb + + // For j even, handle middle column + + if (j % 2 == 0) { + int mb = j / 2; + for (int ma = 0; ma < mb; ma++) { + const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju); + const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju); + const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz); + const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz); + sumzu = sumzu + utot_r * z_r + utot_i * z_i; + jjz += vector_width(); + jju += vector_width(); + } + + const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju); + const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju); + const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz); + const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz); + sumzu = sumzu + (utot_r * z_r + utot_i * z_i) * 0.5; + } // end if jeven + + SIMD_store(blistp + (itriple*idxb_max+jjb) * vector_width(), + sumzu * 2.0); + } + itriple++; + } + idouble++; + } + + // apply bzero shift + + if (bzero_flag) { + if (!wselfall_flag) { + SNA_IVEC itriplev = (ielem*nelements+ielem)*nelements+ielem; + for (int jjb = 0; jjb < idxb_max; jjb++) { + const int j = idxb[jjb].j; + SNA_IVEC i = (itriplev*idxb_max+jjb) * vector_width() + SIMD256_count(); + SIMD_scatter(blistp, i, SIMD_gather(blistp, i) - bzero[j]); + } // end loop over JJ + } else { + int itriple = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int elem3 = 0; elem3 < nelements; elem3++) { + for (int jjb = 0; jjb < idxb_max; jjb++) { + const int j = idxb[jjb].j; + int i = (itriple*idxb_max+jjb) * vector_width(); + SIMD_store(blistp + i, SIMD_load(blistp + i) - bzero[j]); + } // end loop over JJ + itriple++; + } // end loop over elem3 + } // end loop over elem1,elem2 + } + } +} + +/* ---------------------------------------------------------------------- + calculate derivative of Ui w.r.t. atom j +------------------------------------------------------------------------- */ + +void SNAIntel::compute_duidrj(const int jj, const SNA_IVEC &jnum) +{ + const SNA_DVEC x = rij[jj][0]; + const SNA_DVEC y = rij[jj][1]; + const SNA_DVEC z = rij[jj][2]; + const SNA_DVEC rcut = rcutij[jj]; + const SNA_DVEC rsq = x * x + y * y + z * z; + const SNA_DVEC r = SIMD_sqrt(rsq); + const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI; + const SNA_DVEC theta0 = (r - rmin0) * rscale0; + const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0)); + const SNA_DVEC dz0dr = z0 * SIMD_rcp(r) - (r*rscale0) * (rsq + z0 * z0) * + SIMD_rcp(rsq); + compute_duarray(x, y, z, z0, r, dz0dr, wj[jj], rcut, jj, jnum); +} + +/* ---------------------------------------------------------------------- */ + +void SNAIntel::zero_uarraytot(const SNA_IVEC &ielem) +{ + double *ulisttot_rp = (double *)ulisttot_r; + double *ulisttot_ip = (double *)ulisttot_i; + for (int jelem = 0; jelem < nelements; jelem++) + for (int j = 0; j <= twojmax; j++) { + int jju = (jelem * idxu_max + idxu_block[j]) * vector_width(); + for (int mb = 0; mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + SIMD_store(ulisttot_rp + jju, SIMD_set(0.0)); + SIMD_store(ulisttot_ip + jju, SIMD_set(0.0)); + + // utot(j,ma,ma) = wself, sometimes + if (ma == mb) { + if (wselfall_flag || nelements == 1) + SIMD_store(ulisttot_rp + jju, SIMD_set(wself)); + else { + SIMD_mask m(ielem == jelem); + SIMD_store(ulisttot_rp + jju, + SIMD_zero_masked(~m, SIMD_set(wself))); + } + } + jju += vector_width(); + } + } + } +} + + + +/* ---------------------------------------------------------------------- + add Wigner U-functions for one neighbor to the total +------------------------------------------------------------------------- */ + +void SNAIntel::add_uarraytot(const SNA_DVEC &r, const int jj, + const SNA_IVEC &jnum) +{ + SNA_DVEC sfac = compute_sfac(r, rcutij[jj], sinnerij[jj], dinnerij[jj]); + sfac *= wj[jj]; + + double *ulisttot_rp = (double *)ulisttot_r; + double *ulisttot_ip = (double *)ulisttot_i; + const double* ulist_r = (double *)(ulist_r_ij[jj]); + const double* ulist_i = (double *)(ulist_i_ij[jj]); + + SIMD_mask m(jj < jnum); + + if (chem_flag && nelements > 1) { + SNA_IVEC jelem = SIMD_load(element+jj); + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count(); + for (int mb = 0; mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + SNA_DVEC utot_r = SIMD_gather(m, ulisttot_rp, i); + SNA_DVEC utot_i = SIMD_gather(m, ulisttot_ip, i); + utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r); + utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i); + SIMD_scatter(m, ulisttot_rp, i, utot_r); + SIMD_scatter(m, ulisttot_ip, i, utot_i); + jju += vector_width(); + i = i + vector_width(); + } + } + } else { + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + for (int mb = 0; mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju); + SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju); + utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r); + utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i); + SIMD_store(ulisttot_rp + jju, utot_r); + SIMD_store(ulisttot_ip + jju, utot_i); + jju += vector_width(); + } + } + } +} + +/* ---------------------------------------------------------------------- + compute Wigner U-functions for one neighbor +------------------------------------------------------------------------- */ + +void SNAIntel::compute_uarray(const SNA_DVEC &x, const SNA_DVEC &y, + const SNA_DVEC &z, const SNA_DVEC &z0, + const SNA_DVEC &r, const int jj, + const SNA_IVEC &jnum) +{ + // compute Cayley-Klein parameters for unit quaternion + + const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0); + const SNA_DVEC a_r = z0 * r0inv; + const SNA_DVEC a_i = -z * r0inv; + const SNA_DVEC b_r = y * r0inv; + const SNA_DVEC b_i = -x * r0inv; + + // VMK Section 4.8.2 + + double *ulist_rp = (double *)(ulist_r_ij[jj]); + double *ulist_ip = (double *)(ulist_i_ij[jj]); + + SIMD_store(ulist_rp, SIMD_set(1.0)); + SIMD_store(ulist_ip, SIMD_set(0.0)); + + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + int jjup = idxu_block[j-1] * vector_width(); + + // fill in left side of matrix layer from previous layer + + for (int mb = 0; 2*mb <= j; mb++) { + SIMD_store(ulist_rp + jju, SIMD_set(0.0)); + SIMD_store(ulist_ip + jju, SIMD_set(0.0)); + + for (int ma = 0; ma < j; ma++) { + double rootpq = rootpqarray[j - ma][j - mb]; + SNA_DVEC u_r = SIMD_load(ulist_rp + jju); + SNA_DVEC u_i = SIMD_load(ulist_ip + jju); + const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup); + const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup); + + SNA_DVEC u_ro, u_io; + + u_ro = a_r * up_r + a_i * up_i; + u_r = SIMD_fma(SIMD_set(rootpq), u_ro, u_r); + SIMD_store(ulist_rp + jju, u_r); + u_io = a_r * up_i - a_i * up_r; + u_i = SIMD_fma(SIMD_set(rootpq), u_io, u_i); + SIMD_store(ulist_ip + jju, u_i); + + jju += vector_width(); + + rootpq = -rootpqarray[ma + 1][j - mb]; + u_r = (b_r * up_r + b_i * up_i) * rootpq; + SIMD_store(ulist_rp + jju, u_r); + u_i = (b_r * up_i - b_i * up_r) * rootpq; + SIMD_store(ulist_ip + jju, u_i); + + jjup += vector_width(); + } + jju += vector_width(); + } + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + jju = idxu_block[j]; + jjup = (jju+(j+1)*(j+1)-1) * vector_width(); + jju *= vector_width(); + int mbpar = 1; + for (int mb = 0; 2*mb <= j; mb++) { + int mapar = mbpar; + for (int ma = 0; ma <= j; ma++) { + if (mapar == 1) { + SIMD_store(ulist_rp + jjup, SIMD_load(ulist_rp + jju)); + SIMD_store(ulist_ip + jjup, -SIMD_load(ulist_ip + jju)); + } else { + SIMD_store(ulist_rp + jjup, -SIMD_load(ulist_rp + jju)); + SIMD_store(ulist_ip + jjup, SIMD_load(ulist_ip + jju)); + } + mapar = -mapar; + jju += vector_width(); + jjup -= vector_width(); + } + mbpar = -mbpar; + } + } +} + +/* ---------------------------------------------------------------------- + Compute derivatives of Wigner U-functions for one neighbor + see comments in compute_uarray() +------------------------------------------------------------------------- */ + +void SNAIntel::compute_duarray(const SNA_DVEC &x, const SNA_DVEC &y, + const SNA_DVEC &z, const SNA_DVEC &z0, + const SNA_DVEC &r, const SNA_DVEC &dz0dr, + const SNA_DVEC &wj, const SNA_DVEC &rcut, + const int jj, const SNA_IVEC &jnum) +{ + const SNA_DVEC rinv = SIMD_rcp(r); + const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0); + SNA_DVEC up[3]; + up[0] = x * rinv; + up[1] = y * rinv; + up[2] = z * rinv; + const SNA_DVEC a_r = z0 * r0inv; + const SNA_DVEC a_i = -z * r0inv; + const SNA_DVEC b_r = y * r0inv; + const SNA_DVEC b_i = -x * r0inv; + const SNA_DVEC dr0invdr = -SIMD_pow(r0inv, 3.0) * (r + z0 * dz0dr); + + SNA_DVEC dr0inv[3], da_r[3], da_i[3]; + for (int k = 0; k < 3; k++) { + dr0inv[k] = dr0invdr * up[k]; + da_r[k] = dz0dr * up[k] * r0inv + z0 * dr0inv[k]; + da_i[k] = -z * dr0inv[k]; + } + da_i[2] += -r0inv; + + double *ulist_rp = (double *)(ulist_r_ij[jj]); + double *ulist_ip = (double *)(ulist_i_ij[jj]); + double *dulist_rp = (double *)(dulist_r[0]); + double *dulist_ip = (double *)(dulist_i[0]); + + SNA_DVEC db_r[3], db_i[3]; + for (int k = 0; k < 3; k++) { + SIMD_store(dulist_rp + k * vector_width(), SIMD_set(0.0)); + SIMD_store(dulist_ip + k * vector_width(), SIMD_set(0.0)); + db_r[k] = y * dr0inv[k]; + db_i[k] = -x * dr0inv[k]; + } + db_i[0] -= r0inv; + db_r[1] += r0inv; + + for (int j = 1; j <= twojmax; j++) { + int jju3 = idxu_block[j] * 3 * vector_width(); + int jjup = idxu_block[j-1] * vector_width(); + int jjup3 = jjup * 3; + for (int mb = 0; 2*mb <= j; mb++) { + for (int k = 0; k < 3; k++) { + SIMD_store(dulist_rp + jju3 + k * vector_width(), SIMD_set(0.0)); + SIMD_store(dulist_ip + jju3 + k * vector_width(), SIMD_set(0.0)); + } + + for (int ma = 0; ma < j; ma++) { + const double rootpq = rootpqarray[j - ma][j - mb]; + const double mrootpq = -rootpqarray[ma + 1][j - mb]; + const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup); + const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = SIMD_load(dulist_rp + jju3); + SNA_DVEC du_i = SIMD_load(dulist_ip + jju3); + const SNA_DVEC dup_r = SIMD_load(dulist_rp + jjup3); + const SNA_DVEC dup_i = SIMD_load(dulist_ip + jjup3); + + SNA_DVEC du_ro, du_io; + + du_ro = (da_r[k]*up_r + da_i[k]*up_i + a_r*dup_r + a_i*dup_i); + du_r = SIMD_fma(SIMD_set(rootpq), du_ro, du_r); + SIMD_store(dulist_rp + jju3, du_r); + + du_io = (da_r[k]*up_i - da_i[k]*up_r + a_r*dup_i - a_i*dup_r); + du_i = SIMD_fma(SIMD_set(rootpq), du_io, du_i); + SIMD_store(dulist_ip + jju3, du_i); + + du_r = (db_r[k]*up_r + db_i[k]*up_i + b_r*dup_r + b_i*dup_i); + SIMD_store(dulist_rp + jju3 + 3 * vector_width(), du_r * mrootpq); + + du_i = (db_r[k]*up_i - db_i[k]*up_r + b_r*dup_i - b_i*dup_r); + SIMD_store(dulist_ip + jju3 + 3 * vector_width(), du_i * mrootpq); + + jju3 += vector_width(); + jjup3 += vector_width(); + } + jjup += vector_width(); + } // for ma + jju3 += 3 * vector_width(); + } // for mb + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + SNA_DVEC *du_r_p = dulist_r[0]; + SNA_DVEC *du_i_p = dulist_i[0]; + + int jju = idxu_block[j]; + jjup = (jju+(j+1)*(j+1)-1) * 3 * vector_width(); + jju *= 3 * vector_width(); + int mbpar = 1; + for (int mb = 0; 2*mb <= j; mb++) { + int mapar = mbpar; + for (int ma = 0; ma <= j; ma++) { + if (mapar == 1) { + for (int k = 0; k < 3; k++) { + SIMD_store(dulist_rp + jjup, SIMD_load(dulist_rp + jju)); + SIMD_store(dulist_ip + jjup, -SIMD_load(dulist_ip + jju)); + jju += vector_width(); + jjup += vector_width(); + } + } else { + for (int k = 0; k < 3; k++) { + SIMD_store(dulist_rp + jjup, -SIMD_load(dulist_rp + jju)); + SIMD_store(dulist_ip + jjup, SIMD_load(dulist_ip + jju)); + jju += vector_width(); + jjup += vector_width(); + } + } + mapar = -mapar; + jjup -= 6 * vector_width(); + } // for ma + mbpar = -mbpar; + } // for mb + } // for j + + SNA_DVEC dsfac; + SNA_DVEC sfac = compute_sfac_dsfac(r, rcut, sinnerij[jj], dinnerij[jj], + dsfac); + sfac = sfac * wj; + dsfac = dsfac * wj; + + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block[j] * vector_width(); + int jju3 = jju * 3; + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + const SNA_DVEC ur_dsfac = dsfac * SIMD_load(ulist_rp + jju); + const SNA_DVEC ui_dsfac = dsfac * SIMD_load(ulist_ip + jju); + jju += vector_width(); + for (int k = 0; k < 3; k++) { + SNA_DVEC du_r = ur_dsfac * up[k] + sfac * SIMD_load(dulist_rp+jju3); + SIMD_store(dulist_rp + jju3, du_r); + SNA_DVEC du_i = ui_dsfac * up[k] + sfac * SIMD_load(dulist_ip+jju3); + SIMD_store(dulist_ip + jju3, du_i); + jju3 += vector_width(); + } + } + } +} + +/* ---------------------------------------------------------------------- + memory usage of arrays +------------------------------------------------------------------------- */ + +double SNAIntel::memory_usage() +{ + int jdimpq = twojmax + 2; + int jdim = twojmax + 1; + double bytes; + + bytes = 0; + + bytes += (double)jdimpq*jdimpq * sizeof(double); // pqarray + bytes += (double)idxcg_max * sizeof(double); // cglist + + bytes += (double)nmax * idxu_max * sizeof(SNA_DVEC) * 2; // ulist_ij + bytes += (double)idxu_max * nelements * sizeof(SNA_DVEC) * 2; // ulisttot + bytes += (double)idxu_max * 3 * sizeof(SNA_DVEC) * 2; // dulist + + bytes += (double)idxz_max * ndoubles * sizeof(SNA_DVEC) * 2; // zlist + bytes += (double)idxb_max * ntriples * sizeof(SNA_DVEC); // blist + bytes += (double)idxb_max * ntriples * 3 * sizeof(double); // dblist + bytes += (double)idxu_max * nelements * sizeof(SNA_DVEC) * 2; // ylist + + bytes += (double)jdim * jdim * jdim * sizeof(int); // idxcg_block + bytes += (double)jdim * sizeof(int); // idxu_block + bytes += (double)jdim * jdim * jdim * sizeof(int); // idxz_block + bytes += (double)jdim * jdim * jdim * sizeof(int); // idxb_block + + bytes += (double)idxz_max * sizeof(SNA_ZINDICES); // idxz + bytes += (double)idxb_max * sizeof(SNA_BINDICES); // idxb + + if (bzero_flag) + bytes += (double)jdim * sizeof(double); // bzero + + bytes += (double)nmax * 3 * sizeof(SNA_DVEC); // rij + bytes += (double)nmax * sizeof(SNA_IVEC); // inside + bytes += (double)nmax * sizeof(SNA_DVEC); // wj + bytes += (double)nmax * sizeof(SNA_DVEC); // rcutij + bytes += (double)nmax * sizeof(SNA_DVEC); // sinnerij + bytes += (double)nmax * sizeof(SNA_DVEC); // dinnerij + if (chem_flag) bytes += (double)nmax * sizeof(SNA_IVEC); // element + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +void SNAIntel::create_twojmax_arrays() +{ + int jdimpq = twojmax + 2; + memory->create(rootpqarray, jdimpq, jdimpq, + "sna:rootpqarray"); + memory->create(cglist, idxcg_max, "sna:cglist"); + memory->create(ulisttot_r, idxu_max*nelements, "sna:ulisttot"); + memory->create(ulisttot_i, idxu_max*nelements, "sna:ulisttot"); + memory->create(dulist_r, idxu_max, 3, "sna:dulist"); + memory->create(dulist_i, idxu_max, 3, "sna:dulist"); + memory->create(zlist_r, idxz_max*ndoubles, "sna:zlist"); + memory->create(zlist_i, idxz_max*ndoubles, "sna:zlist"); + memory->create(blist, idxb_max*ntriples, "sna:blist"); + memory->create(dblist, idxb_max*ntriples, 3, "sna:dblist"); + memory->create(ylist_r, idxu_max*nelements, "sna:ylist"); + memory->create(ylist_i, idxu_max*nelements, "sna:ylist"); + + if (bzero_flag) + memory->create(bzero, twojmax+1,"sna:bzero"); + else + bzero = nullptr; + +} + +/* ---------------------------------------------------------------------- */ + +void SNAIntel::destroy_twojmax_arrays() +{ + memory->destroy(rootpqarray); + memory->destroy(cglist); + memory->destroy(ulisttot_r); + memory->destroy(ulisttot_i); + memory->destroy(dulist_r); + memory->destroy(dulist_i); + memory->destroy(zlist_r); + memory->destroy(zlist_i); + memory->destroy(blist); + memory->destroy(dblist); + memory->destroy(ylist_r); + memory->destroy(ylist_i); + + memory->destroy(idxcg_block); + memory->destroy(idxu_block); + memory->destroy(idxz_block); + memory->destroy(idxb_block); + + if (bzero_flag) + memory->destroy(bzero); + +} + +/* ---------------------------------------------------------------------- + the function delta given by VMK Eq. 8.2(1) +------------------------------------------------------------------------- */ + +double SNAIntel::deltacg(int j1, int j2, int j) +{ + double sfaccg = factorial((j1 + j2 + j) / 2 + 1); + return sqrt(factorial((j1 + j2 - j) / 2) * + factorial((j1 - j2 + j) / 2) * + factorial((-j1 + j2 + j) / 2) / sfaccg); +} + +/* ---------------------------------------------------------------------- + assign Clebsch-Gordan coefficients using + the quasi-binomial formula VMK 8.2.1(3) +------------------------------------------------------------------------- */ + +void SNAIntel::init_clebsch_gordan() +{ + double sum,dcg,sfaccg; + int m, aa2, bb2, cc2; + int ifac; + + int idxcg_count = 0; + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) { + for (int m1 = 0; m1 <= j1; m1++) { + aa2 = 2 * m1 - j1; + + for (int m2 = 0; m2 <= j2; m2++) { + + // -c <= cc <= c + + bb2 = 2 * m2 - j2; + m = (aa2 + bb2 + j) / 2; + + if (m < 0 || m > j) { + cglist[idxcg_count] = 0.0; + idxcg_count++; + continue; + } + + sum = 0.0; + + for (int z = MAX(0, MAX(-(j - j2 + aa2) + / 2, -(j - j1 - bb2) / 2)); + z <= MIN((j1 + j2 - j) / 2, + MIN((j1 - aa2) / 2, (j2 + bb2) / 2)); + z++) { + ifac = z % 2 ? -1 : 1; + sum += ifac / + (factorial(z) * + factorial((j1 + j2 - j) / 2 - z) * + factorial((j1 - aa2) / 2 - z) * + factorial((j2 + bb2) / 2 - z) * + factorial((j - j2 + aa2) / 2 + z) * + factorial((j - j1 - bb2) / 2 + z)); + } + + cc2 = 2 * m - j; + dcg = deltacg(j1, j2, j); + sfaccg = sqrt(factorial((j1 + aa2) / 2) * + factorial((j1 - aa2) / 2) * + factorial((j2 + bb2) / 2) * + factorial((j2 - bb2) / 2) * + factorial((j + cc2) / 2) * + factorial((j - cc2) / 2) * + (j + 1)); + + cglist[idxcg_count] = sum * dcg * sfaccg; + idxcg_count++; + } + } + } +} + +/* ---------------------------------------------------------------------- + print out values of Clebsch-Gordan coefficients + format and notation follows VMK Table 8.11 +------------------------------------------------------------------------- */ + +void SNAIntel::print_clebsch_gordan() +{ + if (comm->me) return; + + int aa2, bb2, cc2; + for (int j = 0; j <= twojmax; j += 1) { + printf("c = %g\n",j/2.0); + printf("a alpha b beta C_{a alpha b beta}^{c alpha+beta}\n"); + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + if (j1-j2 <= j && j1+j2 >= j && (j1+j2+j)%2 == 0) { + int idxcg_count = idxcg_block[j1][j2][j]; + for (int m1 = 0; m1 <= j1; m1++) { + aa2 = 2*m1-j1; + for (int m2 = 0; m2 <= j2; m2++) { + bb2 = 2*m2-j2; + double cgtmp = cglist[idxcg_count]; + cc2 = aa2+bb2; + if (cc2 >= -j && cc2 <= j) + if (j1 != j2 || (aa2 > bb2 && aa2 >= -bb2) || (aa2 == bb2 && aa2 >= 0)) + printf("%4g %4g %4g %4g %10.6g\n", + j1/2.0,aa2/2.0,j2/2.0,bb2/2.0,cgtmp); + idxcg_count++; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + pre-compute table of sqrt[p/m2], p, q = 1,twojmax + the p = 0, q = 0 entries are allocated and skipped for convenience. +------------------------------------------------------------------------- */ + +void SNAIntel::init_rootpqarray() +{ + for (int p = 1; p <= twojmax; p++) + for (int q = 1; q <= twojmax; q++) + rootpqarray[p][q] = sqrt(static_cast(p)/q); +} + +/* ---------------------------------------------------------------------- */ + +void SNAIntel::compute_ncoeff() +{ + int ncount; + + ncount = 0; + + for (int j1 = 0; j1 <= twojmax; j1++) + for (int j2 = 0; j2 <= j1; j2++) + for (int j = j1 - j2; + j <= MIN(twojmax, j1 + j2); j += 2) + if (j >= j1) ncount++; + + ndoubles = nelements*nelements; + ntriples = nelements*nelements*nelements; + if (chem_flag) + ncoeff = ncount*ntriples; + else + ncoeff = ncount; +} + +/* ---------------------------------------------------------------------- */ + +double SNAIntel::compute_sfac(double r, double rcut, double sinner, double dinner) +{ + double sfac; + + // calculate sfac = sfac_outer + + if (switch_flag == 0) sfac = 1.0; + else if (r <= rmin0) sfac = 1.0; + else if (r > rcut) sfac = 0.0; + else { + double rcutfac = MY_PI / (rcut - rmin0); + sfac = 0.5 * (cos((r - rmin0) * rcutfac) + 1.0); + } + + // calculate sfac *= sfac_inner, rarely visited + + if (switch_inner_flag == 1 && r < sinner + dinner) { + if (r > sinner - dinner) { + double rcutfac = MY_PI2 / dinner; + sfac *= 0.5 * (1.0 - cos(MY_PI2 + (r - sinner) * rcutfac)); + } else sfac = 0.0; + } + + return sfac; +} + +/* ---------------------------------------------------------------------- */ + +SNA_DVEC SNAIntel::compute_sfac(const SNA_DVEC &r, const SNA_DVEC &rcut, + const SNA_DVEC &sinner, const SNA_DVEC &dinner) +{ + // calculate sfac = sfac_outer + + // if (switch_flag == 0 || r <= rmin0) + SNA_DVEC sfac = SIMD_set(1.0); + if (switch_flag != 0) { + // r <= rcut && r > rmin0 + const SIMD_mask i(r > rmin0); + const SIMD_mask m(r <= rcut); + const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI; + const SNA_DVEC sfac_m = (SIMD_cos((r - rmin0) * rcutfac) + 1.0) * 0.5; + sfac = SIMD_set(sfac, m & i, sfac_m); + // (r > rcut) && (r> rmin0) + sfac = SIMD_zero_masked(m | i, sfac); + } + + // calculate sfac *= sfac_inner, rarely visited + + if (switch_inner_flag == 1) { + const SIMD_mask m(r < sinner + dinner); + // if any(m) + const SIMD_mask i(r > sinner - dinner); + const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2; + const SNA_DVEC sfac_m = (SIMD_set(1.0) - SIMD_cos((r-sinner) * rcutfac + + MY_PI2)) * 0.5; + sfac = SIMD_set(sfac, m & i, sfac_m); + sfac = SIMD_zero_masked((~m) | i, sfac); + } + + return sfac; +} + +/* ---------------------------------------------------------------------- */ + +SNA_DVEC SNAIntel::compute_sfac_dsfac(const SNA_DVEC & r, + const SNA_DVEC & rcut, + const SNA_DVEC & sinner, + const SNA_DVEC & dinner, + SNA_DVEC &dsfac) +{ + // calculate sfac = sfac_outer + + // if (switch_flag == 0 || r <= rmin0) + SNA_DVEC sfac = SIMD_set(1.0); + dsfac = SIMD_set(0.0); + if (switch_flag != 0) { + // r <= rcut && r > rmin0 + const SIMD_mask i(r > rmin0); + const SIMD_mask m(r <= rcut); + const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI; + const SNA_DVEC trig_arg = (r - rmin0) * rcutfac; + const SNA_DVEC sfac_m = (SIMD_cos(trig_arg) + 1.0) * 0.5; + const SNA_DVEC dsfac_m = SIMD_sin(trig_arg) * rcutfac * -0.5; + sfac = SIMD_set(sfac, m & i, sfac_m); + dsfac = SIMD_set(dsfac, m & i, dsfac_m); + // (r > rcut) && (r> rmin0) + sfac = SIMD_zero_masked(m | i, sfac); + } + + // calculate sfac *= sfac_inner, rarely visited + + if (switch_inner_flag == 1) { + const SIMD_mask m(r < sinner + dinner); + const SIMD_mask i(r > sinner - dinner); + if (any(m & i)) { + const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2; + const SNA_DVEC trig_arg = (r - sinner) * rcutfac + MY_PI2; + const SNA_DVEC sfac_inner = (SIMD_set(1.0) - SIMD_cos(trig_arg)) * 0.5; + const SNA_DVEC dsfac_inner = rcutfac * 0.5 * SIMD_sin(trig_arg); + dsfac = SIMD_set(dsfac, m & i, dsfac * sfac_inner + + sfac * dsfac_inner); + sfac = SIMD_set(sfac, m & i, sfac_inner); + } + sfac = SIMD_zero_masked((~m) | i, sfac); + dsfac = SIMD_zero_masked((~m) | i, dsfac); + } + + return sfac; +} + +template void SNAIntel::compute_zi_or_yi<1>(const SNA_DVEC *); +template void SNAIntel::compute_zi_or_yi<0>(const SNA_DVEC *); + +#endif +#endif diff --git a/src/INTEL/sna_intel.h b/src/INTEL/sna_intel.h new file mode 100644 index 0000000000..7900dee51b --- /dev/null +++ b/src/INTEL/sna_intel.h @@ -0,0 +1,187 @@ +/* -*- c++ -*- ------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: W. Michael Brown, Intel +------------------------------------------------------------------------- */ + +#ifndef LMP_SNA_INTEL_H +#define LMP_SNA_INTEL_H + +#if defined(__AVX512F__) +#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) + +#include "pointers.h" +#include "intel_buffers.h" +#include "intel_simd.h" + +#define SVW 8 + +#if defined(LMP_SIMD_COMPILER) +#if defined(USE_OMP_SIMD) +#define SV_for _Pragma("omp simd") _Pragma("vector aligned") for +#else +#define SV_for _Pragma("simd assert") _Pragma("vector aligned") for +#endif +#else +#define SV_for for +#endif + +namespace LAMMPS_NS { + +struct SNA_ZINDICES { + int j1, j2, j, ma1min, ma2max, mb1min; + int mb2max, na, nb, jju; +}; + +struct SNA_BINDICES { + int j1, j2, j; +}; + +#define SNA_DVEC ip_simd::SIMD_double +#define SNA_IVEC ip_simd::SIMD256_int + +class SNAIntel : protected Pointers { + + public: + SNAIntel(LAMMPS *, double, int, double, int, int, int, int, int, int, int); + + SNAIntel(LAMMPS *lmp) : Pointers(lmp){}; + ~SNAIntel() override; + void build_indexlist(); + void init(); + double memory_usage(); + + int ncoeff; + + inline int vector_width() const { return SVW; } + + // functions for bispectrum coefficients + + void compute_ui(const SNA_IVEC &, const SNA_IVEC &, const int max_jnum); + template void compute_zi_or_yi(const SNA_DVEC *); + void compute_yi_from_zi(const SNA_DVEC *); + void compute_yterm(int, int, int, const double *); + void compute_bi(const SNA_IVEC &); + + // functions for derivatives + + void compute_duidrj(const int, const SNA_IVEC &); + void compute_deidrj_e(const int, const SNA_IVEC &, SNA_DVEC *); + void compute_deidrj(const int, const SNA_IVEC &, SNA_DVEC *); + double compute_sfac(double, double, double, double); + SNA_DVEC compute_sfac(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &, + const SNA_DVEC &); + inline SNA_DVEC compute_sfac_dsfac(const SNA_DVEC &, const SNA_DVEC &, + const SNA_DVEC &, const SNA_DVEC &, + SNA_DVEC &); + + // public bispectrum data + + int twojmax; + SNA_DVEC *blist; + double **dblist; + + // short neighbor list data + + void grow_rij(int); + int nmax; // allocated size of short lists + + SNA_DVEC **rij; // short rij list + SNA_IVEC *inside; // short neighbor list + SNA_DVEC *wj; // short weight list + SNA_DVEC *rcutij; // short cutoff list + + // only allocated for switch_inner_flag=1 + + SNA_DVEC *sinnerij; // short inner cutoff midpoint list + SNA_DVEC *dinnerij; // short inner half-width list + + // only allocated for chem_flag=1 + + SNA_IVEC *element; // short element list [0,nelements) + + private: + double rmin0, rfac0; + + // data for bispectrum coefficients + + SNA_ZINDICES *idxz; + SNA_BINDICES *idxb; + + double **rootpqarray; + double *cglist; + int ***idxcg_block; + + SNA_DVEC *ulisttot_r, *ulisttot_i; + SNA_DVEC **ulist_r_ij, **ulist_i_ij; // short u list + int *idxu_block; + + SNA_DVEC *zlist_r, *zlist_i; + int ***idxz_block; + + int ***idxb_block; + + SNA_DVEC **dulist_r, **dulist_i; + + SNA_DVEC *ylist_r, *ylist_i; + int idxcg_max, idxu_max, idxz_max, idxb_max; + + void create_twojmax_arrays(); + void destroy_twojmax_arrays(); + void init_clebsch_gordan(); + void print_clebsch_gordan(); + void init_rootpqarray(); + void zero_uarraytot(const SNA_IVEC &); + void add_uarraytot(const SNA_DVEC &, const int, const SNA_IVEC &); + void compute_uarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &, + const SNA_DVEC &, const SNA_DVEC &, const int, + const SNA_IVEC &); + double deltacg(int, int, int); + void compute_ncoeff(); + void compute_duarray(const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &, + const SNA_DVEC &, const SNA_DVEC &, const SNA_DVEC &, + const SNA_DVEC &, const SNA_DVEC &, int, + const SNA_IVEC &); + inline double choose_beta(const int, const int, const int, + const int, const int, const int, int &); + + // Sets the style for the switching function + // 0 = none + // 1 = cosine + int switch_flag; + + // Sets the style for the inner switching function + // 0 = none + // 1 = cosine + int switch_inner_flag; + + // Self-weight + double wself; + + int bzero_flag; // 1 if bzero subtracted from barray + double *bzero; // array of B values for isolated atoms + int bnorm_flag; // 1 if barray divided by j+1 + int chem_flag; // 1 for multi-element bispectrum components + int wselfall_flag; // 1 for adding wself to all element labelings + int nelements; // number of elements + int ndoubles; // number of multi-element pairs + int ntriples; // number of multi-element triplets +}; + +} // namespace LAMMPS_NS + +#endif +#endif + +#endif