lammps/src/INTEL/sna_intel.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   LAMMPS development team: developers@lammps.org

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: W. Michael Brown, Intel
------------------------------------------------------------------------- */

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)

#include "sna_intel.h"

#include "comm.h"
#include "error.h"
#include "math_const.h"
#include "math_special.h"
#include "memory.h"

#include <cmath>

using namespace std;
using namespace LAMMPS_NS;
using namespace MathConst;
using namespace MathSpecial;
using namespace ip_simd;

/* ----------------------------------------------------------------------

   this implementation is based on the method outlined
   in Bartok[1], using formulae from VMK[2].

   for the Clebsch-Gordan coefficients, we
   convert the VMK half-integral labels
   a, b, c, alpha, beta, gamma
   to array offsets j1, j2, j, m1, m2, m
   using the following relations:

   j1 = 2*a
   j2 = 2*b
   j =  2*c

   m1 = alpha+a      2*alpha = 2*m1 - j1
   m2 = beta+b    or 2*beta = 2*m2 - j2
   m =  gamma+c      2*gamma = 2*m - j

   in this way:

   -a <= alpha <= a
   -b <= beta <= b
   -c <= gamma <= c

   becomes:

   0 <= m1 <= j1
   0 <= m2 <= j2
   0 <= m <= j

   and the requirement that
   a+b+c be integral implies that
   j1+j2+j must be even.
   The requirement that:

   gamma = alpha+beta

   becomes:

   2*m - j = 2*m1 - j1 + 2*m2 - j2

   Similarly, for the Wigner U-functions U(J,m,m') we
   convert the half-integral labels J,m,m' to
   array offsets j,ma,mb:

   j = 2*J
   ma = J+m
   mb = J+m'

   so that:

   0 <= j <= 2*Jmax
   0 <= ma, mb <= j.

   For the bispectrum components B(J1,J2,J) we convert to:

   j1 = 2*J1
   j2 = 2*J2
   j = 2*J

   and the requirement:

   |J1-J2| <= J <= J1+J2, for J1+J2+J integral

   becomes:

   |j1-j2| <= j <= j1+j2, for j1+j2+j even integer

   or

   j = |j1-j2|, |j1-j2|+2,...,j1+j2-2,j1+j2

   [1] Albert Bartok-Partay, "Gaussian Approximation..."
   Doctoral Thesis, Cambridge University, (2009)

   [2] D. A. Varshalovich, A. N. Moskalev, and V. K. Khersonskii,
   "Quantum Theory of Angular Momentum," World Scientific (1988)

------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   store the SNAP settings, build the index lists and allocate the
   twojmax-dependent arrays; the per-neighbor arrays stay unallocated
   (nullptr, nmax = 0) until grow_rij() is called
------------------------------------------------------------------------- */

SNAIntel::SNAIntel(LAMMPS* lmp, double rfac0_in, int twojmax_in,
                   double rmin0_in, int switch_flag_in, int bzero_flag_in,
                   int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in,
                   int nelements_in, int switch_inner_flag_in) : Pointers(lmp)
{
  wself = 1.0;

  rfac0 = rfac0_in;
  rmin0 = rmin0_in;
  switch_flag = switch_flag_in;
  switch_inner_flag = switch_inner_flag_in;
  bzero_flag = bzero_flag_in;
  chem_flag = chem_flag_in;
  bnorm_flag = bnorm_flag_in;
  wselfall_flag = wselfall_flag_in;

  // adjacent string literals are concatenated verbatim, so the separating
  // blank between the two sentences must be part of the first literal

  if (bnorm_flag != chem_flag)
    lmp->error->warning(FLERR, "bnormflag and chemflag are not equal. "
                        "This is probably not what you intended");

  // without chemistry there is a single element channel

  if (chem_flag)
    nelements = nelements_in;
  else
    nelements = 1;

  twojmax = twojmax_in;

  compute_ncoeff();

  // per-neighbor arrays are allocated on demand by grow_rij()

  rij = nullptr;
  inside = nullptr;
  wj = nullptr;
  rcutij = nullptr;
  sinnerij = nullptr;
  dinnerij = nullptr;
  element = nullptr;
  nmax = 0;
  idxz = nullptr;
  idxb = nullptr;
  ulist_r_ij = nullptr;
  ulist_i_ij = nullptr;

  // index lists must exist before the twojmax arrays are created

  build_indexlist();
  create_twojmax_arrays();

  // bzero[j] is the self-contribution later subtracted in compute_bi()

  if (bzero_flag) {
    double www = wself*wself*wself;
    for (int j = 0; j <= twojmax; j++)
      if (bnorm_flag)
        bzero[j] = www;
      else
        bzero[j] = www*(j+1);
  }

}

/* ----------------------------------------------------------------------
   release the index tables, the per-neighbor work arrays and the
   twojmax-dependent arrays
------------------------------------------------------------------------- */

SNAIntel::~SNAIntel()
{
  // index tables allocated with new[] in build_indexlist()

  delete[] idxb;
  delete[] idxz;

  // per-neighbor arrays allocated in grow_rij()

  memory->destroy(ulist_i_ij);
  memory->destroy(ulist_r_ij);
  if (chem_flag) memory->destroy(element);
  memory->destroy(dinnerij);
  memory->destroy(sinnerij);
  memory->destroy(rcutij);
  memory->destroy(wj);
  memory->destroy(inside);
  memory->destroy(rij);

  destroy_twojmax_arrays();
}

void SNAIntel::build_indexlist()
{

  // index list for cglist

  int jdim = twojmax + 1;
  memory->create(idxcg_block, jdim, jdim, jdim,
                 "sna:idxcg_block");

  int idxcg_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
        idxcg_block[j1][j2][j] = idxcg_count;
        for (int m1 = 0; m1 <= j1; m1++)
          for (int m2 = 0; m2 <= j2; m2++)
            idxcg_count++;
      }
  idxcg_max = idxcg_count;

  // index list for uarray
  // need to include both halves

  memory->create(idxu_block, jdim,
                 "sna:idxu_block");

  int idxu_count = 0;

  for (int j = 0; j <= twojmax; j++) {
    idxu_block[j] = idxu_count;
    for (int mb = 0; mb <= j; mb++)
      for (int ma = 0; ma <= j; ma++)
        idxu_count++;
  }
  idxu_max = idxu_count;

  // index list for beta and B

  int idxb_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
        if (j >= j1) idxb_count++;

  idxb_max = idxb_count;
  idxb = new SNA_BINDICES[idxb_max];

  idxb_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
        if (j >= j1) {
          idxb[idxb_count].j1 = j1;
          idxb[idxb_count].j2 = j2;
          idxb[idxb_count].j = j;
          idxb_count++;
        }

  // reverse index list for beta and b

  memory->create(idxb_block, jdim, jdim, jdim,
                 "sna:idxb_block");
  idxb_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
        if (j >= j1) {
          idxb_block[j1][j2][j] = idxb_count;
          idxb_count++;
        }
      }

  // index list for zlist

  int idxz_count = 0;

  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2)
        for (int mb = 0; 2*mb <= j; mb++)
          for (int ma = 0; ma <= j; ma++)
            idxz_count++;

  idxz_max = idxz_count;
  idxz = new SNA_ZINDICES[idxz_max];

  memory->create(idxz_block, jdim, jdim, jdim,
                 "sna:idxz_block");

  idxz_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {
        idxz_block[j1][j2][j] = idxz_count;

        // find right beta[jjb] entry
        // multiply and divide by j+1 factors
        // account for multiplicity of 1, 2, or 3

        for (int mb = 0; 2*mb <= j; mb++)
          for (int ma = 0; ma <= j; ma++) {
            idxz[idxz_count].j1 = j1;
            idxz[idxz_count].j2 = j2;
            idxz[idxz_count].j = j;
            idxz[idxz_count].ma1min = MAX(0, (2 * ma - j - j2 + j1) / 2);
            idxz[idxz_count].ma2max = (2 * ma - j - (2 * idxz[idxz_count].ma1min - j1) + j2) / 2;
            idxz[idxz_count].na = MIN(j1, (2 * ma - j + j2 + j1) / 2) - idxz[idxz_count].ma1min + 1;
            idxz[idxz_count].mb1min = MAX(0, (2 * mb - j - j2 + j1) / 2);
            idxz[idxz_count].mb2max = (2 * mb - j - (2 * idxz[idxz_count].mb1min - j1) + j2) / 2;
            idxz[idxz_count].nb = MIN(j1, (2 * mb - j + j2 + j1) / 2) - idxz[idxz_count].mb1min + 1;
            // apply to z(j1,j2,j,ma,mb) to unique element of y(j)

            const int jju = idxu_block[j] + (j+1)*mb + ma;
            idxz[idxz_count].jju = jju;

            idxz_count++;
          }
      }
}

/* ----------------------------------------------------------------------
   run the one-time table initializers: init_clebsch_gordan() followed
   by init_rootpqarray()
------------------------------------------------------------------------- */

void SNAIntel::init()
{
  init_clebsch_gordan();

  // debugging aid, left disabled:
  //   print_clebsch_gordan();

  init_rootpqarray();
}

/* ----------------------------------------------------------------------
   make sure the per-neighbor arrays hold at least newnmax entries;
   the arrays are reallocated, their previous contents are not kept
------------------------------------------------------------------------- */

void SNAIntel::grow_rij(int newnmax)
{
  // nothing to do while the current capacity is sufficient

  if (nmax >= newnmax) return;

  nmax = newnmax;

  // each array is released and then re-created at the new size

  memory->destroy(rij);
  memory->create(rij, nmax, 3, "pair:rij");

  memory->destroy(inside);
  memory->create(inside, nmax, "pair:inside");

  memory->destroy(wj);
  memory->create(wj, nmax, "pair:wj");

  memory->destroy(rcutij);
  memory->create(rcutij, nmax, "pair:rcutij");

  memory->destroy(sinnerij);
  memory->create(sinnerij, nmax, "pair:sinnerij");

  memory->destroy(dinnerij);
  memory->create(dinnerij, nmax, "pair:dinnerij");

  // element types are only tracked when chemistry is enabled

  if (chem_flag) {
    memory->destroy(element);
    memory->create(element, nmax, "sna:element");
  }

  // one full set of U entries (idxu_max) per neighbor slot

  memory->destroy(ulist_r_ij);
  memory->create(ulist_r_ij, nmax, idxu_max, "sna:ulist_ij");

  memory->destroy(ulist_i_ij);
  memory->create(ulist_i_ij, nmax, idxu_max, "sna:ulist_ij");
}

/* ----------------------------------------------------------------------
   compute Ui by summing over neighbors j
------------------------------------------------------------------------- */

void SNAIntel::compute_ui(const SNA_IVEC &jnum, const SNA_IVEC &ielem,
                          const int max_jnum)
{
  // utot(j,ma,mb) = 0 for all j,ma,ma
  // utot(j,ma,ma) = 1 for all j,ma
  // for j in neighbors of i:
  //   compute r0 = (x,y,z,z0)
  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb

  zero_uarraytot(ielem);

  for (int j = 0; j < max_jnum; j++) {
    const SNA_DVEC x = rij[j][0];
    const SNA_DVEC y = rij[j][1];
    const SNA_DVEC z = rij[j][2];
    const SNA_DVEC rcut = rcutij[j];
    const SNA_DVEC rsq = x * x + y * y + z * z;
    const SNA_DVEC r = SIMD_sqrt(rsq);
    const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI;
    const SNA_DVEC theta0 = (r - rmin0) * rscale0;
    const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0));

    compute_uarray(x, y, z, z0, r, j, jnum);
    add_uarraytot(r, j, jnum);
  }

}

/* ----------------------------------------------------------------------
   pick out right beta value

   sets itriple to the offset of the beta entry that belongs to the
   (j1,j2,j) triple and the (elem1,elem2,elem3) element triple, and
   returns the factor that beta has to be multiplied with
------------------------------------------------------------------------- */

double SNAIntel::choose_beta(const int j, const int j1, const int j2,
                             const int elem1, const int elem2, const int elem3,
                             int &itriple)
{
  // ea,eb,ec: element triple reordered to match the idxb_block lookup
  // mult: multiplicity of 1, 2, or 3

  int ea, eb, ec, jjb;
  double mult = 1.0;

  if (j >= j1) {
    jjb = idxb_block[j1][j2][j];
    ea = elem1;
    eb = elem2;
    ec = elem3;
    if (j1 == j) mult = (j2 == j) ? 3.0 : 2.0;
  } else if (j >= j2) {
    jjb = idxb_block[j][j2][j1];
    ea = elem3;
    eb = elem2;
    ec = elem1;
    if (j2 == j) mult = 2.0;
  } else {
    jjb = idxb_block[j2][j][j1];
    ea = elem2;
    eb = elem3;
    ec = elem1;
  }

  itriple = ((ea * nelements + eb) * nelements + ec) * idxb_max + jjb;

  // without bnorm, rescale by the ratio of the (j+1) factors

  if (!bnorm_flag && j1 > j)
    mult *= (1.0 + j1) / (1.0 + j);

  return mult;
}

/* ----------------------------------------------------------------------
   compute Yi from Ui without storing Zi, looping over zlist indices
------------------------------------------------------------------------- */

template <int COMPUTE_YI>
void SNAIntel::compute_zi_or_yi(const SNA_DVEC* beta)
{
  if (COMPUTE_YI) {
    memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC));
    memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC));
  }

  double *zlist_rp = (double *)zlist_r;
  double *zlist_ip = (double *)zlist_i;

  int zlist_i = 0;

  for (int elem1 = 0; elem1 < nelements; elem1++)
    for (int elem2 = 0; elem2 < nelements; elem2++) {
      for (int jjz = 0; jjz < idxz_max; jjz++) {
        const int j1 = idxz[jjz].j1;
        const int j2 = idxz[jjz].j2;
        const int j = idxz[jjz].j;
        const int ma1min = idxz[jjz].ma1min;
        const int ma2max = idxz[jjz].ma2max;
        const int na = idxz[jjz].na;
        const int mb1min = idxz[jjz].mb1min;
        const int mb2max = idxz[jjz].mb2max;
        const int nb = idxz[jjz].nb;

        const double *cgblock = cglist + idxcg_block[j1][j2][j];

        SNA_DVEC ztmp_r = 0.0;
        SNA_DVEC ztmp_i = 0.0;

        const double *u_r = (double *)ulisttot_r;
        const double *u_i = (double *)ulisttot_i;

        int jju1 = elem1 * idxu_max + idxu_block[j1] + (j1 + 1) * mb1min;
        int jju2 = elem2 * idxu_max + idxu_block[j2] + (j2 + 1) * mb2max;
        jju1 *= vector_width();
        jju2 *= vector_width();
        int icgb = mb1min * (j2 + 1) + mb2max;
        for (int ib = 0; ib < nb; ib++) {

          SNA_DVEC suma1_r = 0.0;
          SNA_DVEC suma1_i = 0.0;

          int ma1 = ma1min * vector_width();
          int ma2 = ma2max * vector_width();
          int icga = ma1min * (j2 + 1) + ma2max;

          for (int ia = 0; ia < na; ia++) {
            const SNA_DVEC u1_r = SIMD_load(u_r + jju1 + ma1);
            const SNA_DVEC u2_r = SIMD_load(u_r + jju2 + ma2);
            const SNA_DVEC u1_i = SIMD_load(u_i + jju1 + ma1);
            const SNA_DVEC u2_i = SIMD_load(u_i + jju2 + ma2);
            suma1_r += (u1_r*u2_r - u1_i*u2_i) * cgblock[icga];
            suma1_i += (u1_r*u2_i + u1_i*u2_r) * cgblock[icga];
            ma1+= vector_width();
            ma2-= vector_width();
            icga += j2;
          } // end loop over ia

          ztmp_r += suma1_r * cgblock[icgb];
          ztmp_i += suma1_i * cgblock[icgb];

          jju1 += (j1 + 1) * vector_width();
          jju2 -= (j2 + 1) * vector_width();
          icgb += j2;
        } // end loop over ib

        // apply to z(j1,j2,j,ma,mb) to unique element of y(j)
        // find right y_list[jju] and beta[jjb] entries
        // multiply and divide by j+1 factors
        // account for multiplicity of 1, 2, or 3

        if (bnorm_flag) {
          ztmp_i *= SIMD_rcp(SIMD_set(static_cast<double>(j+1)));
          ztmp_r *= SIMD_rcp(SIMD_set(static_cast<double>(j+1)));
        }

        if (COMPUTE_YI) {
          int jju = idxz[jjz].jju;
          for (int elem3 = 0; elem3 < nelements; elem3++) {
            int itriple;
            double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3,
                                         itriple);
            const SNA_DVEC betaj = beta[itriple] * bfactor;
            const int i = elem3 * idxu_max + jju;
            SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r);
            SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i);
          }
        } else {
          SIMD_store(zlist_rp + zlist_i, ztmp_r);
          SIMD_store(zlist_ip + zlist_i, ztmp_i);
          zlist_i += vector_width();
        }
      }// end loop over jjz
    }
}

/* ----------------------------------------------------------------------
   compute Yi from Zi
------------------------------------------------------------------------- */

void SNAIntel::compute_yi_from_zi(const SNA_DVEC* beta)
{
  memset(ylist_r,0,idxu_max*nelements*sizeof(SNA_DVEC));
  memset(ylist_i,0,idxu_max*nelements*sizeof(SNA_DVEC));

  double *zlist_rp = (double *)zlist_r;
  double *zlist_ip = (double *)zlist_i;

  int zlist_i = 0;

  for (int elem1 = 0; elem1 < nelements; elem1++)
    for (int elem2 = 0; elem2 < nelements; elem2++) {
      for (int jjz = 0; jjz < idxz_max; jjz++) {
        const int j1 = idxz[jjz].j1;
        const int j2 = idxz[jjz].j2;
        const int j = idxz[jjz].j;

        const SNA_DVEC ztmp_r = SIMD_load(zlist_rp + zlist_i);
        const SNA_DVEC ztmp_i = SIMD_load(zlist_ip + zlist_i);
        zlist_i += vector_width();

        int jju = idxz[jjz].jju;
        for (int elem3 = 0; elem3 < nelements; elem3++) {
          int itriple;
          double bfactor = choose_beta(j, j1, j2, elem1, elem2, elem3,
                                       itriple);
          const SNA_DVEC betaj = beta[itriple] * bfactor;
          const int i = elem3 * idxu_max + jju;
          SIMD_store(&(ylist_r[i]), SIMD_load(ylist_r + i) + betaj * ztmp_r);
          SIMD_store(&(ylist_i[i]), SIMD_load(ylist_i + i) + betaj * ztmp_i);
        }
      } // end loop over jjz
    }
}

/* ----------------------------------------------------------------------
   compute dEidRj
------------------------------------------------------------------------- */

void SNAIntel::compute_deidrj_e(const int jj, const SNA_IVEC &jnum,
                                SNA_DVEC* dedr)
{
  double *ylist_rp = (double *)ylist_r;
  double *ylist_ip = (double *)ylist_i;
  double *dulist_rp = (double *)(dulist_r[0]);
  double *dulist_ip = (double *)(dulist_i[0]);

  for (int k = 0; k < 3; k++)
    dedr[k] = SIMD_set(0.0);

  SNA_IVEC jelem;
  if (chem_flag) jelem = SIMD_load(element + jj);
  else jelem = SIMD256_set(0);

  SIMD_mask m(jj < jnum);

  for (int j = 0; j <= twojmax; j++) {
    int jju = idxu_block[j] * vector_width();
    int jju3 = jju * 3;
    SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count();

    for (int mb = 0; 2*mb < j; mb++)
      for (int ma = 0; ma <= j; ma++) {
        SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
        SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
          dedr[k] = SIMD_add(m, dedr[k], du);
          jju3 += vector_width();
        }
        i = i + vector_width();
      }

    if (j%2 == 0) {
      int mb = j / 2;
      for (int ma = 0; ma < mb; ma++) {
        SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
        SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
          dedr[k] = SIMD_add(m, dedr[k], du);
          jju3 += vector_width();
        }
        i = i + vector_width();
      }

      SNA_DVEC jjjmambyarray_r = SIMD_gather(m, ylist_rp, i);
      SNA_DVEC jjjmambyarray_i = SIMD_gather(m, ylist_ip, i);
      for (int k = 0; k < 3; k++) {
        SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
        SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
        SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
        dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]);
        jju3 += vector_width();
      }
    } // if j%2
  } // for j

  for (int k = 0; k < 3; k++)
    dedr[k] = dedr[k] * 2.0;
}

/* ----------------------------------------------------------------------
   compute dEidRj
------------------------------------------------------------------------- */

void SNAIntel::compute_deidrj(const int jj, const SNA_IVEC &jnum,
                              SNA_DVEC* dedr)
{
  double *ylist_rp = (double *)ylist_r;
  double *ylist_ip = (double *)ylist_i;
  double *dulist_rp = (double *)(dulist_r[0]);
  double *dulist_ip = (double *)(dulist_i[0]);

  for (int k = 0; k < 3; k++)
    dedr[k] = SIMD_set(0.0);

  SIMD_mask m(jj < jnum);

  for (int j = 0; j <= twojmax; j++) {
    int jju = idxu_block[j] * vector_width();
    int jju3 = jju * 3;

    for (int mb = 0; 2*mb < j; mb++)
      for (int ma = 0; ma <= j; ma++) {
        SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
        SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
          dedr[k] = SIMD_add(m, dedr[k], du);
          jju3 += vector_width();
        }
        jju += vector_width();
      }

    if (j%2 == 0) {
      int mb = j / 2;
      for (int ma = 0; ma < mb; ma++) {
        SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
        SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
          SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
          dedr[k] = SIMD_add(m, dedr[k], du);
          jju3 += vector_width();
        }
        jju += vector_width();
      }

      SNA_DVEC jjjmambyarray_r = SIMD_load(ylist_rp + jju);
      SNA_DVEC jjjmambyarray_i = SIMD_load(ylist_ip + jju);
      for (int k = 0; k < 3; k++) {
        SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
        SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
        SNA_DVEC du = du_r * jjjmambyarray_r + du_i * jjjmambyarray_i;
        dedr[k] = SIMD_fma(m, SIMD_set(0.5), du, dedr[k]);
        jju3 += vector_width();
      }
    } // if j%2
  } // for j

  for (int k = 0; k < 3; k++)
    dedr[k] = dedr[k] * 2.0;
}

/* ----------------------------------------------------------------------
   compute Bi by summing conj(Ui)*Zi
------------------------------------------------------------------------- */

void SNAIntel::compute_bi(const SNA_IVEC &ielem) {
  // for j1 = 0,...,twojmax
  //   for j2 = 0,twojmax
  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
  //        b(j1,j2,j) = 0
  //        for mb = 0,...,jmid
  //          for ma = 0,...,j
  //            b(j1,j2,j) +=
  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)

  double *ulisttot_rp = (double *)ulisttot_r;
  double *ulisttot_ip = (double *)ulisttot_i;
  double *blistp = (double *)blist;

  int itriple = 0;
  int idouble = 0;
  for (int elem1 = 0; elem1 < nelements; elem1++)
    for (int elem2 = 0; elem2 < nelements; elem2++) {

      double *zlist_rp = (double *)(zlist_r + idouble*idxz_max);
      double *zlist_ip = (double *)(zlist_i + idouble*idxz_max);

      for (int elem3 = 0; elem3 < nelements; elem3++) {
        for (int jjb = 0; jjb < idxb_max; jjb++) {
          const int j1 = idxb[jjb].j1;
          const int j2 = idxb[jjb].j2;
          const int j = idxb[jjb].j;

          int jjz = idxz_block[j1][j2][j] * vector_width();
          int jju = (elem3 * idxu_max + idxu_block[j]) * vector_width();
          SNA_DVEC sumzu(0.0);
          for (int mb = 0; 2 * mb < j; mb++)
            for (int ma = 0; ma <= j; ma++) {
              const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
              const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
              const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
              const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
              sumzu = sumzu + utot_r * z_r + utot_i * z_i;
              jjz += vector_width();
              jju += vector_width();
            } // end loop over ma, mb

          // For j even, handle middle column

          if (j % 2 == 0) {
            int mb = j / 2;
            for (int ma = 0; ma < mb; ma++) {
              const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
              const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
              const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
              const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
              sumzu = sumzu + utot_r * z_r + utot_i * z_i;
              jjz += vector_width();
              jju += vector_width();
            }

            const SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
            const SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
            const SNA_DVEC z_r = SIMD_load(zlist_rp + jjz);
            const SNA_DVEC z_i = SIMD_load(zlist_ip + jjz);
            sumzu = sumzu + (utot_r * z_r + utot_i * z_i) * 0.5;
          } // end if jeven

          SIMD_store(blistp + (itriple*idxb_max+jjb) * vector_width(),
                     sumzu * 2.0);
        }
        itriple++;
      }
      idouble++;
    }

  // apply bzero shift

  if (bzero_flag) {
    if (!wselfall_flag) {
      SNA_IVEC itriplev = (ielem*nelements+ielem)*nelements+ielem;
      for (int jjb = 0; jjb < idxb_max; jjb++) {
        const int j = idxb[jjb].j;
        SNA_IVEC i = (itriplev*idxb_max+jjb) * vector_width() + SIMD256_count();
        SIMD_scatter(blistp, i, SIMD_gather(blistp, i) - bzero[j]);
      } // end loop over JJ
    } else {
      int itriple = 0;
      for (int elem1 = 0; elem1 < nelements; elem1++)
        for (int elem2 = 0; elem2 < nelements; elem2++) {
          for (int elem3 = 0; elem3 < nelements; elem3++) {
            for (int jjb = 0; jjb < idxb_max; jjb++) {
              const int j = idxb[jjb].j;
              int i = (itriple*idxb_max+jjb) * vector_width();
              SIMD_store(blistp + i, SIMD_load(blistp + i) - bzero[j]);
            } // end loop over JJ
            itriple++;
          } // end loop over elem3
        } // end loop over elem1,elem2
    }
  }
}

/* ----------------------------------------------------------------------
   calculate derivative of Ui w.r.t. atom j
------------------------------------------------------------------------- */

void SNAIntel::compute_duidrj(const int jj, const SNA_IVEC &jnum)
{
  const SNA_DVEC x = rij[jj][0];
  const SNA_DVEC y = rij[jj][1];
  const SNA_DVEC z = rij[jj][2];
  const SNA_DVEC rcut = rcutij[jj];
  const SNA_DVEC rsq = x * x + y * y + z * z;
  const SNA_DVEC r = SIMD_sqrt(rsq);
  const SNA_DVEC rscale0 = SIMD_rcp(rcut - rmin0) * rfac0 * MY_PI;
  const SNA_DVEC theta0 = (r - rmin0) * rscale0;
  const SNA_DVEC z0 = r * SIMD_rcp(SIMD_tan(theta0));
  const SNA_DVEC dz0dr = z0 * SIMD_rcp(r) - (r*rscale0) * (rsq + z0 * z0) *
    SIMD_rcp(rsq);
  compute_duarray(x, y, z, z0, r, dz0dr, wj[jj], rcut, jj, jnum);
}

/* ---------------------------------------------------------------------- */

void SNAIntel::zero_uarraytot(const SNA_IVEC &ielem)
{
  double *ulisttot_rp = (double *)ulisttot_r;
  double *ulisttot_ip = (double *)ulisttot_i;
  for (int jelem = 0; jelem < nelements; jelem++)
    for (int j = 0; j <= twojmax; j++) {
      int jju = (jelem * idxu_max + idxu_block[j]) * vector_width();
      for (int mb = 0; mb <= j; mb++) {
        for (int ma = 0; ma <= j; ma++) {
          SIMD_store(ulisttot_rp + jju, SIMD_set(0.0));
          SIMD_store(ulisttot_ip + jju, SIMD_set(0.0));

          // utot(j,ma,ma) = wself, sometimes
          if (ma == mb) {
            if (wselfall_flag || nelements == 1)
              SIMD_store(ulisttot_rp + jju, SIMD_set(wself));
            else {
              SIMD_mask m(ielem == jelem);
              SIMD_store(ulisttot_rp + jju,
                         SIMD_zero_masked(~m, SIMD_set(wself)));
            }
          }
          jju += vector_width();
        }
      }
    }
}


/* ----------------------------------------------------------------------
   add Wigner U-functions for one neighbor to the total
------------------------------------------------------------------------- */

void SNAIntel::add_uarraytot(const SNA_DVEC &r, const int jj,
                             const SNA_IVEC &jnum)
{
  SNA_DVEC sfac = compute_sfac(r, rcutij[jj], sinnerij[jj], dinnerij[jj]);
  sfac *= wj[jj];

  double *ulisttot_rp = (double *)ulisttot_r;
  double *ulisttot_ip = (double *)ulisttot_i;
  const double* ulist_r = (double *)(ulist_r_ij[jj]);
  const double* ulist_i = (double *)(ulist_i_ij[jj]);

  SIMD_mask m(jj < jnum);

  if (chem_flag && nelements > 1) {
    SNA_IVEC jelem = SIMD_load(element+jj);
    for (int j = 0; j <= twojmax; j++) {
      int jju = idxu_block[j] * vector_width();
      SNA_IVEC i = jelem*idxu_max*vector_width() + jju + SIMD256_count();
      for (int mb = 0; mb <= j; mb++)
        for (int ma = 0; ma <= j; ma++) {
          SNA_DVEC utot_r = SIMD_gather(m, ulisttot_rp, i);
          SNA_DVEC utot_i = SIMD_gather(m, ulisttot_ip, i);
          utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r);
          utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i);
          SIMD_scatter(m, ulisttot_rp, i, utot_r);
          SIMD_scatter(m, ulisttot_ip, i, utot_i);
          jju += vector_width();
          i = i + vector_width();
        }
    }
  } else {
    for (int j = 0; j <= twojmax; j++) {
      int jju = idxu_block[j] * vector_width();
      for (int mb = 0; mb <= j; mb++)
        for (int ma = 0; ma <= j; ma++) {
          SNA_DVEC utot_r = SIMD_load(ulisttot_rp + jju);
          SNA_DVEC utot_i = SIMD_load(ulisttot_ip + jju);
          utot_r = SIMD_fma(m, sfac, SIMD_load(ulist_r + jju), utot_r);
          utot_i = SIMD_fma(m, sfac, SIMD_load(ulist_i + jju), utot_i);
          SIMD_store(ulisttot_rp + jju, utot_r);
          SIMD_store(ulisttot_ip + jju, utot_i);
          jju += vector_width();
        }
    }
  }
}

/* ----------------------------------------------------------------------
   compute Wigner U-functions for one neighbor

   x,y,z,z0,r : components and length of the neighbor vector plus z0
   jj         : neighbor slot; results go to ulist_r_ij[jj]/ulist_i_ij[jj]
   jnum       : not referenced in this routine

   layer j is built from layer j-1, so the statement order inside the
   recurrence loop matters: every entry is read back after being stored
------------------------------------------------------------------------- */

void SNAIntel::compute_uarray(const SNA_DVEC &x, const SNA_DVEC &y,
                              const SNA_DVEC &z, const SNA_DVEC &z0,
                              const SNA_DVEC &r, const int jj,
                              const SNA_IVEC &jnum)
{
  // compute Cayley-Klein parameters for unit quaternion
  // a = a_r + i*a_i and b = b_r + i*b_i, normalized by 1/sqrt(r^2 + z0^2)

  const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0);
  const SNA_DVEC a_r = z0 * r0inv;
  const SNA_DVEC a_i = -z * r0inv;
  const SNA_DVEC b_r = y * r0inv;
  const SNA_DVEC b_i = -x * r0inv;

  // VMK Section 4.8.2

  double *ulist_rp = (double *)(ulist_r_ij[jj]);
  double *ulist_ip = (double *)(ulist_i_ij[jj]);

  // first entry (layer j = 0): u = 1 + 0i

  SIMD_store(ulist_rp, SIMD_set(1.0));
  SIMD_store(ulist_ip, SIMD_set(0.0));

  for (int j = 1; j <= twojmax; j++) {
    // jju walks the entries of layer j, jjup those of layer j-1
    int jju = idxu_block[j] * vector_width();
    int jjup = idxu_block[j-1] * vector_width();

    // fill in left side of matrix layer from previous layer

    for (int mb = 0; 2*mb <= j; mb++) {
      // first entry of the row starts from zero and is accumulated below
      SIMD_store(ulist_rp + jju, SIMD_set(0.0));
      SIMD_store(ulist_ip + jju, SIMD_set(0.0));

      for (int ma = 0; ma < j; ma++) {
        double rootpq = rootpqarray[j - ma][j - mb];
        // current entry: holds the zero stored above or the value stored
        // by the previous ma iteration
        SNA_DVEC u_r = SIMD_load(ulist_rp + jju);
        SNA_DVEC u_i = SIMD_load(ulist_ip + jju);
        const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup);
        const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup);

        SNA_DVEC u_ro, u_io;

        // current entry += rootpq * Conj(a) * up

        u_ro = a_r * up_r + a_i * up_i;
        u_r = SIMD_fma(SIMD_set(rootpq), u_ro, u_r);
        SIMD_store(ulist_rp + jju, u_r);
        u_io = a_r * up_i - a_i * up_r;
        u_i = SIMD_fma(SIMD_set(rootpq), u_io, u_i);
        SIMD_store(ulist_ip + jju, u_i);

        jju += vector_width();

        // next entry = -rootpq' * Conj(b) * up (overwritten, not accumulated)

        rootpq = -rootpqarray[ma + 1][j - mb];
        u_r = (b_r * up_r + b_i * up_i) * rootpq;
        SIMD_store(ulist_rp + jju, u_r);
        u_i = (b_r * up_i - b_i * up_r) * rootpq;
        SIMD_store(ulist_ip + jju, u_i);

        jjup += vector_width();
      }
      // step past the last entry of this row
      jju += vector_width();
    }

    // copy left side to right side with inversion symmetry VMK 4.4(2)
    // u[j-ma][j-mb] = (-1)^(ma-mb)*Conj(u[ma][mb])

    // jju runs forward from the first entry of layer j while jjup runs
    // backward from its last entry

    jju = idxu_block[j];
    jjup = (jju+(j+1)*(j+1)-1) * vector_width();
    jju *=  vector_width();
    int mbpar = 1;
    for (int mb = 0; 2*mb <= j; mb++) {
      int mapar = mbpar;
      for (int ma = 0; ma <= j; ma++) {
        if (mapar == 1) {
          // even ma-mb: store Conj(u)
          SIMD_store(ulist_rp + jjup, SIMD_load(ulist_rp + jju));
          SIMD_store(ulist_ip + jjup, -SIMD_load(ulist_ip + jju));
        } else {
          // odd ma-mb: store -Conj(u)
          SIMD_store(ulist_rp + jjup, -SIMD_load(ulist_rp + jju));
          SIMD_store(ulist_ip + jjup, SIMD_load(ulist_ip + jju));
        }
        mapar = -mapar;
        jju += vector_width();
        jjup -= vector_width();
      }
      mbpar = -mbpar;
    }
  }
}

/* ----------------------------------------------------------------------
   Compute derivatives of Wigner U-functions for one neighbor
   see comments in compute_uarray()
------------------------------------------------------------------------- */

void SNAIntel::compute_duarray(const SNA_DVEC &x, const SNA_DVEC &y,
                               const SNA_DVEC &z, const SNA_DVEC &z0,
                               const SNA_DVEC &r, const SNA_DVEC &dz0dr,
                               const SNA_DVEC &wj, const SNA_DVEC &rcut,
                               const int jj, const SNA_IVEC &jnum)
{
  // Fills dulist_r / dulist_i with the three Cartesian derivatives (k = 0,1,2)
  // of the U-functions stored in ulist_r_ij[jj] / ulist_i_ij[jj], then applies
  // the product rule with the switching function and the weight wj.
  //
  //   x, y, z   : components used to build the unit vector up[] = (x,y,z)/r
  //   z0, dz0dr : enter r0inv = 1/sqrt(r^2 + z0^2) and its derivative
  //   r, rcut   : passed on to compute_sfac_dsfac()
  //   wj        : scales both sfac and dsfac
  //   jj        : index into ulist_r_ij, ulist_i_ij, sinnerij and dinnerij
  //   jnum      : not referenced in this function
  //
  // The U and dU arrays are walked through raw double pointers; every logical
  // entry occupies vector_width() consecutive doubles, and each dU entry
  // holds its three components back to back (hence the factor 3 in offsets).

  const SNA_DVEC rinv = SIMD_rcp(r);
  const SNA_DVEC r0inv = SIMD_invsqrt(r * r + z0 * z0);
  SNA_DVEC up[3];
  up[0] = x * rinv;
  up[1] = y * rinv;
  up[2] = z * rinv;

  // a = (z0 - i*z) * r0inv,  b = (y - i*x) * r0inv

  const SNA_DVEC a_r = z0 * r0inv;
  const SNA_DVEC a_i = -z * r0inv;
  const SNA_DVEC b_r = y * r0inv;
  const SNA_DVEC b_i = -x * r0inv;
  const SNA_DVEC dr0invdr = -SIMD_pow(r0inv, 3.0) * (r + z0 * dz0dr);

  // derivatives of r0inv and of a along each Cartesian direction

  SNA_DVEC dr0inv[3], da_r[3], da_i[3];
  for (int k = 0; k < 3; k++) {
    dr0inv[k] = dr0invdr * up[k];
    da_r[k] = dz0dr * up[k] * r0inv + z0 * dr0inv[k];
    da_i[k] = -z * dr0inv[k];
  }
  // explicit z-dependence of a_i = -z * r0inv
  da_i[2] += -r0inv;

  double *ulist_rp = (double *)(ulist_r_ij[jj]);
  double *ulist_ip = (double *)(ulist_i_ij[jj]);
  double *dulist_rp = (double *)(dulist_r[0]);
  double *dulist_ip = (double *)(dulist_i[0]);

  // derivatives of b; the first dU entry (offset 0) is zeroed here as well

  SNA_DVEC db_r[3], db_i[3];
  for (int k = 0; k < 3; k++) {
    SIMD_store(dulist_rp + k * vector_width(), SIMD_set(0.0));
    SIMD_store(dulist_ip + k * vector_width(), SIMD_set(0.0));
    db_r[k] = y * dr0inv[k];
    db_i[k] = -x * dr0inv[k];
  }
  // explicit x-dependence of b_i = -x * r0inv and y-dependence of b_r
  db_i[0] -= r0inv;
  db_r[1] += r0inv;

  for (int j = 1; j <= twojmax; j++) {
    // jju3 : offset of the dU entry being written at level j
    // jjup : offset of the U entry read at level j-1
    // jjup3: offset of the dU entry read at level j-1
    int jju3 = idxu_block[j] * 3 * vector_width();
    int jjup = idxu_block[j-1] * vector_width();
    int jjup3 = jjup * 3;
    for (int mb = 0; 2*mb <= j; mb++) {
      // first entry of the row accumulates, so it has to start from zero
      for (int k = 0; k < 3; k++) {
        SIMD_store(dulist_rp + jju3 + k * vector_width(), SIMD_set(0.0));
        SIMD_store(dulist_ip + jju3 + k * vector_width(), SIMD_set(0.0));
      }

      for (int ma = 0; ma < j; ma++) {
        const double rootpq = rootpqarray[j - ma][j - mb];
        const double mrootpq = -rootpqarray[ma + 1][j - mb];
        const SNA_DVEC up_r = SIMD_load(ulist_rp + jjup);
        const SNA_DVEC up_i = SIMD_load(ulist_ip + jjup);
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = SIMD_load(dulist_rp + jju3);
          SNA_DVEC du_i = SIMD_load(dulist_ip + jju3);
          const SNA_DVEC dup_r = SIMD_load(dulist_rp + jjup3);
          const SNA_DVEC dup_i = SIMD_load(dulist_ip + jjup3);

          SNA_DVEC du_ro, du_io;

          // accumulate the a-term into the current entry

          du_ro = (da_r[k]*up_r + da_i[k]*up_i + a_r*dup_r + a_i*dup_i);
          du_r = SIMD_fma(SIMD_set(rootpq), du_ro, du_r);
          SIMD_store(dulist_rp + jju3, du_r);

          du_io = (da_r[k]*up_i - da_i[k]*up_r + a_r*dup_i - a_i*dup_r);
          du_i = SIMD_fma(SIMD_set(rootpq), du_io, du_i);
          SIMD_store(dulist_ip + jju3, du_i);

          // the b-term initializes the same component of the next entry
          // (3 vectors further on)

          du_r = (db_r[k]*up_r + db_i[k]*up_i + b_r*dup_r + b_i*dup_i);
          SIMD_store(dulist_rp + jju3 + 3 * vector_width(), du_r * mrootpq);

          du_i = (db_r[k]*up_i - db_i[k]*up_r + b_r*dup_i - b_i*dup_r);
          SIMD_store(dulist_ip + jju3 + 3 * vector_width(), du_i * mrootpq);

          jju3 += vector_width();
          jjup3 += vector_width();
        }
        jjup += vector_width();
      } // for ma
      // step over the last entry of the row, written by the b-term above
      jju3 += 3 * vector_width();
    } // for mb

    // copy left side to right side with inversion symmetry VMK 4.4(2)
    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj(u[ma][mb])

    // jju walks forward from the first entry of level j, jjup walks backward
    // from the last one; both address the k = 0 component of an entry
    int jju = idxu_block[j];
    jjup = (jju+(j+1)*(j+1)-1) * 3 * vector_width();
    jju *=  3 * vector_width();
    int mbpar = 1;
    for (int mb = 0; 2*mb <= j; mb++) {
      int mapar = mbpar;
      for (int ma = 0; ma <= j; ma++) {
        if (mapar == 1) {
          for (int k = 0; k < 3; k++) {
            SIMD_store(dulist_rp + jjup, SIMD_load(dulist_rp + jju));
            SIMD_store(dulist_ip + jjup, -SIMD_load(dulist_ip + jju));
            jju += vector_width();
            jjup += vector_width();
          }
        } else {
          for (int k = 0; k < 3; k++) {
            SIMD_store(dulist_rp + jjup, -SIMD_load(dulist_rp + jju));
            SIMD_store(dulist_ip + jjup, SIMD_load(dulist_ip + jju));
            jju += vector_width();
            jjup += vector_width();
          }
        }
        mapar = -mapar;
        // the k loop advanced jjup by 3 vectors; go back 6 to land on the
        // previous entry
        jjup -= 6 * vector_width();
      } // for ma
      mbpar = -mbpar;
    } // for mb
  } // for j

  // product rule with the (weighted) switching function:
  //   dU <- dsfac * U * up[k] + sfac * dU

  SNA_DVEC dsfac;
  SNA_DVEC sfac = compute_sfac_dsfac(r, rcut, sinnerij[jj], dinnerij[jj],
                                      dsfac);
  sfac = sfac * wj;
  dsfac = dsfac * wj;

  for (int j = 0; j <= twojmax; j++) {
    int jju = idxu_block[j] * vector_width();
    int jju3 = jju * 3;
    for (int mb = 0; 2*mb <= j; mb++)
      for (int ma = 0; ma <= j; ma++) {
        const SNA_DVEC ur_dsfac = dsfac * SIMD_load(ulist_rp + jju);
        const SNA_DVEC ui_dsfac = dsfac * SIMD_load(ulist_ip + jju);
        jju += vector_width();
        for (int k = 0; k < 3; k++) {
          SNA_DVEC du_r = ur_dsfac * up[k] + sfac * SIMD_load(dulist_rp+jju3);
          SIMD_store(dulist_rp + jju3, du_r);
          SNA_DVEC du_i = ui_dsfac * up[k] + sfac * SIMD_load(dulist_ip+jju3);
          SIMD_store(dulist_ip + jju3, du_i);
          jju3 += vector_width();
        }
      }
  }
}

/* ----------------------------------------------------------------------
   memory usage of arrays
------------------------------------------------------------------------- */

double SNAIntel::memory_usage()
{
  // tally, in bytes, of the arrays named in the trailing comments

  const double jdimpq = twojmax + 2;
  const double jdim = twojmax + 1;
  const double jdim_cubed = jdim * jdim * jdim;
  const double nneigh = nmax;
  const double nu = idxu_max;
  const double dvec_bytes = sizeof(SNA_DVEC);
  const double ivec_bytes = sizeof(SNA_IVEC);

  double total = 0.0;

  // coefficient tables

  total += jdimpq * jdimpq * sizeof(double);                      // pqarray
  total += static_cast<double>(idxcg_max) * sizeof(double);       // cglist

  // U-function storage (real + imaginary parts, hence the factor 2)

  total += nneigh * nu * dvec_bytes * 2;                          // ulist_ij
  total += nu * nelements * dvec_bytes * 2;                       // ulisttot
  total += nu * 3 * dvec_bytes * 2;                               // dulist

  // Z, B, dB and Y lists

  total += static_cast<double>(idxz_max) * ndoubles * dvec_bytes * 2;   // zlist
  total += static_cast<double>(idxb_max) * ntriples * dvec_bytes;       // blist
  total += static_cast<double>(idxb_max) * ntriples * 3 * sizeof(double); // dblist
  total += nu * nelements * dvec_bytes * 2;                       // ylist

  // index lookup tables

  total += jdim_cubed * sizeof(int);                              // idxcg_block
  total += jdim * sizeof(int);                                    // idxu_block
  total += jdim_cubed * sizeof(int);                              // idxz_block
  total += jdim_cubed * sizeof(int);                              // idxb_block

  total += static_cast<double>(idxz_max) * sizeof(SNA_ZINDICES);  // idxz
  total += static_cast<double>(idxb_max) * sizeof(SNA_BINDICES);  // idxb

  if (bzero_flag) {
    total += jdim * sizeof(double);                               // bzero
  }

  // short neighbor lists

  total += nneigh * 3 * dvec_bytes;                               // rij
  total += nneigh * ivec_bytes;                                   // inside
  total += nneigh * dvec_bytes;                                   // wj
  total += nneigh * dvec_bytes;                                   // rcutij
  total += nneigh * dvec_bytes;                                   // sinnerij
  total += nneigh * dvec_bytes;                                   // dinnerij
  if (chem_flag) {
    total += nneigh * ivec_bytes;                                 // element
  }

  return total;
}

/* ---------------------------------------------------------------------- */

void SNAIntel::create_twojmax_arrays()
{
  // allocate every array whose size is derived from twojmax
  // (through the idx*_max counts) and the number of elements

  const int npq = twojmax + 2;
  const auto n_u_elem = idxu_max*nelements;
  const auto n_z = idxz_max*ndoubles;
  const auto n_b = idxb_max*ntriples;

  memory->create(rootpqarray, npq, npq, "sna:rootpqarray");
  memory->create(cglist, idxcg_max, "sna:cglist");

  memory->create(ulisttot_r, n_u_elem, "sna:ulisttot");
  memory->create(ulisttot_i, n_u_elem, "sna:ulisttot");

  memory->create(dulist_r, idxu_max, 3, "sna:dulist");
  memory->create(dulist_i, idxu_max, 3, "sna:dulist");

  memory->create(zlist_r, n_z, "sna:zlist");
  memory->create(zlist_i, n_z, "sna:zlist");

  memory->create(blist, n_b, "sna:blist");
  memory->create(dblist, n_b, 3, "sna:dblist");

  memory->create(ylist_r, n_u_elem, "sna:ylist");
  memory->create(ylist_i, n_u_elem, "sna:ylist");

  // bzero is only allocated on request; otherwise it is left as a null pointer

  if (!bzero_flag) {
    bzero = nullptr;
  } else {
    memory->create(bzero, twojmax+1,"sna:bzero");
  }
}

/* ---------------------------------------------------------------------- */

void SNAIntel::destroy_twojmax_arrays()
{
  // release the arrays allocated in create_twojmax_arrays()

  memory->destroy(rootpqarray);
  memory->destroy(cglist);

  memory->destroy(ulisttot_r);
  memory->destroy(ulisttot_i);

  memory->destroy(dulist_r);
  memory->destroy(dulist_i);

  memory->destroy(zlist_r);
  memory->destroy(zlist_i);

  memory->destroy(blist);
  memory->destroy(dblist);

  memory->destroy(ylist_r);
  memory->destroy(ylist_i);

  // release the index lookup tables

  memory->destroy(idxcg_block);
  memory->destroy(idxu_block);
  memory->destroy(idxz_block);
  memory->destroy(idxb_block);

  // bzero only exists when bzero_flag is set

  if (bzero_flag) {
    memory->destroy(bzero);
  }
}

/* ----------------------------------------------------------------------
   the function delta given by VMK Eq. 8.2(1)
------------------------------------------------------------------------- */

double SNAIntel::deltacg(int j1, int j2, int j)
{
  // returns sqrt( f((j1+j2-j)/2) * f((j1-j2+j)/2) * f((-j1+j2+j)/2)
  //               / f((j1+j2+j)/2 + 1) ),  f = factorial,
  // where j1, j2, j are the doubled (integer) angular momentum labels

  const double denominator = factorial((j1 + j2 + j) / 2 + 1);
  const double numerator = factorial((j1 + j2 - j) / 2) *
                           factorial((j1 - j2 + j) / 2) *
                           factorial((-j1 + j2 + j) / 2);
  return sqrt(numerator / denominator);
}

/* ----------------------------------------------------------------------
   assign Clebsch-Gordan coefficients using
   the quasi-binomial formula VMK 8.2.1(3)
------------------------------------------------------------------------- */

void SNAIntel::init_clebsch_gordan()
{
  // Fills cglist sequentially, one value per (j1, j2, j, m1, m2) combination
  // in the loop order below. j1, j2, j, m1, m2, m are the integer array
  // offsets; aa2, bb2, cc2 are the doubled projections 2*m1-j1, 2*m2-j2
  // and 2*m-j.

  int idxcg_count = 0;
  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2; j <= MIN(twojmax, j1 + j2); j += 2) {

        // the delta factor depends only on (j1, j2, j), so evaluate it once
        // per triple rather than once per (m1, m2) pair

        const double dcg = deltacg(j1, j2, j);

        for (int m1 = 0; m1 <= j1; m1++) {
          const int aa2 = 2 * m1 - j1;

          for (int m2 = 0; m2 <= j2; m2++) {

            // -c <= cc <= c

            const int bb2 = 2 * m2 - j2;
            const int m = (aa2 + bb2 + j) / 2;

            // projection outside the allowed range: store a zero

            if (m < 0 || m > j) {
              cglist[idxcg_count] = 0.0;
              idxcg_count++;
              continue;
            }

            // summation limits keep every factorial argument non-negative

            const int zmin = MAX(0, MAX(-(j - j2 + aa2) / 2,
                                        -(j - j1 - bb2) / 2));
            const int zmax = MIN((j1 + j2 - j) / 2,
                                 MIN((j1 - aa2) / 2, (j2 + bb2) / 2));

            double sum = 0.0;
            for (int z = zmin; z <= zmax; z++) {
              const int ifac = z % 2 ? -1 : 1;
              sum += ifac /
                (factorial(z) *
                 factorial((j1 + j2 - j) / 2 - z) *
                 factorial((j1 - aa2) / 2 - z) *
                 factorial((j2 + bb2) / 2 - z) *
                 factorial((j - j2 + aa2) / 2 + z) *
                 factorial((j - j1 - bb2) / 2 + z));
            }

            const int cc2 = 2 * m - j;
            const double sfaccg = sqrt(factorial((j1 + aa2) / 2) *
                                       factorial((j1 - aa2) / 2) *
                                       factorial((j2 + bb2) / 2) *
                                       factorial((j2 - bb2) / 2) *
                                       factorial((j  + cc2) / 2) *
                                       factorial((j  - cc2) / 2) *
                                       (j + 1));

            cglist[idxcg_count] = sum * dcg * sfaccg;
            idxcg_count++;
          }
        }
      }
}

/* ----------------------------------------------------------------------
   print out values of Clebsch-Gordan coefficients
   format and notation follows VMK Table 8.11
------------------------------------------------------------------------- */

void SNAIntel::print_clebsch_gordan()
{
  // output is produced only where comm->me is zero

  if (comm->me != 0) return;

  for (int j = 0; j <= twojmax; ++j) {
    printf("c = %g\n",j/2.0);
    printf("a alpha b beta C_{a alpha b beta}^{c alpha+beta}\n");

    for (int j1 = 0; j1 <= twojmax; ++j1) {
      for (int j2 = 0; j2 <= j1; ++j2) {

        // skip (j1, j2) pairs that cannot couple to j

        const bool in_range = (j1-j2 <= j) && (j1+j2 >= j);
        const bool even_sum = ((j1+j2+j)%2 == 0);
        if (!in_range || !even_sum) continue;

        // icg advances once per (m1, m2) pair, printed or not

        int icg = idxcg_block[j1][j2][j];
        for (int m1 = 0; m1 <= j1; ++m1) {
          const int aa2 = 2*m1-j1;
          for (int m2 = 0; m2 <= j2; ++m2, ++icg) {
            const int bb2 = 2*m2-j2;
            const int cc2 = aa2+bb2;
            if (cc2 < -j || cc2 > j) continue;

            // for j1 == j2 only a subset of the (aa2, bb2) pairs is listed

            const bool listed = (j1 != j2) ||
                                (aa2 > bb2 && aa2 >= -bb2) ||
                                (aa2 == bb2 && aa2 >= 0);
            if (listed) {
              printf("%4g %4g %4g %4g %10.6g\n",
                     j1/2.0,aa2/2.0,j2/2.0,bb2/2.0,cglist[icg]);
            }
          }
        }
      }
    }
  }
}

/* ----------------------------------------------------------------------
   pre-compute table of sqrt[p/q], p, q = 1,twojmax
   the p = 0, q = 0 entries are allocated and skipped for convenience.
------------------------------------------------------------------------- */

void SNAIntel::init_rootpqarray()
{
  // rootpqarray[p][q] = sqrt(p/q) for 1 <= p, q <= twojmax;
  // row 0 and column 0 are not written here

  for (int p = 1; p <= twojmax; ++p) {
    const double numerator = static_cast<double>(p);
    double *row = rootpqarray[p];
    for (int q = 1; q <= twojmax; ++q) {
      row[q] = sqrt(numerator/q);
    }
  }
}

/* ---------------------------------------------------------------------- */

void SNAIntel::compute_ncoeff()
{
  int ncount;

  ncount = 0;

  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = j1 - j2;
           j <= MIN(twojmax, j1 + j2); j += 2)
        if (j >= j1) ncount++;

  ndoubles = nelements*nelements;
  ntriples = nelements*nelements*nelements;
  if (chem_flag)
    ncoeff = ncount*ntriples;
  else
    ncoeff = ncount;
}

/* ---------------------------------------------------------------------- */

double SNAIntel::compute_sfac(double r, double rcut, double sinner, double dinner)
{
  // outer switching function:
  //   1 when switching is off or r <= rmin0, 0 when r > rcut,
  //   a half-cosine ramp in between

  double sfac = 1.0;
  if (switch_flag != 0 && !(r <= rmin0)) {
    if (r > rcut) {
      sfac = 0.0;
    } else {
      const double rcutfac = MY_PI / (rcut - rmin0);
      sfac = 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
    }
  }

  // inner switching function, rarely visited:
  //   no effect unless enabled and r < sinner + dinner

  const bool inner_applies = (switch_inner_flag == 1) && (r < sinner + dinner);
  if (!inner_applies) return sfac;

  // at or below sinner - dinner the result is zero

  if (!(r > sinner - dinner)) return 0.0;

  const double rcutfac = MY_PI2 / dinner;
  sfac *= 0.5 * (1.0 - cos(MY_PI2 + (r - sinner) * rcutfac));
  return sfac;
}

/* ---------------------------------------------------------------------- */

SNA_DVEC SNAIntel::compute_sfac(const SNA_DVEC &r, const SNA_DVEC &rcut,
                                const SNA_DVEC &sinner, const SNA_DVEC &dinner)
{
  // Vector counterpart of the scalar compute_sfac(): each lane receives the
  // value the scalar routine returns for that lane's r, rcut, sinner, dinner.

  // calculate sfac = sfac_outer

  // if (switch_flag == 0 || r <= rmin0)
  SNA_DVEC sfac = SIMD_set(1.0);
  if (switch_flag != 0) {
    // r <= rcut && r > rmin0
    const SIMD_mask i(r > rmin0);
    const SIMD_mask m(r <= rcut);
    const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI;
    const SNA_DVEC sfac_m = (SIMD_cos((r - rmin0) * rcutfac) + 1.0) * 0.5;
    sfac = SIMD_set(sfac, m & i, sfac_m);
    // zero the lanes with (r > rcut) && (r > rmin0):
    // SIMD_zero_masked keeps the lanes whose mask bit is set, so the keep
    // mask is the complement, (r <= rcut) || (r <= rmin0)
    sfac = SIMD_zero_masked(m | (~i), sfac);
  }

  // calculate sfac *= sfac_inner, rarely visited

  if (switch_inner_flag == 1) {
    const SIMD_mask m(r < sinner + dinner);
    // if any(m)
    const SIMD_mask i(r > sinner - dinner);
    const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2;
    const SNA_DVEC sfac_m = (SIMD_set(1.0) - SIMD_cos((r-sinner) * rcutfac +
                                                      MY_PI2)) * 0.5;
    // inside the inner switching shell the outer value is multiplied by the
    // inner factor, as in the scalar version
    sfac = SIMD_set(sfac, m & i, sfac * sfac_m);
    // zero the lanes with r <= sinner - dinner
    sfac = SIMD_zero_masked((~m) | i, sfac);
  }

  return sfac;
}

/* ---------------------------------------------------------------------- */

SNA_DVEC SNAIntel::compute_sfac_dsfac(const SNA_DVEC & r,
                                      const SNA_DVEC & rcut,
                                      const SNA_DVEC & sinner,
                                      const SNA_DVEC & dinner,
                                      SNA_DVEC &dsfac)
{
  // Returns the switching function sfac per lane and writes its derivative
  // with respect to r into dsfac. With inner switching enabled the result
  // is the product sfac_outer * sfac_inner and dsfac follows the product
  // rule.

  // calculate sfac = sfac_outer

  // if (switch_flag == 0 || r <= rmin0)
  SNA_DVEC sfac = SIMD_set(1.0);
  dsfac = SIMD_set(0.0);
  if (switch_flag != 0) {
    // r <= rcut && r > rmin0
    const SIMD_mask i(r > rmin0);
    const SIMD_mask m(r <= rcut);
    const SNA_DVEC rcutfac = SIMD_rcp(rcut - rmin0) * MY_PI;
    const SNA_DVEC trig_arg = (r - rmin0) * rcutfac;
    const SNA_DVEC sfac_m = (SIMD_cos(trig_arg) + 1.0) * 0.5;
    const SNA_DVEC dsfac_m = SIMD_sin(trig_arg) * rcutfac * -0.5;
    sfac = SIMD_set(sfac, m & i, sfac_m);
    dsfac = SIMD_set(dsfac, m & i, dsfac_m);
    // zero the lanes with (r > rcut) && (r > rmin0):
    // SIMD_zero_masked keeps the lanes whose mask bit is set, so the keep
    // mask is the complement, (r <= rcut) || (r <= rmin0).
    // dsfac is already zero in those lanes.
    sfac = SIMD_zero_masked(m | (~i), sfac);
  }

  // calculate sfac *= sfac_inner, rarely visited

  if (switch_inner_flag == 1) {
    const SIMD_mask m(r < sinner + dinner);
    const SIMD_mask i(r > sinner - dinner);
    if (any(m & i)) {
      const SNA_DVEC rcutfac = SIMD_rcp(dinner) * MY_PI2;
      const SNA_DVEC trig_arg = (r - sinner) * rcutfac + MY_PI2;
      const SNA_DVEC sfac_inner = (SIMD_set(1.0) - SIMD_cos(trig_arg)) * 0.5;
      const SNA_DVEC dsfac_inner = rcutfac * 0.5 * SIMD_sin(trig_arg);
      // product rule; must use the outer sfac, so update dsfac first
      dsfac = SIMD_set(dsfac, m & i, dsfac * sfac_inner +
                       sfac * dsfac_inner);
      // the value itself is the product of outer and inner factors
      sfac = SIMD_set(sfac, m & i, sfac * sfac_inner);
    }
    // zero the lanes with r <= sinner - dinner
    sfac = SIMD_zero_masked((~m) | i, sfac);
    dsfac = SIMD_zero_masked((~m) | i, dsfac);
  }

  return sfac;
}

// explicit instantiations of SNAIntel::compute_zi_or_yi for the template
// arguments 1 and 0
template void SNAIntel::compute_zi_or_yi<1>(const SNA_DVEC *);
template void SNAIntel::compute_zi_or_yi<0>(const SNA_DVEC *);

#endif
#endif