/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Aidan Thompson, Christian Trott, SNL
------------------------------------------------------------------------- */

#include "sna.h"
#include <math.h>
#include "math_const.h"
#include "math_extra.h"
#include <string.h>
#include <stdlib.h>
#include "openmp_snap.h"

#include "memory.h"
#include "error.h"
#include "comm.h"
#include "atom.h"

using namespace std;
using namespace LAMMPS_NS;
using namespace MathConst;

/* ----------------------------------------------------------------------

   this implementation is based on the method outlined
   in Bartok[1], using formulae from VMK[2].

   for the Clebsch-Gordan coefficients, we
   convert the VMK half-integral labels
   a, b, c, alpha, beta, gamma
   to array offsets j1, j2, j, m1, m2, m
   using the following relations:

   j1 = 2*a
   j2 = 2*b
   j =  2*c

   m1 = alpha+a      2*alpha = 2*m1 - j1
   m2 = beta+b    or 2*beta = 2*m2 - j2
   m =  gamma+c      2*gamma = 2*m - j

   in this way:

   -a <= alpha <= a
   -b <= beta <= b
   -c <= gamma <= c

   becomes:

   0 <= m1 <= j1
   0 <= m2 <= j2
   0 <= m <= j

   and the requirement that
   a+b+c be integral implies that
   j1+j2+j must be even.
   The requirement that:

   gamma = alpha+beta

   becomes:

   2*m - j = 2*m1 - j1 + 2*m2 - j2

   Similarly, for the Wigner U-functions U(J,m,m') we
   convert the half-integral labels J,m,m' to
   array offsets j,ma,mb:

   j = 2*J
   ma = J+m
   mb = J+m'

   so that:

   0 <= j <= 2*Jmax
   0 <= ma, mb <= j.

   For the bispectrum components B(J1,J2,J) we convert to:

   j1 = 2*J1
   j2 = 2*J2
   j = 2*J

   and the requirement:

   |J1-J2| <= J <= J1+J2, for J1+J2+J integral

   becomes:

   |j1-j2| <= j <= j1+j2, for j1+j2+j even integer

   or

   j = |j1-j2|, |j1-j2|+2,...,j1+j2-2,j1+j2

   [1] Albert Bartok-Partay, "Gaussian Approximation..."
   Doctoral Thesis, Cambridge University, (2009)

   [2] D. A. Varshalovich, A. N. Moskalev, and V. K. Khersonskii,
   "Quantum Theory of Angular Momentum," World Scientific (1988)

------------------------------------------------------------------------- */

// Construct an SNA object: store the input parameters, size and allocate
// the bispectrum output vectors (bvec: ncoeff, dbvec: ncoeff x 3) and
// build the (j1,j2,j) index list selected by diagonalstyle_in.
// The per-neighbor arrays (rij, inside, wj, rcutij) start out NULL with
// nmax = 0; grow_rij() allocates them later.

SNA::SNA(LAMMPS* lmp, double rfac0_in,
         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
         double rmin0_in, int switch_flag_in) : Pointers(lmp)
{
  // weight given to the diagonal "self" term in compute_ui()
  wself = 1.0;

  use_shared_arrays = use_shared_arrays_in;
  rfac0 = rfac0_in;
  rmin0 = rmin0_in;
  switch_flag = switch_flag_in;

  // twojmax and diagonalstyle must be set before compute_ncoeff()
  // and build_indexlist() are called
  twojmax = twojmax_in;
  diagonalstyle = diagonalstyle_in;

  ncoeff = compute_ncoeff();

  create_twojmax_arrays();

  bvec = NULL;
  dbvec = NULL;
  memory->create(bvec, ncoeff, "pair:bvec");
  memory->create(dbvec, ncoeff, 3, "pair:dbvec");
  rij = NULL;
  inside = NULL;
  wj = NULL;
  rcutij = NULL;
  nmax = 0;

  // build_indexlist() only assigns idxj/idxj_max for diagonalstyle 0..3;
  // start from an empty list so that loops over idxj_max stay well-defined
  // for any other value
  idxj = NULL;
  idxj_max = 0;

#ifdef TIMING_INFO
  timers = new double[20];
  for(int i = 0; i < 20; i++) timers[i] = 0;
  print = 0;
  counter = 0;
#endif

  build_indexlist();

}

/* ---------------------------------------------------------------------- */

// Release everything this object allocated.

SNA::~SNA()
{
  // rij/inside/wj/rcutij are allocated by grow_rij() only when the
  // arrays are not shared, so they are only released in that case
  if(!use_shared_arrays) {
    destroy_twojmax_arrays();
    memory->destroy(rij);
    memory->destroy(inside);
    memory->destroy(wj);
    memory->destroy(rcutij);
  }

  // bvec/dbvec are allocated unconditionally by the constructor,
  // so they must be released regardless of use_shared_arrays
  memory->destroy(bvec);
  memory->destroy(dbvec);

  delete[] idxj;

#ifdef TIMING_INFO
  // allocated with new[] in the constructor
  delete[] timers;
#endif
}

// Build idxj, the list of (j1,j2,j) triples selected by diagonalstyle:
//   0: all j2 <= j1, j = |j1-j2|,...,Min(twojmax,j1+j2) in steps of 2
//   1: j2 = j1,      j = 0,...,Min(twojmax,2*j1) in steps of 2
//   2: j2 = j = j1
//   3: as style 0, but keeping only triples with j >= j1
// idxj_max receives the number of stored triples.

void SNA::build_indexlist()
{
  // any other style leaves idxj and idxj_max untouched

  if (diagonalstyle < 0 || diagonalstyle > 3) return;

  // pass 0 only counts the triples, pass 1 allocates and stores them

  int ntriples = 0;

  for (int pass = 0; pass < 2; pass++) {
    const bool store = (pass == 1);

    // indexList can be changed here

    if (store) {
      idxj = new SNA_LOOPINDICES[ntriples];
      idxj_max = ntriples;
    }

    int n = 0;

    for (int j1 = 0; j1 <= twojmax; j1++) {

      if (diagonalstyle == 2) {
        if (store) {
          idxj[n].j1 = j1;
          idxj[n].j2 = j1;
          idxj[n].j = j1;
        }
        n++;
        continue;
      }

      // style 1 pins j2 to j1, styles 0 and 3 scan j2 = 0,...,j1

      const int j2first = (diagonalstyle == 1) ? j1 : 0;

      for (int j2 = j2first; j2 <= j1; j2++) {
        const int jlast = MIN(twojmax, j1 + j2);

        for (int j = abs(j1 - j2); j <= jlast; j += 2) {
          if (diagonalstyle == 3 && j < j1) continue;

          if (store) {
            idxj[n].j1 = j1;
            idxj[n].j2 = j2;
            idxj[n].j = j;
          }
          n++;
        }
      }
    }

    ntriples = n;
  }

}
/* ---------------------------------------------------------------------- */

void SNA::init()
{
  // one-time setup of lookup tables; both routines are defined outside
  // this part of the file
  // NOTE(review): presumably these fill cgarray and rootpqarray, which
  // compute_zi() and compute_uarray() read - confirm
  init_clebsch_gordan();
  init_rootpqarray();
}


// Make sure the per-neighbor arrays can hold newnmax entries.
// The arrays never shrink, and their old contents are not preserved.

void SNA::grow_rij(int newnmax)
{
  // current capacity is already sufficient

  if (nmax >= newnmax) return;

  nmax = newnmax;

  // with shared arrays only the recorded capacity is updated;
  // no allocation is done here

  if (use_shared_arrays) return;

  // release the old storage, then allocate at the new size

  memory->destroy(rij);
  memory->destroy(inside);
  memory->destroy(wj);
  memory->destroy(rcutij);

  memory->create(rij, nmax, 3, "pair:rij");
  memory->create(inside, nmax, "pair:inside");
  memory->create(wj, nmax, "pair:wj");
  memory->create(rcutij, nmax, "pair:rcutij");
}
/* ----------------------------------------------------------------------
   compute Ui by summing over neighbors j
------------------------------------------------------------------------- */

void SNA::compute_ui(int jnum)
{
  double rsq, r, x, y, z, z0, theta0;

  // utot(j,ma,mb) = 0 for all j,ma,ma
  // utot(j,ma,ma) = 1 for all j,ma
  // for j in neighbors of i:
  //   compute r0 = (x,y,z,z0)
  //   utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb

  zero_uarraytot();
  addself_uarraytot(wself);

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  for(int j = 0; j < jnum; j++) {
    x = rij[j][0];
    y = rij[j][1];
    z = rij[j][2];
    rsq = x * x + y * y + z * z;
    r = sqrt(rsq);

    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
    //    theta0 = (r - rmin0) * rscale0;
    z0 = r / tan(theta0);

    compute_uarray(x, y, z, z0, r);
    add_uarraytot(r, wj[j], rcutij[j]);
  }

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[0] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif

}

// OpenMP variant of compute_ui(): the per-neighbor U-function evaluation
// is done by compute_uarray_omp() inside a parallel region, while the
// neighbor loop itself and the accumulation into utot stay serial.
//   jnum        = number of neighbors stored in rij/wj/rcutij
//   sub_threads = thread count requested for the inner parallel region

void SNA::compute_ui_omp(int jnum, int sub_threads)
{
  double rsq, r, x, y, z, z0, theta0;

  // utot(j,ma,mb) = 0 for all j,ma,mb
  // utot(j,ma,ma) = wself for all j,ma
  // for j in neighbors of i:
  //   compute r0 = (x,y,z,z0)
  //   utot(j,ma,mb) += sfac*wj*u(r0;j,ma,mb) for all j,ma,mb

  zero_uarraytot();
  addself_uarraytot(wself);

  for(int j = 0; j < jnum; j++) {
    x = rij[j][0];
    y = rij[j][1];
    z = rij[j][2];
    rsq = x * x + y * y + z * z;
    r = sqrt(rsq);
    theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij[j] - rmin0);
    //    theta0 = (r - rmin0) * rscale0;
    z0 = r / tan(theta0);
    // called once per neighbor and outside the _OPENMP guard
    // NOTE(review): presumably openmp_snap.h supplies a fallback for
    // non-OpenMP builds - confirm
    omp_set_num_threads(sub_threads);

    // every thread of the region calls compute_uarray_omp() with the
    // same (shared) arguments
#if defined(_OPENMP)
#pragma omp parallel shared(x,y,z,z0,r,sub_threads) default(none)
#endif
    {
      compute_uarray_omp(x, y, z, z0, r, sub_threads);
    }
    // serial accumulation (add_uarraytot, not add_uarraytot_omp)
    add_uarraytot(r, wj[j], rcutij[j]);
  }


}

/* ----------------------------------------------------------------------
   compute Zi by summing over products of Ui
------------------------------------------------------------------------- */

void SNA::compute_zi()
{
  // for j1 = 0,...,twojmax
  //   for j2 = 0,...,j1
  //     for j = j1-j2,Min(twojmax,j1+j2),2
  //        for mb = 0,...,jmid   (2*mb <= j)
  //          for ma = 0,...,j
  //            z(j1,j2,j,ma,mb) = 0
  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
  //              sumb1 = 0
  //              ma2 = ma-ma1+(j1+j2-j)/2;
  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
  //                mb2 = mb-mb1+(j1+j2-j)/2;
  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  // compute_dbidrj() requires full j1/j2/j chunk of z elements
  // use zarray j1/j2 symmetry: only j2 <= j1 is filled

  for (int j1 = 0; j1 <= twojmax; j1++) {
    double** u1_r = uarraytot_r[j1];
    double** u1_i = uarraytot_i[j1];

    for (int j2 = 0; j2 <= j1; j2++) {
      double** u2_r = uarraytot_r[j2];
      double** u2_i = uarraytot_i[j2];
      const int jlast = MIN(twojmax, j1 + j2);

      for (int j = j1 - j2; j <= jlast; j += 2) {
        double** cg = cgarray[j1][j2][j];
        double** z_r = zarray_r[j1][j2][j];
        double** z_i = zarray_i[j1][j2][j];

        for (int mb = 0; 2*mb <= j; mb++) {
          const int mb1_first = MAX(0, (2 * mb - j - j2 + j1) / 2);
          const int mb1_last = MIN(j1, (2 * mb - j + j2 + j1) / 2);

          for (int ma = 0; ma <= j; ma++) {
            const int ma1_first = MAX(0, (2 * ma - j - j2 + j1) / 2);
            const int ma1_last = MIN(j1, (2 * ma - j + j2 + j1) / 2);

            double& zr = z_r[ma][mb];
            double& zi = z_i[ma][mb];
            zr = 0.0;
            zi = 0.0;

            for (int ma1 = ma1_first; ma1 <= ma1_last; ma1++) {
              const int ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;

              // inner sum over mb1 of cg * u(j1) * u(j2), complex product

              double inner_r = 0.0;
              double inner_i = 0.0;

              for (int mb1 = mb1_first; mb1 <= mb1_last; mb1++) {
                const int mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
                const double cg_b = cg[mb1][mb2];
                const double a_r = u1_r[ma1][mb1];
                const double a_i = u1_i[ma1][mb1];
                const double b_r = u2_r[ma2][mb2];
                const double b_i = u2_i[ma2][mb2];

                inner_r += cg_b * (a_r * b_r - a_i * b_i);
                inner_i += cg_b * (a_r * b_i + a_i * b_r);
              }

              const double cg_a = cg[ma1][ma2];
              zr += inner_r * cg_a;
              zi += inner_i * cg_a;
            }
          }
        }
      }
    }
  }

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[1] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif
}

// OpenMP variant of compute_zi(). Unlike compute_zi(), which stops at
// 2*mb <= j, this routine fills z(j1,j2,j,ma,mb) for the full range
// ma,mb = 0,...,j.
//   sub_threads = thread count requested when already inside a
//                 parallel region

void SNA::compute_zi_omp(int sub_threads)
{
  // for j1 = 0,...,twojmax
  //   for j2 = 0,...,j1
  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
  //        for ma = 0,...,j
  //          for mb = 0,...,j
  //            z(j1,j2,j,ma,mb) = 0
  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
  //              sumb1 = 0
  //              ma2 = ma-ma1+(j1+j2-j)/2;
  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
  //                mb2 = mb-mb1+(j1+j2-j)/2;
  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
  //                  u(j1,ma1,mb1) * u(j2,ma2,mb2)
  //              z(j1,j2,j,ma,mb) += sumb1*cg(j1,ma1,j2,ma2,j)

  // the thread count is only changed when called from inside a parallel
  // region; both calls sit outside the _OPENMP guard
  // NOTE(review): presumably openmp_snap.h supplies fallbacks for
  // non-OpenMP builds - confirm
  if(omp_in_parallel())
    omp_set_num_threads(sub_threads);

  // compute_dbidrj() requires full j1/j2/j chunk of z elements
  // use zarray j1/j2 symmetry

  // the worksharing applies to the outermost (j1) loop only; each
  // (j1,j2,j) chunk of zarray is written by a single iteration of it
#if defined(_OPENMP)
#pragma omp parallel for schedule(auto) default(none)
#endif
  for(int j1 = 0; j1 <= twojmax; j1++)
    for(int j2 = 0; j2 <= j1; j2++)
      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {

    // declared inside the loop body, hence private to each iteration
    double sumb1_r, sumb1_i;
    int ma2, mb2;

    for(int ma = 0; ma <= j; ma++)
      for(int mb = 0; mb <= j; mb++) {
        zarray_r[j1][j2][j][ma][mb] = 0.0;
        zarray_i[j1][j2][j][ma][mb] = 0.0;

        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
            ma1 <= MIN(j1, (2 * ma - j + j2 + j1) / 2); ma1++) {
          sumb1_r = 0.0;
          sumb1_i = 0.0;

          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;

          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2);
              mb1 <= MIN(j1, (2 * mb - j + j2 + j1) / 2); mb1++) {

            mb2 = (2 * mb - j - (2 * mb1 - j1) + j2) / 2;
            // complex product u(j1,ma1,mb1)*u(j2,ma2,mb2), weighted by cg
            sumb1_r += cgarray[j1][j2][j][mb1][mb2] *
	      (uarraytot_r[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2] -
	       uarraytot_i[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2]);
            sumb1_i += cgarray[j1][j2][j][mb1][mb2] *
	      (uarraytot_r[j1][ma1][mb1] * uarraytot_i[j2][ma2][mb2] +
	       uarraytot_i[j1][ma1][mb1] * uarraytot_r[j2][ma2][mb2]);
          }

          zarray_r[j1][j2][j][ma][mb] +=
            sumb1_r * cgarray[j1][j2][j][ma1][ma2];
          zarray_i[j1][j2][j][ma][mb] +=
            sumb1_i * cgarray[j1][j2][j][ma1][ma2];
        }
      }
  }
}

/* ----------------------------------------------------------------------
   compute Bi by summing conj(Ui)*Zi
------------------------------------------------------------------------- */

void SNA::compute_bi()
{
  // for j1 = 0,...,twojmax
  //   for j2 = 0,...,j1
  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
  //        b(j1,j2,j) = 0
  //        for mb = 0,...,jmid
  //          for ma = 0,...,j
  //            b(j1,j2,j) +=
  //              2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb)
  // only the real part of each product is accumulated

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  for (int j1 = 0; j1 <= twojmax; j1++) {
    for (int j2 = 0; j2 <= j1; j2++) {
      const int jlast = MIN(twojmax, j1 + j2);

      for (int j = abs(j1 - j2); j <= jlast; j += 2) {
        double** u_r = uarraytot_r[j];
        double** u_i = uarraytot_i[j];
        double** z_r = zarray_r[j1][j2][j];
        double** z_i = zarray_i[j1][j2][j];

        double& bsum = barray[j1][j2][j];
        bsum = 0.0;

        // columns strictly left of the middle, full height

        for (int mb = 0; 2*mb < j; mb++)
          for (int ma = 0; ma <= j; ma++)
            bsum += u_r[ma][mb] * z_r[ma][mb] +
              u_i[ma][mb] * z_i[ma][mb];

        // for even j the middle column is split: the rows above the
        // centre count fully, the centre element counts half

        if (j%2 == 0) {
          const int mid = j/2;

          for (int ma = 0; ma < mid; ma++)
            bsum += u_r[ma][mid] * z_r[ma][mid] +
              u_i[ma][mid] * z_i[ma][mid];

          bsum += (u_r[mid][mid] * z_r[mid][mid] +
                   u_i[mid][mid] * z_i[mid][mid])*0.5;
        }

        // the factor of two accounts for the half that was not summed

        bsum *= 2.0;
      }
    }
  }

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[2] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif

}

/* ----------------------------------------------------------------------
   copy Bi array to a vector
------------------------------------------------------------------------- */

void SNA::copy_bi2bvec()
{
  // flatten barray into bvec, visiting the (j1,j2,j) triples that
  // diagonalstyle selects, in increasing j1, then j2, then j

  int nout = 0;

  for (int j1 = 0; j1 <= twojmax; j1++) {
    switch (diagonalstyle) {

    case 0:
      // all j2 <= j1
      for (int j2 = 0; j2 <= j1; j2++) {
        const int jlast = MIN(twojmax, j1 + j2);
        for (int j = abs(j1 - j2); j <= jlast; j += 2)
          bvec[nout++] = barray[j1][j2][j];
      }
      break;

    case 1: {
      // j2 = j1, so j runs from 0 to Min(twojmax,2*j1)
      const int jlast = MIN(twojmax, j1 + j1);
      for (int j = 0; j <= jlast; j += 2)
        bvec[nout++] = barray[j1][j1][j];
      break;
    }

    case 2:
      // j = j2 = j1
      bvec[nout++] = barray[j1][j1][j1];
      break;

    case 3:
      // as case 0, restricted to j >= j1
      for (int j2 = 0; j2 <= j1; j2++) {
        const int jlast = MIN(twojmax, j1 + j2);
        for (int j = abs(j1 - j2); j <= jlast; j += 2) {
          if (j < j1) continue;
          bvec[nout++] = barray[j1][j2][j];
        }
      }
      break;

    default:
      break;
    }
  }
}

/* ----------------------------------------------------------------------
   calculate derivative of Ui w.r.t. atom j
------------------------------------------------------------------------- */

void SNA::compute_duidrj(double* rij, double wj, double rcut)
{
  // rij  = displacement vector (3 components) to the neighbor
  // wj   = weight of the neighbor, passed through to compute_duarray()
  // rcut = cutoff used to scale the distance into the angle theta0

  const double dx = rij[0];
  const double dy = rij[1];
  const double dz = rij[2];
  const double distsq = dx * dx + dy * dy + dz * dz;
  const double dist = sqrt(distsq);

  // theta0 = (r - rmin0)*rscale0, z0 = r*cos(theta0)/sin(theta0)

  const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
  const double angle0 = (dist - rmin0) * rscale0;
  const double zeta0 = dist * cos(angle0) / sin(angle0);

  // derivative of z0 with respect to r

  const double dzeta0_dr =
    zeta0 / dist - (dist*rscale0) * (distsq + zeta0 * zeta0) / distsq;

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  compute_duarray(dx, dy, dz, zeta0, dist, dzeta0_dr, wj, rcut);

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[3] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif

}

/* ----------------------------------------------------------------------
   calculate derivative of Bi w.r.t. atom j
   variant using indexlist for j1,j2,j
   variant not using symmetry relation
------------------------------------------------------------------------- */

// Fills dbarray[j1][j2][j][0..2] for every (j1,j2,j) in the index list
// idxj by explicitly differentiating both factors of Conj(u)*z, summing
// over the full range ma,mb = 0,...,j.
// Reads: uarraytot_r/i, duarray_r/i, cgarray, zarray_r/i.
// NOTE(review): zarray is read here for all mb in 0,...,j, but
// compute_zi() in this file only fills 2*mb <= j (compute_zi_omp() fills
// the full range) - confirm the caller provides a fully populated zarray.

void SNA::compute_dbidrj_nonsymm()
{
  // for j1 = 0,...,twojmax
  //   for j2 = 0,twojmax
  //     for j = |j1-j2|,Min(twojmax,j1+j2),2
  //        dbdr(j1,j2,j) = 0
  //        for ma = 0,...,j
  //          for mb = 0,...,j
  //            dzdr = 0
  //            for ma1 = Max(0,ma+(j1-j2-j)/2),Min(j1,ma+(j1+j2-j)/2)
  //              sumb1 = 0
  //              ma2 = ma-ma1+(j1+j2-j)/2;
  //              for mb1 = Max(0,mb+(j1-j2-j)/2),Min(j1,mb+(j1+j2-j)/2)
  //                mb2 = mb-mb1+(j1+j2-j)/2;
  //                sumb1 += cg(j1,mb1,j2,mb2,j) *
  //                  (dudr(j1,ma1,mb1) * u(j2,ma2,mb2) +
  //                  u(j1,ma1,mb1) * dudr(j2,ma2,mb2))
  //              dzdr += sumb1*cg(j1,ma1,j2,ma2,j)
  //            dbdr(j1,j2,j) +=
  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
  //              Conj(u(j,ma,mb))*dzdr
  //
  // the (j1,j2,j) triples actually visited are those stored in idxj

  double* dbdr;
  double* dudr_r, *dudr_i;
  double sumb1_r[3], sumb1_i[3], dzdr_r[3], dzdr_i[3];
  int ma2;

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  for(int JJ = 0; JJ < idxj_max; JJ++) {
    const int j1 = idxj[JJ].j1;
    const int j2 = idxj[JJ].j2;
    const int j = idxj[JJ].j;

    // output slot for this triple: three Cartesian components
    dbdr = dbarray[j1][j2][j];
    dbdr[0] = 0.0;
    dbdr[1] = 0.0;
    dbdr[2] = 0.0;

    // hoist the j1/j2 layers out of the inner loops
    double** *j1duarray_r = duarray_r[j1];
    double** *j2duarray_r = duarray_r[j2];
    double** *j1duarray_i = duarray_i[j1];
    double** *j2duarray_i = duarray_i[j2];
    double** j1uarraytot_r = uarraytot_r[j1];
    double** j2uarraytot_r = uarraytot_r[j2];
    double** j1uarraytot_i = uarraytot_i[j1];
    double** j2uarraytot_i = uarraytot_i[j2];
    double** j1j2jcgarray = cgarray[j1][j2][j];

    for(int ma = 0; ma <= j; ma++)
      for(int mb = 0; mb <= j; mb++) {
        dzdr_r[0] = 0.0;
        dzdr_r[1] = 0.0;
        dzdr_r[2] = 0.0;
        dzdr_i[0] = 0.0;
        dzdr_i[1] = 0.0;
        dzdr_i[2] = 0.0;

        // exclusive upper bounds (hence the +1) for the mb1 and ma1 loops
        const int max_mb1 = MIN(j1, (2 * mb - j + j2 + j1) / 2) + 1;
        const int max_ma1 = MIN(j1, (2 * ma - j + j2 + j1) / 2) + 1;

        for(int ma1 = MAX(0, (2 * ma - j - j2 + j1) / 2);
            ma1 < max_ma1; ma1++) {

          ma2 = (2 * ma - j - (2 * ma1 - j1) + j2) / 2;
          sumb1_r[0] = 0.0;
          sumb1_r[1] = 0.0;
          sumb1_r[2] = 0.0;
          sumb1_i[0] = 0.0;
          sumb1_i[1] = 0.0;
          sumb1_i[2] = 0.0;

          //inside loop 54 operations (mul and add)
          // mb2 starts at mb+(j1+j2-j)/2-mb1 and decreases as mb1
          // increases, so mb1+mb2 stays constant
          for(int mb1 = MAX(0, (2 * mb - j - j2 + j1) / 2),
              mb2 = mb + (j1 + j2 - j) / 2 - mb1;
              mb1 < max_mb1; mb1++, mb2--) {

            double* dudr1_r, *dudr1_i, *dudr2_r, *dudr2_i;

            dudr1_r = j1duarray_r[ma1][mb1];
            dudr2_r = j2duarray_r[ma2][mb2];
            dudr1_i = j1duarray_i[ma1][mb1];
            dudr2_i = j2duarray_i[ma2][mb2];

            // fold the Clebsch-Gordan factor into the u values once
            const double cga_mb1mb2 = j1j2jcgarray[mb1][mb2];
            const double uat_r_ma2mb2 = cga_mb1mb2 * j2uarraytot_r[ma2][mb2];
            const double uat_r_ma1mb1 = cga_mb1mb2 * j1uarraytot_r[ma1][mb1];
            const double uat_i_ma2mb2 = cga_mb1mb2 * j2uarraytot_i[ma2][mb2];
            const double uat_i_ma1mb1 = cga_mb1mb2 * j1uarraytot_i[ma1][mb1];

            for(int k = 0; k < 3; k++) {
              // dudr(j1,ma1,mb1) * u(j2,ma2,mb2), complex product
              sumb1_r[k] += dudr1_r[k] * uat_r_ma2mb2;
              sumb1_r[k] -= dudr1_i[k] * uat_i_ma2mb2;
              sumb1_i[k] += dudr1_r[k] * uat_i_ma2mb2;
              sumb1_i[k] += dudr1_i[k] * uat_r_ma2mb2;

              // u(j1,ma1,mb1) * dudr(j2,ma2,mb2), complex product
              sumb1_r[k] += dudr2_r[k] * uat_r_ma1mb1;
              sumb1_r[k] -= dudr2_i[k] * uat_i_ma1mb1;
              sumb1_i[k] += dudr2_r[k] * uat_i_ma1mb1;
              sumb1_i[k] += dudr2_i[k] * uat_r_ma1mb1;
            }
          } // end loop over mb1,mb2

          // dzdr += sumb1*cg(j1,ma1,j2,ma2,j)

          dzdr_r[0] += sumb1_r[0] * j1j2jcgarray[ma1][ma2];
          dzdr_r[1] += sumb1_r[1] * j1j2jcgarray[ma1][ma2];
          dzdr_r[2] += sumb1_r[2] * j1j2jcgarray[ma1][ma2];
          dzdr_i[0] += sumb1_i[0] * j1j2jcgarray[ma1][ma2];
          dzdr_i[1] += sumb1_i[1] * j1j2jcgarray[ma1][ma2];
          dzdr_i[2] += sumb1_i[2] * j1j2jcgarray[ma1][ma2];
        } // end loop over ma1,ma2

        // dbdr(j1,j2,j) +=
        //   Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb) +
        //   Conj(u(j,ma,mb))*dzdr
        // (only the real part of each product is accumulated)

        dudr_r = duarray_r[j][ma][mb];
        dudr_i = duarray_i[j][ma][mb];

        for(int k = 0; k < 3; k++)
          dbdr[k] +=
            (dudr_r[k] * zarray_r[j1][j2][j][ma][mb] +
             dudr_i[k] * zarray_i[j1][j2][j][ma][mb]) +
            (uarraytot_r[j][ma][mb] * dzdr_r[k] +
             uarraytot_i[j][ma][mb] * dzdr_i[k]);
      } //end loop over ma mb

  } //end loop over j1 j2 j

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif

}

/* ----------------------------------------------------------------------
   helper for compute_dbidrj()
   sum the real part of Conj(dudr(ma,mb))*z(ma,mb) over the left half of
   one (jdim+1) x (jdim+1) layer:
     all rows of the columns with 2*mb < jdim, plus, for even jdim,
     the rows ma < jdim/2 of the middle column and half of the
     centre element (jdim/2,jdim/2)
   dudr_r/dudr_i = layer of derivatives, indexed [ma][mb][k], k = 0,1,2
   z_r/z_i       = layer of z elements, indexed [ma][mb]
   sum           = output, 3 components, overwritten
------------------------------------------------------------------------- */

static void sna_sum_half_conj_dudr_z(int jdim,
                                     double*** dudr_r, double*** dudr_i,
                                     double** z_r, double** z_i,
                                     double* sum)
{
  for(int k = 0; k < 3; k++)
    sum[k] = 0.0;

  // columns strictly left of the middle, full height

  for(int mb = 0; 2*mb < jdim; mb++)
    for(int ma = 0; ma <= jdim; ma++) {
      const double* du_r = dudr_r[ma][mb];
      const double* du_i = dudr_i[ma][mb];
      const double zr = z_r[ma][mb];
      const double zi = z_i[ma][mb];
      for(int k = 0; k < 3; k++)
        sum[k] +=
          du_r[k] * zr +
          du_i[k] * zi;
    }

  // For jdim even, handle middle column

  if (jdim%2 == 0) {
    const int mb = jdim/2;

    for(int ma = 0; ma < mb; ma++) {
      const double* du_r = dudr_r[ma][mb];
      const double* du_i = dudr_i[ma][mb];
      const double zr = z_r[ma][mb];
      const double zi = z_i[ma][mb];
      for(int k = 0; k < 3; k++)
        sum[k] +=
          du_r[k] * zr +
          du_i[k] * zi;
    }

    // the centre element only counts half

    const double* du_r = dudr_r[mb][mb];
    const double* du_i = dudr_i[mb][mb];
    const double zr = z_r[mb][mb];
    const double zi = z_i[mb][mb];
    for(int k = 0; k < 3; k++)
      sum[k] +=
        (du_r[k] * zr +
         du_i[k] * zi)*0.5;
  }
}

/* ----------------------------------------------------------------------
   calculate derivative of Bi w.r.t. atom j
   variant using indexlist for j1,j2,j
   variant using symmetry relation
------------------------------------------------------------------------- */

void SNA::compute_dbidrj()
{
  // for each (j1,j2,j) stored in idxj:
  //        zdb = 0
  //        for mb = 0,...,jmid
  //          for ma = 0,...,j
  //            zdb +=
  //              Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)
  //        dbdr(j1,j2,j) += 2*zdb
  //        zdb = 0
  //        for mb1 = 0,...,j1mid
  //          for ma1 = 0,...,j1
  //            zdb +=
  //              Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)
  //        dbdr(j1,j2,j) += 2*zdb*(j+1)/(j1+1)
  //        zdb = 0
  //        for mb2 = 0,...,j2mid
  //          for ma2 = 0,...,j2
  //            zdb +=
  //              Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)
  //        dbdr(j1,j2,j) += 2*zdb*(j+1)/(j2+1)
  //
  // each of the three half-layer sums is done by
  // sna_sum_half_conj_dudr_z()

  double sumzdu_r[3];
  double** jjjzarray_r;
  double** jjjzarray_i;

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &starttime);
#endif

  for(int JJ = 0; JJ < idxj_max; JJ++) {
    const int j1 = idxj[JJ].j1;
    const int j2 = idxj[JJ].j2;
    const int j = idxj[JJ].j;

    // output slot for this triple: three Cartesian components

    double* dbdr = dbarray[j1][j2][j];
    dbdr[0] = 0.0;
    dbdr[1] = 0.0;
    dbdr[2] = 0.0;

    // Sum terms Conj(dudr(j,ma,mb))*z(j1,j2,j,ma,mb)

    // use zarray j1/j2 symmetry (optional):
    // index zarray with the larger of the first two labels first

    if (j1 >= j2) {
      jjjzarray_r = zarray_r[j1][j2][j];
      jjjzarray_i = zarray_i[j1][j2][j];
    } else {
      jjjzarray_r = zarray_r[j2][j1][j];
      jjjzarray_i = zarray_i[j2][j1][j];
    }

    sna_sum_half_conj_dudr_z(j, duarray_r[j], duarray_i[j],
                             jjjzarray_r, jjjzarray_i, sumzdu_r);

    for(int k = 0; k < 3; k++)
      dbdr[k] += 2.0*sumzdu_r[k];

    // Sum over Conj(dudr(j1,ma1,mb1))*z(j,j2,j1,ma1,mb1)

    const double j1fac = (j+1)/(j1+1.0);

    // use zarray j1/j2 symmetry (optional)

    if (j >= j2) {
      jjjzarray_r = zarray_r[j][j2][j1];
      jjjzarray_i = zarray_i[j][j2][j1];
    } else {
      jjjzarray_r = zarray_r[j2][j][j1];
      jjjzarray_i = zarray_i[j2][j][j1];
    }

    sna_sum_half_conj_dudr_z(j1, duarray_r[j1], duarray_i[j1],
                             jjjzarray_r, jjjzarray_i, sumzdu_r);

    for(int k = 0; k < 3; k++)
      dbdr[k] += 2.0*sumzdu_r[k]*j1fac;

    // Sum over Conj(dudr(j2,ma2,mb2))*z(j1,j,j2,ma2,mb2)

    const double j2fac = (j+1)/(j2+1.0);

    // use zarray j1/j2 symmetry (optional)

    if (j1 >= j) {
      jjjzarray_r = zarray_r[j1][j][j2];
      jjjzarray_i = zarray_i[j1][j][j2];
    } else {
      jjjzarray_r = zarray_r[j][j1][j2];
      jjjzarray_i = zarray_i[j][j1][j2];
    }

    sna_sum_half_conj_dudr_z(j2, duarray_r[j2], duarray_i[j2],
                             jjjzarray_r, jjjzarray_i, sumzdu_r);

    for(int k = 0; k < 3; k++)
      dbdr[k] += 2.0*sumzdu_r[k]*j2fac;

  } //end loop over j1 j2 j

#ifdef TIMING_INFO
  clock_gettime(CLOCK_REALTIME, &endtime);
  timers[4] += (endtime.tv_sec - starttime.tv_sec + 1.0 *
                (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
#endif

}

/* ----------------------------------------------------------------------
   copy Bi derivatives into a vector
------------------------------------------------------------------------- */

void SNA::copy_dbi2dbvec()
{
  // flatten dbarray into dbvec (three components per entry), visiting
  // the (j1,j2,j) triples that diagonalstyle selects, in increasing
  // j1, then j2, then j

  int nout = 0;

  for (int j1 = 0; j1 <= twojmax; j1++) {
    switch (diagonalstyle) {

    case 0:
      // all j2 <= j1
      for (int j2 = 0; j2 <= j1; j2++) {
        const int jlast = MIN(twojmax, j1 + j2);
        for (int j = abs(j1 - j2); j <= jlast; j += 2) {
          for (int k = 0; k < 3; k++)
            dbvec[nout][k] = dbarray[j1][j2][j][k];
          nout++;
        }
      }
      break;

    case 1: {
      // j2 = j1, so j runs from 0 to Min(twojmax,2*j1)
      const int jlast = MIN(twojmax, j1 + j1);
      for (int j = 0; j <= jlast; j += 2) {
        for (int k = 0; k < 3; k++)
          dbvec[nout][k] = dbarray[j1][j1][j][k];
        nout++;
      }
      break;
    }

    case 2:
      // j = j2 = j1
      for (int k = 0; k < 3; k++)
        dbvec[nout][k] = dbarray[j1][j1][j1][k];
      nout++;
      break;

    case 3:
      // as case 0, restricted to j >= j1
      for (int j2 = 0; j2 <= j1; j2++) {
        const int jlast = MIN(twojmax, j1 + j2);
        for (int j = abs(j1 - j2); j <= jlast; j += 2) {
          if (j < j1) continue;
          for (int k = 0; k < 3; k++)
            dbvec[nout][k] = dbarray[j1][j2][j][k];
          nout++;
        }
      }
      break;

    default:
      break;
    }
  }
}

/* ---------------------------------------------------------------------- */

void SNA::zero_uarraytot()
{
  // clear real and imaginary parts of every (j,ma,mb) element

  for (int j = 0; j <= twojmax; j++) {
    double** layer_r = uarraytot_r[j];
    double** layer_i = uarraytot_i[j];

    for (int ma = 0; ma <= j; ma++) {
      double* row_r = layer_r[ma];
      double* row_i = layer_i[ma];

      for (int mb = 0; mb <= j; mb++) {
        row_r[mb] = 0.0;
        row_i[mb] = 0.0;
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

void SNA::addself_uarraytot(double wself_in)
{
  // overwrite the diagonal (ma,ma) of every layer j with wself_in + 0i;
  // off-diagonal elements are left untouched

  for (int j = 0; j <= twojmax; j++) {
    double** layer_r = uarraytot_r[j];
    double** layer_i = uarraytot_i[j];

    for (int m = 0; m <= j; m++) {
      layer_r[m][m] = wself_in;
      layer_i[m][m] = 0.0;
    }
  }
}

/* ----------------------------------------------------------------------
   add Wigner U-functions for one neighbor to the total
------------------------------------------------------------------------- */

void SNA::add_uarraytot(double r, double wj, double rcut)
{
  // scale factor for this neighbor: compute_sfac(r,rcut) times the weight

  const double scale = compute_sfac(r, rcut) * wj;

  // utot(j,ma,mb) += scale * u(j,ma,mb), real and imaginary parts

  for (int j = 0; j <= twojmax; j++) {
    double** tot_r = uarraytot_r[j];
    double** tot_i = uarraytot_i[j];
    double** cur_r = uarray_r[j];
    double** cur_i = uarray_i[j];

    for (int ma = 0; ma <= j; ma++)
      for (int mb = 0; mb <= j; mb++) {
        tot_r[ma][mb] += scale * cur_r[ma][mb];
        tot_i[ma][mb] += scale * cur_i[ma][mb];
      }
  }
}

// Same accumulation as add_uarraytot(), with the outer j loop marked as
// an OpenMP worksharing loop.
//   r, rcut = distance and cutoff passed to compute_sfac()
//   wj      = weight of the neighbor

void SNA::add_uarraytot_omp(double r, double wj, double rcut)
{
  double sfac;

  // computed by every thread that executes this routine
  sfac = compute_sfac(r, rcut);

  sfac *= wj;

  // no "parallel" here: the loop is only split across threads when this
  // routine is called from inside an enclosing parallel region
  // NOTE(review): no caller is visible in this part of the file - confirm
  // it is invoked by all threads of such a region
#if defined(_OPENMP)
#pragma omp for
#endif
  for (int j = 0; j <= twojmax; j++)
    for (int ma = 0; ma <= j; ma++)
      for (int mb = 0; mb <= j; mb++) {
        uarraytot_r[j][ma][mb] +=
          sfac * uarray_r[j][ma][mb];
        uarraytot_i[j][ma][mb] +=
          sfac * uarray_i[j][ma][mb];
      }
}

/* ----------------------------------------------------------------------
   compute Wigner U-functions for one neighbor
------------------------------------------------------------------------- */

void SNA::compute_uarray(double x, double y, double z,
                         double z0, double r)
{
  // Fills uarray_r / uarray_i (real and imaginary parts) for all
  // layers j = 0..twojmax and all 0 <= ma, mb <= j.
  //
  // x, y, z : enter the parameters a_i, b_r, b_i below
  //           (presumably the components of the neighbor vector - confirm)
  // z0      : enters a_r and the normalization
  // r       : only used in the normalization sqrt(r*r + z0*z0)
  //           (presumably the length of (x,y,z) - confirm against caller)

  double r0inv;
  double a_r, b_r, a_i, b_i;
  double rootpq;

  // compute Cayley-Klein parameters for unit quaternion

  r0inv = 1.0 / sqrt(r * r + z0 * z0);
  a_r = r0inv * z0;
  a_i = -r0inv * z;
  b_r = r0inv * y;
  b_i = -r0inv * x;

  // VMK Section 4.8.2

  // layer j = 0 is the single element 1 + 0i

  uarray_r[0][0][0] = 1.0;
  uarray_i[0][0][0] = 0.0;

  // each layer j is built from the previous layer j-1

  for (int j = 1; j <= twojmax; j++) {

    // fill in left side of matrix layer from previous layer
    // (only columns with 2*mb <= j are computed directly)

    for (int mb = 0; 2*mb <= j; mb++) {
      uarray_r[j][0][mb] = 0.0;
      uarray_i[j][0][mb] = 0.0;

      // statement order matters here: iteration ma completes element
      // [j][ma][mb] by accumulating the a-term, then initializes
      // element [j][ma+1][mb] by assigning the b-term, which the
      // next iteration accumulates onto

      for (int ma = 0; ma < j; ma++) {
	rootpq = rootpqarray[j - ma][j - mb];
        uarray_r[j][ma][mb] +=
          rootpq *
          (a_r * uarray_r[j - 1][ma][mb] +
	   a_i * uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma][mb] +=
          rootpq *
          (a_r * uarray_i[j - 1][ma][mb] -
	   a_i * uarray_r[j - 1][ma][mb]);

	rootpq = rootpqarray[ma + 1][j - mb];
        uarray_r[j][ma + 1][mb] =
          -rootpq *
          (b_r * uarray_r[j - 1][ma][mb] +
	   b_i * uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma + 1][mb] =
          -rootpq *
          (b_r * uarray_i[j - 1][ma][mb] -
	   b_i * uarray_r[j - 1][ma][mb]);
      }
    }

    // copy left side to right side with inversion symmetry VMK 4.4(2)
    // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj(u[ma][mb])

    // mbpar and mapar are +1/-1 parity trackers; mapar ends up +1
    // when ma and mb have the same parity and -1 otherwise

    int mbpar = -1;
    for (int mb = 0; 2*mb <= j; mb++) {
      mbpar = -mbpar;
      int mapar = -mbpar;
      for (int ma = 0; ma <= j; ma++) {
    	mapar = -mapar;
    	if (mapar == 1) {
    	  // same parity: plain complex conjugate
    	  uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
    	  uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
    	} else {
    	  // opposite parity: negated complex conjugate
    	  uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
    	  uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
    	}
      }
    }
  }
}

void SNA::compute_uarray_omp(double x, double y, double z,
                             double z0, double r, int sub_threads)
{
  // OpenMP variant of compute_uarray(): fills uarray_r / uarray_i for
  // layers j = 0..twojmax. Unlike compute_uarray(), no symmetry copy
  // is used: columns mb = 0..j-1 are computed from column mb of layer
  // j-1, and the last column mb = j from column mb-1 of layer j-1.
  //
  // NOTE(review): sub_threads is not referenced anywhere in this body.

  double r0inv;
  double a_r, b_r, a_i, b_i;
  double rootpq;

  // compute Cayley-Klein parameters for unit quaternion

  r0inv = 1.0 / sqrt(r * r + z0 * z0);
  a_r = r0inv * z0;
  a_i = -r0inv * z;
  b_r = r0inv * y;
  b_i = -r0inv * x;

  // VMK Section 4.8.2

  uarray_r[0][0][0] = 1.0;
  uarray_i[0][0][0] = 0.0;

  for (int j = 1; j <= twojmax; j++) {

    // columns mb = 0..j-1; iterations over mb write disjoint columns
    // of layer j and only read layer j-1

#if defined(_OPENMP)
#pragma omp for
#endif
    for (int mb = 0; mb < j; mb++) {
      uarray_r[j][0][mb] = 0.0;
      uarray_i[j][0][mb] = 0.0;

      // iteration ma completes element [j][ma][mb] (a-term) and
      // initializes element [j][ma+1][mb] (b-term)

      for (int ma = 0; ma < j; ma++) {
	rootpq = rootpqarray[j - ma][j - mb];
        uarray_r[j][ma][mb] +=
	  rootpq *
          (a_r * uarray_r[j - 1][ma][mb] +
	   a_i * uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma][mb] +=
	  rootpq *
          (a_r * uarray_i[j - 1][ma][mb] -
	   a_i * uarray_r[j - 1][ma][mb]);

	rootpq = rootpqarray[ma + 1][j - mb];
        uarray_r[j][ma + 1][mb] =
	  -rootpq *
          (b_r * uarray_r[j - 1][ma][mb] +
	   b_i * uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma + 1][mb] =
	  -rootpq *
          (b_r * uarray_i[j - 1][ma][mb] -
	   b_i * uarray_r[j - 1][ma][mb]);
      }
    }

    // last column mb = j, built from column mb-1 of layer j-1

    int mb = j;
    uarray_r[j][0][mb] = 0.0;
    uarray_i[j][0][mb] = 0.0;

    // NOTE(review): iteration ma accumulates into element [j][ma][mb]
    // and assigns element [j][ma+1][mb], so neighboring iterations of
    // this worksharing loop touch the same element. If uarray_r and
    // uarray_i are shared by the threads of the team, this looks like
    // a loop-carried dependence (and the zeroing just above is done by
    // every thread without a barrier before the loop) - confirm.

#if defined(_OPENMP)
#pragma omp for
#endif
    for (int ma = 0; ma < j; ma++) {
      rootpq = rootpqarray[j - ma][mb];
      uarray_r[j][ma][mb] +=
	rootpq *
        (b_r * uarray_r[j - 1][ma][mb - 1] -
	 b_i * uarray_i[j - 1][ma][mb - 1]);
      uarray_i[j][ma][mb] +=
	rootpq *
        (b_r * uarray_i[j - 1][ma][mb - 1] +
	 b_i * uarray_r[j - 1][ma][mb - 1]);

      rootpq = rootpqarray[ma + 1][mb];
      uarray_r[j][ma + 1][mb] =
	rootpq *
        (a_r * uarray_r[j - 1][ma][mb - 1] -
	 a_i * uarray_i[j - 1][ma][mb - 1]);
      uarray_i[j][ma + 1][mb] =
	rootpq *
        (a_r * uarray_i[j - 1][ma][mb - 1] +
	 a_i * uarray_r[j - 1][ma][mb - 1]);
    }
  }
}

/* ----------------------------------------------------------------------
   compute derivatives of Wigner U-functions for one neighbor
   see comments in compute_uarray()
------------------------------------------------------------------------- */

void SNA::compute_duarray(double x, double y, double z,
                          double z0, double r, double dz0dr,
			  double wj, double rcut)
{
  // Recomputes uarray_r / uarray_i and fills duarray_r / duarray_i,
  // whose last index k = 0,1,2 selects the x, y, z derivative.
  // On return duarray holds the derivative of (sfac*wj) * u, while
  // uarray itself is left unscaled.
  //
  // x, y, z : enter a_i, b_r, b_i and the unit vector (x,y,z)/r
  // z0      : enters a_r and the normalization
  // r       : divisor for the unit vector; also passed to the
  //           switching function (must be non-zero, 1/r is formed)
  // dz0dr   : derivative of z0 with respect to r
  // wj      : weight multiplying sfac and dsfac
  // rcut    : cutoff passed to compute_sfac() / compute_dsfac()

  double r0inv;
  double a_r, a_i, b_r, b_i;
  double da_r[3], da_i[3], db_r[3], db_i[3];
  double dz0[3], dr0inv[3], dr0invdr;
  double rootpq;

  // unit vector along (x, y, z)

  double rinv = 1.0 / r;
  double ux = x * rinv;
  double uy = y * rinv;
  double uz = z * rinv;

  // same parameters a, b as in compute_uarray()

  r0inv = 1.0 / sqrt(r * r + z0 * z0);
  a_r = z0 * r0inv;
  a_i = -z * r0inv;
  b_r = y * r0inv;
  b_i = -x * r0inv;

  // d(r0inv)/dr, with z0 depending on r through dz0dr

  dr0invdr = -pow(r0inv, 3.0) * (r + z0 * dz0dr);

  // chain rule: radial derivatives times the unit vector

  dr0inv[0] = dr0invdr * ux;
  dr0inv[1] = dr0invdr * uy;
  dr0inv[2] = dr0invdr * uz;

  dz0[0] = dz0dr * ux;
  dz0[1] = dz0dr * uy;
  dz0[2] = dz0dr * uz;

  // derivatives of a = a_r + i*a_i

  for (int k = 0; k < 3; k++) {
    da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k];
    da_i[k] = -z * dr0inv[k];
  }

  // explicit z-dependence of a_i = -z * r0inv

  da_i[2] += -r0inv;

  // derivatives of b = b_r + i*b_i

  for (int k = 0; k < 3; k++) {
    db_r[k] = y * dr0inv[k];
    db_i[k] = -x * dr0inv[k];
  }

  // explicit x-dependence of b_i = -x * r0inv
  // and y-dependence of b_r = y * r0inv

  db_i[0] += -r0inv;
  db_r[1] += r0inv;

  // layer j = 0: u = 1 + 0i, all derivatives zero

  uarray_r[0][0][0] = 1.0;
  duarray_r[0][0][0][0] = 0.0;
  duarray_r[0][0][0][1] = 0.0;
  duarray_r[0][0][0][2] = 0.0;
  uarray_i[0][0][0] = 0.0;
  duarray_i[0][0][0][0] = 0.0;
  duarray_i[0][0][0][1] = 0.0;
  duarray_i[0][0][0][2] = 0.0;

  // build each layer j from layer j-1; the same recursion as in
  // compute_uarray(), with the product rule applied for duarray

  for (int j = 1; j <= twojmax; j++) {
    for (int mb = 0; 2*mb <= j; mb++) {
      uarray_r[j][0][mb] = 0.0;
      duarray_r[j][0][mb][0] = 0.0;
      duarray_r[j][0][mb][1] = 0.0;
      duarray_r[j][0][mb][2] = 0.0;
      uarray_i[j][0][mb] = 0.0;
      duarray_i[j][0][mb][0] = 0.0;
      duarray_i[j][0][mb][1] = 0.0;
      duarray_i[j][0][mb][2] = 0.0;

      // statement order matters: iteration ma accumulates the a-terms
      // into element [ma] and assigns the b-terms to element [ma+1]

      for (int ma = 0; ma < j; ma++) {
        rootpq = rootpqarray[j - ma][j - mb];
        uarray_r[j][ma][mb] += rootpq *
                               (a_r *  uarray_r[j - 1][ma][mb] +
                                a_i *  uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma][mb] += rootpq *
                               (a_r *  uarray_i[j - 1][ma][mb] -
                                a_i *  uarray_r[j - 1][ma][mb]);

        // product rule on the a-term: da * u + a * du

        for (int k = 0; k < 3; k++) {
          duarray_r[j][ma][mb][k] +=
            rootpq * (da_r[k] * uarray_r[j - 1][ma][mb] +
                      da_i[k] * uarray_i[j - 1][ma][mb] +
                      a_r * duarray_r[j - 1][ma][mb][k] +
                      a_i * duarray_i[j - 1][ma][mb][k]);
          duarray_i[j][ma][mb][k] +=
            rootpq * (da_r[k] * uarray_i[j - 1][ma][mb] -
                      da_i[k] * uarray_r[j - 1][ma][mb] +
                      a_r * duarray_i[j - 1][ma][mb][k] -
                      a_i * duarray_r[j - 1][ma][mb][k]);
        }

	rootpq = rootpqarray[ma + 1][j - mb];
        uarray_r[j][ma + 1][mb] =
          -rootpq * (b_r *  uarray_r[j - 1][ma][mb] +
                     b_i *  uarray_i[j - 1][ma][mb]);
        uarray_i[j][ma + 1][mb] =
          -rootpq * (b_r *  uarray_i[j - 1][ma][mb] -
                     b_i *  uarray_r[j - 1][ma][mb]);

        // product rule on the b-term: db * u + b * du

        for (int k = 0; k < 3; k++) {
          duarray_r[j][ma + 1][mb][k] =
            -rootpq * (db_r[k] * uarray_r[j - 1][ma][mb] +
                       db_i[k] * uarray_i[j - 1][ma][mb] +
                       b_r * duarray_r[j - 1][ma][mb][k] +
                       b_i * duarray_i[j - 1][ma][mb][k]);
          duarray_i[j][ma + 1][mb][k] =
            -rootpq * (db_r[k] * uarray_i[j - 1][ma][mb] -
                       db_i[k] * uarray_r[j - 1][ma][mb] +
                       b_r * duarray_i[j - 1][ma][mb][k] -
                       b_i * duarray_r[j - 1][ma][mb][k]);
        }
      }
    }

    // fill the remaining columns from the computed ones: conjugate
    // when ma and mb have the same parity (mapar == 1), negated
    // conjugate otherwise; applied to u and to its derivatives alike

    int mbpar = -1;
    for (int mb = 0; 2*mb <= j; mb++) {
      mbpar = -mbpar;
      int mapar = -mbpar;
      for (int ma = 0; ma <= j; ma++) {
    	mapar = -mapar;
    	if (mapar == 1) {
    	  uarray_r[j][j-ma][j-mb] = uarray_r[j][ma][mb];
    	  uarray_i[j][j-ma][j-mb] = -uarray_i[j][ma][mb];
    	  for (int k = 0; k < 3; k++) {
    	    duarray_r[j][j-ma][j-mb][k] = duarray_r[j][ma][mb][k];
    	    duarray_i[j][j-ma][j-mb][k] = -duarray_i[j][ma][mb][k];
    	  }
    	} else {
    	  uarray_r[j][j-ma][j-mb] = -uarray_r[j][ma][mb];
    	  uarray_i[j][j-ma][j-mb] = uarray_i[j][ma][mb];
    	  for (int k = 0; k < 3; k++) {
    	    duarray_r[j][j-ma][j-mb][k] = -duarray_r[j][ma][mb][k];
    	    duarray_i[j][j-ma][j-mb][k] = duarray_i[j][ma][mb][k];
    	  }
    	}
      }
    }
  }

  // switching function and its radial derivative, both scaled by wj

  double sfac = compute_sfac(r, rcut);
  double dsfac = compute_dsfac(r, rcut);

  sfac *= wj;
  dsfac *= wj;

  // product rule for sfac * u:
  // d(sfac*u)/dx_k = dsfac * u * (unit vector)_k + sfac * du/dx_k
  // the result overwrites duarray in place; uarray is not rescaled

  for (int j = 0; j <= twojmax; j++)
    for (int ma = 0; ma <= j; ma++)
      for (int mb = 0; mb <= j; mb++) {
        duarray_r[j][ma][mb][0] = dsfac * uarray_r[j][ma][mb] * ux +
                                  sfac * duarray_r[j][ma][mb][0];
        duarray_i[j][ma][mb][0] = dsfac * uarray_i[j][ma][mb] * ux +
                                  sfac * duarray_i[j][ma][mb][0];
        duarray_r[j][ma][mb][1] = dsfac * uarray_r[j][ma][mb] * uy +
                                  sfac * duarray_r[j][ma][mb][1];
        duarray_i[j][ma][mb][1] = dsfac * uarray_i[j][ma][mb] * uy +
                                  sfac * duarray_i[j][ma][mb][1];
        duarray_r[j][ma][mb][2] = dsfac * uarray_r[j][ma][mb] * uz +
                                  sfac * duarray_r[j][ma][mb][2];
        duarray_i[j][ma][mb][2] = dsfac * uarray_i[j][ma][mb] * uz +
                                  sfac * duarray_i[j][ma][mb][2];
      }
}

/* ----------------------------------------------------------------------
   memory usage of arrays
------------------------------------------------------------------------- */

double SNA::memory_usage()
{
  // Returns an estimate, in bytes, of the memory held in the
  // twojmax-dependent arrays.
  //
  // All size arithmetic is carried out in double precision. Forming
  // the products in int (jdim*jdim*jdim*jdim*jdim) overflows a 32-bit
  // int once jdim reaches 74, which is undefined behavior. For every
  // value that did not overflow, the result is unchanged.
  //
  // NOTE(review): the terms below do not obviously line up one-to-one
  // with the allocations in create_twojmax_arrays() (e.g. rootpqarray
  // is not counted, and use_shared_arrays is not consulted) - confirm.

  const double jdim = twojmax + 1;
  const double jdim3 = jdim * jdim * jdim;
  const double jdim5 = jdim3 * jdim * jdim;

  double bytes = jdim5 * sizeof(double);
  bytes += 2 * jdim3 * sizeof(complex<double>);
  bytes += 2 * jdim3 * sizeof(double);
  bytes += jdim3 * 3 * sizeof(complex<double>);
  bytes += jdim3 * 3 * sizeof(double);
  bytes += static_cast<double>(ncoeff) * sizeof(double);
  bytes += jdim5 * sizeof(complex<double>);
  return bytes;
}

/* ---------------------------------------------------------------------- */

void SNA::create_twojmax_arrays()
{
  // extent of every j- and m-type dimension: indices 0..twojmax

  const int n = twojmax + 1;

  // arrays that are always owned by this object

  memory->create(cgarray, n, n, n, n, n, "sna:cgarray");
  memory->create(rootpqarray, n + 1, n + 1, "sna:rootpqarray");
  memory->create(barray, n, n, n, "sna:barray");
  memory->create(dbarray, n, n, n, 3, "sna:dbarray");

  memory->create(duarray_r, n, n, n, 3, "sna:duarray");
  memory->create(duarray_i, n, n, n, 3, "sna:duarray");

  memory->create(uarray_r, n, n, n, "sna:uarray");
  memory->create(uarray_i, n, n, n, "sna:uarray");

  // uarraytot and zarray are only allocated here when
  // use_shared_arrays is not set

  if (use_shared_arrays) return;

  memory->create(uarraytot_r, n, n, n, "sna:uarraytot");
  memory->create(zarray_r, n, n, n, n, n, "sna:zarray");
  memory->create(uarraytot_i, n, n, n, "sna:uarraytot");
  memory->create(zarray_i, n, n, n, n, n, "sna:zarray");
}

/* ---------------------------------------------------------------------- */

void SNA::destroy_twojmax_arrays()
{
  // release the arrays that create_twojmax_arrays() always allocates

  memory->destroy(cgarray);
  memory->destroy(rootpqarray);
  memory->destroy(barray);
  memory->destroy(dbarray);
  memory->destroy(duarray_r);
  memory->destroy(duarray_i);
  memory->destroy(uarray_r);
  memory->destroy(uarray_i);

  // uarraytot and zarray are only released here when
  // use_shared_arrays is not set

  if (use_shared_arrays) return;

  memory->destroy(uarraytot_r);
  memory->destroy(zarray_r);
  memory->destroy(uarraytot_i);
  memory->destroy(zarray_i);
}

/* ----------------------------------------------------------------------
   factorial n, wrapper for precomputed table
------------------------------------------------------------------------- */

double SNA::factorial(int n)
{
  if (n < 0 || n > nmaxfactorial) {
    char str[128];
    sprintf(str, "Invalid argument to factorial %d", n);
    error->all(FLERR, str);
  }

  return nfac_table[n];
}

/* ----------------------------------------------------------------------
   factorial n table, size SNA::nmaxfactorial+1
------------------------------------------------------------------------- */

const double SNA::nfac_table[] = {
  // entry n holds n!, starting with 0! on the first line.
  // 0! through 17! are written as integer literals; from 18! on the
  // values are decimal floating-point literals with at most
  // 15 significant digits.
  1,
  1,
  2,
  6,
  24,
  120,
  720,
  5040,
  40320,
  362880,
  3628800,
  39916800,
  479001600,
  6227020800,
  87178291200,
  1307674368000,
  20922789888000,
  355687428096000,
  6.402373705728e+15,
  1.21645100408832e+17,
  2.43290200817664e+18,
  5.10909421717094e+19,
  1.12400072777761e+21,
  2.5852016738885e+22,
  6.20448401733239e+23,
  1.5511210043331e+25,
  4.03291461126606e+26,
  1.08888694504184e+28,
  3.04888344611714e+29,
  8.8417619937397e+30,
  2.65252859812191e+32,
  8.22283865417792e+33,
  2.63130836933694e+35,
  8.68331761881189e+36,
  2.95232799039604e+38,
  1.03331479663861e+40,
  3.71993326789901e+41,
  1.37637530912263e+43,
  5.23022617466601e+44,
  2.03978820811974e+46,
  8.15915283247898e+47,
  3.34525266131638e+49,
  1.40500611775288e+51,
  6.04152630633738e+52,
  2.65827157478845e+54,
  1.1962222086548e+56,
  5.50262215981209e+57,
  2.58623241511168e+59,
  1.24139155925361e+61,
  6.08281864034268e+62,
  3.04140932017134e+64,
  1.55111875328738e+66,
  8.06581751709439e+67,
  4.27488328406003e+69,
  2.30843697339241e+71,
  1.26964033536583e+73,
  7.10998587804863e+74,
  4.05269195048772e+76,
  2.35056133128288e+78,
  1.3868311854569e+80,
  8.32098711274139e+81,
  5.07580213877225e+83,
  3.14699732603879e+85,
  1.98260831540444e+87,
  1.26886932185884e+89,
  8.24765059208247e+90,
  5.44344939077443e+92,
  3.64711109181887e+94,
  2.48003554243683e+96,
  1.71122452428141e+98,
  1.19785716699699e+100,
  8.50478588567862e+101,
  6.12344583768861e+103,
  4.47011546151268e+105,
  3.30788544151939e+107,
  2.48091408113954e+109,
  1.88549470166605e+111,
  1.45183092028286e+113,
  1.13242811782063e+115,
  8.94618213078297e+116,
  7.15694570462638e+118,
  5.79712602074737e+120,
  4.75364333701284e+122,
  3.94552396972066e+124,
  3.31424013456535e+126,
  2.81710411438055e+128,
  2.42270953836727e+130,
  2.10775729837953e+132,
  1.85482642257398e+134,
  1.65079551609085e+136,
  1.48571596448176e+138,
  1.3520015276784e+140,
  1.24384140546413e+142,
  1.15677250708164e+144,
  1.08736615665674e+146,
  1.03299784882391e+148,
  9.91677934870949e+149,
  9.61927596824821e+151,
  9.42689044888324e+153,
  9.33262154439441e+155,
  9.33262154439441e+157,
  9.42594775983835e+159,
  9.61446671503512e+161,
  9.90290071648618e+163,
  1.02990167451456e+166,
  1.08139675824029e+168,
  1.14628056373471e+170,
  1.22652020319614e+172,
  1.32464181945183e+174,
  1.44385958320249e+176,
  1.58824554152274e+178,
  1.76295255109024e+180,
  1.97450685722107e+182,
  2.23119274865981e+184,
  2.54355973347219e+186,
  2.92509369349301e+188,
  3.3931086844519e+190,
  3.96993716080872e+192,
  4.68452584975429e+194,
  5.5745857612076e+196,
  6.68950291344912e+198,
  8.09429852527344e+200,
  9.8750442008336e+202,
  1.21463043670253e+205,
  1.50614174151114e+207,
  1.88267717688893e+209,
  2.37217324288005e+211,
  3.01266001845766e+213,
  3.8562048236258e+215,
  4.97450422247729e+217,
  6.46685548922047e+219,
  8.47158069087882e+221,
  1.118248651196e+224,
  1.48727070609069e+226,
  1.99294274616152e+228,
  2.69047270731805e+230,
  3.65904288195255e+232,
  5.01288874827499e+234,
  6.91778647261949e+236,
  9.61572319694109e+238,
  1.34620124757175e+241,
  1.89814375907617e+243,
  2.69536413788816e+245,
  3.85437071718007e+247,
  5.5502938327393e+249,
  8.04792605747199e+251,
  1.17499720439091e+254,
  1.72724589045464e+256,
  2.55632391787286e+258,
  3.80892263763057e+260,
  5.71338395644585e+262,
  8.62720977423323e+264,
  1.31133588568345e+267,
  2.00634390509568e+269,
  3.08976961384735e+271,
  4.78914290146339e+273,
  7.47106292628289e+275,
  1.17295687942641e+278,
  1.85327186949373e+280,
  2.94670227249504e+282,
  4.71472363599206e+284,
  7.59070505394721e+286,
  1.22969421873945e+289,
  2.0044015765453e+291,
  3.28721858553429e+293,
  5.42391066613159e+295,
  9.00369170577843e+297,
  1.503616514865e+300, // nmaxfactorial = 167
};

/* ----------------------------------------------------------------------
   the function delta given by VMK Eq. 8.2(1)
------------------------------------------------------------------------- */

double SNA::deltacg(int j1, int j2, int j)
{
  // j1, j2, j are doubled angular momenta, so each half-sum below
  // is the integer argument of a factorial

  const double denom = factorial((j1 + j2 + j) / 2 + 1);

  const double f_ab = factorial((j1 + j2 - j) / 2);
  const double f_ac = factorial((j1 - j2 + j) / 2);
  const double f_bc = factorial((-j1 + j2 + j) / 2);

  return sqrt(f_ab * f_ac * f_bc / denom);
}

/* ----------------------------------------------------------------------
   assign Clebsch-Gordan coefficients using
   the quasi-binomial formula VMK 8.2.1(3)
------------------------------------------------------------------------- */

void SNA::init_clebsch_gordan()
{
  // Fills cgarray[j1][j2][j][m1][m2] for all 0 <= j1, j2 <= twojmax,
  // j = |j1-j2|, |j1-j2|+2, ..., min(twojmax, j1+j2), and all
  // (m1, m2) for which the implied m lies in 0..j.
  // Entries outside that set are not written by this function.

  double sum,dcg,sfaccg;
  int m, aa2, bb2, cc2;
  int ifac;

  for (int j1 = 0; j1 <= twojmax; j1++)
    for (int j2 = 0; j2 <= twojmax; j2++)
      for (int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2)
        for (int m1 = 0; m1 <= j1; m1 += 1) {

          // aa2 = 2*alpha in the half-integral labelling

          aa2 = 2 * m1 - j1;

          for (int m2 = 0; m2 <= j2; m2 += 1) {

            // -c <= cc <= c

            // bb2 = 2*beta; m follows from 2*m - j = aa2 + bb2

            bb2 = 2 * m2 - j2;
            m = (aa2 + bb2 + j) / 2;

            // skip combinations whose m falls outside 0..j

            if(m < 0 || m > j) continue;

	    sum = 0.0;

	    // alternating sum over z; the bounds are chosen so that
	    // every factorial argument below is non-negative

	    for (int z = MAX(0, MAX(-(j - j2 + aa2)
				   / 2, -(j - j1 - bb2) / 2));
		z <= MIN((j1 + j2 - j) / 2,
			 MIN((j1 - aa2) / 2, (j2 + bb2) / 2));
		z++) {
	      ifac = z % 2 ? -1 : 1;
	      sum += ifac /
		(factorial(z) *
		 factorial((j1 + j2 - j) / 2 - z) *
		 factorial((j1 - aa2) / 2 - z) *
		 factorial((j2 + bb2) / 2 - z) *
		 factorial((j - j2 + aa2) / 2 + z) *
		 factorial((j - j1 - bb2) / 2 + z));
	    }

	    // cc2 = 2*gamma; prefactors multiplying the sum

	    cc2 = 2 * m - j;
	    dcg = deltacg(j1, j2, j);
	    sfaccg = sqrt(factorial((j1 + aa2) / 2) *
			factorial((j1 - aa2) / 2) *
			factorial((j2 + bb2) / 2) *
			factorial((j2 - bb2) / 2) *
			factorial((j  + cc2) / 2) *
			factorial((j  - cc2) / 2) *
			(j + 1));

	    cgarray[j1][j2][j][m1][m2] = sum * dcg * sfaccg;
	  }
	}
}

/* ----------------------------------------------------------------------
   pre-compute table of sqrt[p/q], p, q = 1,twojmax
   the p = 0, q = 0 entries are allocated and skipped for convenience.
------------------------------------------------------------------------- */

void SNA::init_rootpqarray()
{
  // rootpqarray[p][q] = sqrt(p/q) for p, q = 1..twojmax;
  // rows and columns with index 0 are not written here

  for (int p = 1; p <= twojmax; ++p) {
    const double numer = static_cast<double>(p);
    for (int q = 1; q <= twojmax; ++q) {
      rootpqarray[p][q] = sqrt(numer / q);
    }
  }
}

/* ----------------------------------------------------------------------
   a = j/2
------------------------------------------------------------------------- */

void SNA::jtostr(char* str, int j)
{
  // odd j is half-integral and printed as "j/2";
  // even j is printed as the integer j/2

  const bool half_integral = (j % 2 != 0);

  if (half_integral) {
    sprintf(str, "%d/2", j);
  } else {
    sprintf(str, "%d", j / 2);
  }
}

/* ----------------------------------------------------------------------
   aa = m - j/2
------------------------------------------------------------------------- */

void SNA::mtostr(char* str, int j, int m)
{
  // odd j: print the doubled projection 2*m - j over 2;
  // even j: print the integer projection m - j/2

  const bool half_integral = (j % 2 != 0);

  if (half_integral) {
    sprintf(str, "%d/2", 2 * m - j);
  } else {
    sprintf(str, "%d", m - j / 2);
  }
}

/* ----------------------------------------------------------------------
   list values of Clebsch-Gordan coefficients
   using notation of VMK Table 8.11
------------------------------------------------------------------------- */

void SNA::print_clebsch_gordan(FILE* file)
{
  // text labels for a, b, c and their projections aa, bb, cc

  char lab_a[20], lab_b[20], lab_c[20];
  char lab_aa[20], lab_bb[20], lab_cc[20];

  fprintf(file, "a, aa, b, bb, c, cc, c(a,aa,b,bb,c,cc) \n");

  for (int j1 = 0; j1 <= twojmax; j1++) {
    jtostr(lab_a, j1);

    for (int j2 = 0; j2 <= twojmax; j2++) {
      jtostr(lab_b, j2);

      const int jlo = abs(j1 - j2);
      const int jhi = MIN(twojmax, j1 + j2);

      for (int j = jlo; j <= jhi; j += 2) {
        jtostr(lab_c, j);

        for (int m1 = 0; m1 <= j1; m1++) {
          mtostr(lab_aa, j1, m1);
          const int twice_aa = 2 * m1 - j1;

          for (int m2 = 0; m2 <= j2; m2++) {
            const int twice_bb = 2 * m2 - j2;
            const int m = (twice_aa + twice_bb + j) / 2;

            // only combinations whose m lies in 0..j are listed

            if (m >= 0 && m <= j) {
              mtostr(lab_bb, j2, m2);
              mtostr(lab_cc, j, m);

              fprintf(file, "%s\t%s\t%s\t%s\t%s\t%s\t%g\n",
                      lab_a, lab_aa, lab_b, lab_bb, lab_c, lab_cc,
                      cgarray[j1][j2][j][m1][m2]);
            }
          }
        }
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

int SNA::compute_ncoeff()
{
  // count the (j1, j2, j) triples selected by diagonalstyle;
  // values of diagonalstyle other than 0..3 count nothing

  int total = 0;

  for (int j1 = 0; j1 <= twojmax; j1++) {

    if (diagonalstyle == 2) {
      // exactly one triple per j1
      total++;
      continue;
    }

    if (diagonalstyle == 1) {
      // j2 fixed to j1
      const int j2 = j1;
      const int jtop = MIN(twojmax, j1 + j2);
      for (int j = abs(j1 - j2); j <= jtop; j += 2) total++;
      continue;
    }

    if (diagonalstyle == 0 || diagonalstyle == 3) {
      // all j2 <= j1; style 3 additionally requires j >= j1
      const bool need_j_ge_j1 = (diagonalstyle == 3);
      for (int j2 = 0; j2 <= j1; j2++) {
        const int jtop = MIN(twojmax, j1 + j2);
        for (int j = abs(j1 - j2); j <= jtop; j += 2)
          if (!need_j_ge_j1 || j >= j1) total++;
      }
    }
  }

  return total;
}

/* ---------------------------------------------------------------------- */

double SNA::compute_sfac(double r, double rcut)
{
  // switch_flag 0: no switching, factor is always 1
  // any switch_flag other than 0 or 1: factor is 0

  if (switch_flag == 0) return 1.0;
  if (switch_flag != 1) return 0.0;

  // switch_flag 1: 1 up to rmin0, 0 beyond rcut,
  // cosine taper in between

  if (r <= rmin0) return 1.0;
  if (r > rcut) return 0.0;

  const double rcutfac = MY_PI / (rcut - rmin0);
  return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0);
}

/* ---------------------------------------------------------------------- */

double SNA::compute_dsfac(double r, double rcut)
{
  // derivative with respect to r of the factor from compute_sfac();
  // it is zero wherever that factor is constant

  if (switch_flag != 1) return 0.0;

  if (r <= rmin0) return 0.0;
  if (r > rcut) return 0.0;

  // derivative of the cosine taper between rmin0 and rcut

  const double rcutfac = MY_PI / (rcut - rmin0);
  return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac;
}