lammps/lib/pace/ace_recursive.cpp

/*
 * Performant implementation of atomic cluster expansion and interface to LAMMPS
 *
 * Copyright 2021  (c) Yury Lysogorskiy^1, Cas van der Oord^2, Anton Bochkarev^1,
 * Sarath Menon^1, Matteo Rinaldi^1, Thomas Hammerschmidt^1, Matous Mrovec^1,
 * Aidan Thompson^3, Gabor Csanyi^2, Christoph Ortner^4, Ralf Drautz^1
 *
 * ^1: Ruhr-University Bochum, Bochum, Germany
 * ^2: University of Cambridge, Cambridge, United Kingdom
 * ^3: Sandia National Laboratories, Albuquerque, New Mexico, USA
 * ^4: University of British Columbia, Vancouver, BC, Canada
 *
 *
 * See the LICENSE file.
 * This FILENAME is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.

 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


// Created by Christoph Ortner on 20.12.2020

#include "ace_recursive.h"

#include "ace_abstract_basis.h"
#include "ace_types.h"

/* ------------------------------------------------------------
 *        ACEDAG Implementation
 *        (just the DAG construction, not the traversal)
 * ------------------------------------------------------------ */

/* Notes on different Tags:
 *   rec1 - first basic implementation
 *   rec2 - avoid index arithmetic, contiguous layout,
 *          canonical evaluator in ACE.jl format
 *   rec3 - split nodes into interior and leaf nodes
 */

/**
 * Build the evaluation DAG from a flat (ACE.jl-style) basis specification.
 *
 * The construction runs in two phases:
 *   1. every AA basis function is passed to insert_node(), which may also add
 *      artificial zero-coefficient nodes; the result is collected in the
 *      temporaries nodes_pre / coeffs_pre in insertion ("pre") order;
 *   2. the many-body nodes are re-ordered so that interior nodes (nodes that
 *      are a factor of some other node) come first and leaf nodes last; the
 *      re-ordered graph is stored in nodes / coeffs and the temporaries are
 *      released.
 *
 * @param xAspec     one-particle (A) basis specification; copied into Aspec.
 *                   Its first dimension is the number of one-particle
 *                   functions (num1).
 * @param AAspec     product (AA) basis specification; row iAA holds the
 *                   A-indices of the factors of function iAA (only the first
 *                   orders(iAA) entries of a row are read).
 * @param orders     number of factors of each AA basis function.
 * @param jl_coeffs  coefficients; one row per AA function, one column per
 *                   density.
 * @param _heuristic insertion heuristic, stored and later read by insert_node().
 *
 * On return num_nodes, num2_int, num2_leaf, nodes, coeffs, AAbuf and w are set;
 * dag_idx equals num_nodes.
 */
void ACEDAG::init(Array2D<int> xAspec,
                  Array2D<int> AAspec,
                  Array1D<int> orders,
                  Array2D<DOUBLE_TYPE> jl_coeffs,
                  int _heuristic ) {

    // remember which heuristic insert_node() should use
    heuristic = _heuristic;

    /* Stage one of the graph is just the one-particle (A) basis as a linear
     * list; all information for it is already in xAspec. We keep our own copy
     * because the caller's array may not outlive this object. */
    const int num1 = xAspec.get_dim(0);
    Aspec = xAspec;

    /* Seed the lookup table with the one-particle basis:
     *   DAGmap[ (i1,...,in) ] = index of the node representing the product
     *   of the one-particle functions i1,...,in.
     */
    TDAGMAP DAGmap;
    for (int iA = 0; iA < num1; iA++) {
        vector<int> a(1);
        a[0] = iA;
        DAGmap[a] = iA;
    }

    /* Stage two: build the recursion. Node links go into nodes_pre and the
     * coefficients into coeffs_pre. Both start with room for 2*num2 rows;
     * insert_node() enlarges them when artificial nodes make the graph grow
     * beyond that. At this point every node is treated alike; the split into
     * interior and leaf nodes happens further below. */
    const int num2 = AAspec.get_dim(0);
    const int ndensity = jl_coeffs.get_dim(1);
    nodes_pre.resize(2*num2, 2);
    coeffs_pre.resize(2*num2, ndensity);

    /* the first many-body node gets index num1, right after the num1
     * one-particle basis functions */
    dag_idx = num1;
    /* main loop over the AA basis set: insert every function into the DAG */
    for (int iAA = 0; iAA < num2; iAA++) {
        // index tuple of the current basis function
        const int ord = orders(iAA);
        vector<int> aa(ord);
        for (int t = 0; t < ord; t++) aa[t] = AAspec(iAA, t);
        // its coefficients, one per density
        vector<DOUBLE_TYPE> c(ndensity);
        for (int p = 0; p < ndensity; p++) c[p] = jl_coeffs(iAA, p);
        insert_node(DAGmap, aa, c);
    }

    /* convert to the 3-stage format by re-ordering:
     * interior nodes first, then leaf nodes  */

    // allocate storage
    num_nodes = dag_idx;                      // total size of the dag
    const int num_mb = num_nodes - num1;      // number of many-body nodes
    nodes.resize(num_mb, 2);
    coeffs.resize(num_mb, ndensity);

    // a node "has a child" when it appears as a factor of another node
    haschild.resize(num_mb);
    haschild.fill(false);
    for (int iAA = 0; iAA < num_mb; iAA++) {
        if (nodes_pre(iAA, 0) >= num1)
            haschild(nodes_pre(iAA, 0)-num1) = true;
        if (nodes_pre(iAA, 1) >= num1)
            haschild(nodes_pre(iAA, 1)-num1) = true;
    }

    /* Table from pre-order node index to post-order node index. Only
     * one-particle and interior nodes are ever looked up; the one-particle
     * basis keeps its order. A zero-initialised vector yields the same values
     * a default-inserting map<int,int> would, without the tree lookups. */
    vector<int> neworder(num_nodes, 0);
    for (int iA = 0; iA < num1; iA++)
        neworder[iA] = iA;

    // first pass: place all interior nodes
    num2_int = 0;
    num2_leaf = 0;
    dag_idx = num1;
    for (int iAA = 0; iAA < num_mb; iAA++) {
        if (!haschild(iAA))
            continue;
        num2_int += 1;
        // factor indices before re-ordering
        const int i1pre = nodes_pre(iAA, 0);
        const int i2pre = nodes_pre(iAA, 1);
        // factor indices after re-ordering; factors were inserted before this
        // node, so their new positions are already known
        const int i1 = neworder[i1pre];
        const int i2 = neworder[i2pre];
        // place the node: num1+iAA is its old index, dag_idx its new one
        neworder[num1+iAA] = dag_idx;
        nodes(dag_idx-num1, 0) = i1;
        nodes(dag_idx-num1, 1) = i2;
        for (int t = 0; t < ndensity; t++)
            coeffs(dag_idx-num1, t) = coeffs_pre(iAA, t);
        dag_idx++;
    }

    // second pass: append all leaf nodes
    for (int iAA = 0; iAA < num_mb; iAA++) {
        if (haschild(iAA))
            continue;
        num2_leaf += 1;
        // factor indices before re-ordering
        const int i1pre = nodes_pre(iAA, 0);
        const int i2pre = nodes_pre(iAA, 1);
        // nobody refers to a leaf, so its new index need not be recorded
        nodes(dag_idx-num1, 0) = neworder[i1pre];
        nodes(dag_idx-num1, 1) = neworder[i2pre];
        for (int t = 0; t < ndensity; t++)
            coeffs(dag_idx-num1, t) = coeffs_pre(iAA, t);
        dag_idx++;
    }
#ifdef DEBUG
    cout << "num2_int = " << num2_int << "; num2_leaf = " << num2_leaf << "\n";
#endif
    // free up memory that is no longer needed
    nodes_pre.resize(0, 0);
    coeffs_pre.resize(0, 0);
    haschild.resize(0);

    /* finalize dag: allocate buffer storage  */
    AAbuf.resize(num1 + num2_int);
    w.resize(num_nodes);
    // TODO: technically only need num1 + num2_int for w, this can save  one
    //       memory access later, probably not worth the crazy code duplication.
}

/**
 * Insert the product basis function with index tuple `a` and coefficients `c`
 * into the DAG under construction.
 *
 * If `a` can be split into two tuples that are both already in DAGmap, the
 * node is appended to nodes_pre / coeffs_pre, registered in DAGmap and
 * dag_idx is advanced. Otherwise artificial zero-coefficient nodes are
 * inserted first (recursively, chosen according to `heuristic`) and the
 * insertion of `a` is retried.
 *
 * @param DAGmap  tuple -> node index table; updated with every inserted node.
 * @param a       index tuple of the function to insert.
 * @param c       coefficients of the function, one per density.
 *
 * @throws std::logic_error if no 2-partition of `a` has a component in the
 *         graph, or if `heuristic` is not a supported value for a tuple that
 *         needs artificial nodes. The latter used to print a warning and
 *         return without inserting anything; a caller retrying the insertion
 *         would then recurse without end, so it is reported as an error.
 */
void ACEDAG::insert_node(TDAGMAP &DAGmap, vector<int> a, vector<DOUBLE_TYPE> c) {
    /* start with a list of all possible partitions into 2 groups
     * and check whether any of these nodes are already in the dag */
    auto partitions = find_2partitions(a);
    const int ndensity = c.size();
    const int num1 = get_num1();

    // TODO: first try to find partitions into nodes that are already parents
    //       that way we will get more leaf nodes!
    for (TPARTITION const& part : partitions) {
        /* this is the good case; both factors are already in the graph;
         * add the new node and return. This is also the only place in the
         * code where an actual insert happens. */
        if (DAGmap.count(part.first) && DAGmap.count(part.second)) {
            if (nodes_pre.get_dim(0) < dag_idx + 1) { //check if array is sufficiently large
                // NOTE(review): growing here assumes resize() keeps the rows
                // that were already written - confirm against the array class.
                int newsize = (dag_idx * 3) / 2;
                nodes_pre.resize(newsize, 2); // grow arrays if necessary
                coeffs_pre.resize(newsize, ndensity);
            }
            const int i1 = DAGmap[part.first];
            const int i2 = DAGmap[part.second];
            // rows are relative to the first many-body node, hence "- num1"
            nodes_pre(dag_idx - num1, 0) = i1;
            nodes_pre(dag_idx - num1, 1) = i2;
            DAGmap[a] = dag_idx;
            for (int k = 0; k < ndensity; k++)
                coeffs_pre(dag_idx - num1, k) = c[k];
            dag_idx += 1;
            return;
        }
    }

    /* if we are here, then this means, the new node cannot yet be inserted.
     * We first need to insert some intermediate auxiliary nodes.  For this
     * we use a simple heuristic:
     *    try to find a partition where one of the two nodes is already
     *    in the graph, if there are several, then we remember the one with
     *    the longest such component (this is a very greedy heuristic!!)
     *  .... (continue below) ....
     */
    TPARTITION longest;
    int longest_length = 0;
    for (auto const& part : partitions) {
        int len = 0;
        if (DAGmap.count(part.first)) {
            len = part.first.size();
        } else if (DAGmap.count(part.second)) {
            len = part.second.size();
        }
        if ((len > 0) && (len > longest_length)) {
            longest_length = len;
            longest = part;
        }
    }

    /* sanity check */
    if (longest_length == 0) {
        std::stringstream error_message;
        error_message << "WARNING : something has gone horribly wrong! `longest_length == 0`! \n";
        error_message << "a = [";
        for (int ai : a)
            error_message << ai << ", ";
        error_message << "]\n";
        throw std::logic_error(error_message.str());
    }

    /* If there is a partition with one component already in the graph,
     * then we only need to add in the other component. Note that there will
     * always be at least one such partition, namely all those containing
     * a one-element node e.g. (1,2,3,4) -> (1,) (2,3,4)  then (1,) is
     * a one-particle basis function and hence always in the graph.
     * If heuristic == 0, then we just take one of those partitions and move on.
     *
     * We also accept the found partition if longest_length > 1.
     * And we also accept it if we have a 2- or 3-correlation.
     */

    if (     (heuristic == 0)
          || (longest_length > 1)
          || (a.size() <= 3))       {
        /* insert the other node that isn't in the DAG yet
        * this is an artificial node so it gets zero-coefficients
        * This step is recursive, so more than one node might be inserted here */
        vector<DOUBLE_TYPE> cz(ndensity, 0.0);
        if (DAGmap.count(longest.first))
            insert_node(DAGmap, longest.second, cz);
        else
            insert_node(DAGmap, longest.first, cz);
    }

    /* Second heuristic : heuristic == 1
     * Focus on inserting artificial 2-correlations
     */
    else if (heuristic == 1) {
        // here we know that longest_length == 1 and a.size() >= 4.
        // generate an artificial partition: first two indices | the rest
        vector<int> a1(a.begin(), a.begin() + 2);
        vector<int> a2(a.begin() + 2, a.end());
        vector<DOUBLE_TYPE> cz(ndensity, 0.0);
        // and insert both (we know neither are in the DAG yet)
        insert_node(DAGmap, a1, cz);
        insert_node(DAGmap, a2, cz);
    } else {
        /* Unsupported heuristic value. Returning silently would leave `a`
         * out of the graph and make a retrying caller loop forever, so fail
         * loudly, in the same way as the sanity check above. */
        std::stringstream error_message;
        error_message << "ACEDAG::insert_node: unsupported heuristic = " << heuristic
                      << ", cannot insert a = [";
        for (int ai : a)
            error_message << ai << ", ";
        error_message << "]\n";
        throw std::logic_error(error_message.str());
    }


    /* now we should be ready to insert the entire tuple `a` since there is now
     * an eligible parent pair. Here we recompute the partition of `a`, but
     * that's a small price to pay for a clearer code. Maybe this can be
     * optimized a bit by wrapping it all into a while loop or having a second
     * version of `insert_node` ... */
    insert_node(DAGmap, a, c);
}

/**
 * Enumerate the ways of splitting `v` into two non-empty groups.
 *
 * Each candidate is encoded by an integer mask whose bit `pos` tells whether
 * v[pos] goes into the first group (bit set) or the second (bit clear).
 * Masks run from 1 up to, but excluding, (1<<N)/2, so the last element of
 * `v` always lands in the second group and no split is produced twice.
 * Elements keep their relative order inside each group.
 *
 * @param v  the index tuple to split.
 * @return   the list of (first group, second group) pairs, in increasing mask
 *           order; empty when `v` has fewer than two elements.
 */
TPARTITIONS ACEDAG::find_2partitions(vector<int> v) {
    const int len = v.size();
    const int mask_end = (1<<len)/2;

    TPARTITIONS result;
    for (int mask = 1; mask < mask_end; mask++) {
        vector<int> chosen;
        vector<int> rest;
        chosen.reserve(len);
        rest.reserve(len);

        // distribute the elements according to the bits of the mask
        for (int pos = 0; pos < len; pos++) {
            const bool in_first = ((mask >> pos) & 1) == 1;
            if (in_first)
                chosen.push_back(v[pos]);
            else
                rest.push_back(v[pos]);
        }

        result.push_back(make_pair(chosen, rest));
    }
    return result;
}

/**
 * Write a human-readable dump of the DAG to standard output: the sizes, the
 * four columns of every one-particle specification row, and the two factor
 * indices of every many-body node.
 *
 * The four A-spec columns are separated as `c0;c1,c2,c3` (the layout used by
 * test_acejlformat()). Streaming them back to back made different rows
 * indistinguishable, e.g. (0,1,12,-3) and (0,11,2,-3) both gave "0112-3".
 */
void ACEDAG::print() {
    const auto n1 = get_num1();
    const auto n2 = get_num2();

    cout << "DAG Specification: \n" ;
    cout << "          n1 : " << n1 << "\n";
    cout << "          n2 : " << n2 << "\n";
    cout << "   num_nodes : " << num_nodes << "\n";
    cout << "--------------------\n";
    cout << "A-spec: \n";
    for (int iA = 0; iA < n1; iA++) {
        cout << iA << " : "
             << Aspec(iA, 0) << ";"
             << Aspec(iA, 1) << ","
             << Aspec(iA, 2) << ","
             << Aspec(iA, 3) << "\n";
    }

    cout << "-----------\n";
    cout << "AA-tree\n";

    // many-body nodes are numbered after the n1 one-particle functions
    for (int iAA = 0; iAA < n2; iAA++) {
        cout << iAA + n1 << " : " <<
            nodes(iAA, 0) << ", " << nodes(iAA, 1) << "\n";
    }
}


/* ------------------------------------------------------------
 *        ACERecursiveEvaluator
 * ------------------------------------------------------------ */


/**
 * Attach a basis set to this evaluator and (re)initialise it.
 *
 * @param bas        basis set to evaluate; only its address is stored, so it
 *                   must stay alive while the evaluator uses it.
 * @param heuristic  forwarded unchanged to init().
 */
void ACERecursiveEvaluator::set_basis(ACECTildeBasisSet &bas, int heuristic) {
    ACECTildeBasisSet *const attached = &bas;
    basis_set = attached;
    init(attached, heuristic);
}

/**
 * Allocate all work arrays for the given basis set and build the recursive
 * evaluation DAG.
 *
 * @param basis_set  basis set to initialise for. It is also stored in the
 *                   member of the same name, because acejlformat() reads the
 *                   member rather than this parameter; without that a direct
 *                   call of init() would convert whatever basis the member
 *                   happened to point to.
 * @param heuristic  node-insertion heuristic passed on to the DAG.
 *
 * @throws std::invalid_argument if `basis_set` is null.
 */
void ACERecursiveEvaluator::init(ACECTildeBasisSet *basis_set, int heuristic) {

    if (basis_set == nullptr) {
        throw std::invalid_argument("ACERecursiveEvaluator: basis set is not assigned");
    }
    // the parameter shadows the member: keep both in sync
    this->basis_set = basis_set;

    ACEEvaluator::init(basis_set);


    weights.init(basis_set->nelements, basis_set->nradmax + 1, basis_set->lmax + 1,
                 "weights");

    weights_rank1.init(basis_set->nelements, basis_set->nradbase, "weights_rank1");


    // per-neighbour caches start with room for a single neighbour;
    // resize_neighbours_cache() enlarges them on demand
    DG_cache.init(1, basis_set->nradbase, "DG_cache");
    DG_cache.fill(0);

    R_cache.init(1, basis_set->nradmax, basis_set->lmax + 1, "R_cache");
    R_cache.fill(0);

    DR_cache.init(1, basis_set->nradmax, basis_set->lmax + 1, "DR_cache");
    DR_cache.fill(0);

    Y_cache.init(1, basis_set->lmax + 1, "Y_cache");
    Y_cache.fill({0, 0});

    DY_cache.init(1, basis_set->lmax + 1, "dY_dense_cache");
    DY_cache.fill({0.});

    //hard-core repulsion
    DCR_cache.init(1, "DCR_cache");
    DCR_cache.fill(0);
    dB_flatten.init(basis_set->max_dB_array_size, "dB_flatten");

    /* convert to ACE.jl format to prepare for construction of DAG
     * This will fill the arrays jl_Aspec, jl_AAspec, jl_orders, jl_coeffs
     */
    acejlformat();

    // test_acejlformat();

    // now pass this info into the DAG
    dag.init(jl_Aspec, jl_AAspec, jl_orders, jl_coeffs, heuristic);

    // finally empty the temporary arrays to clear up the memory...
    // TODO
}


void ACERecursiveEvaluator::acejlformat() {

    int func_ms_ind = 0;
    int func_ms_t_ind = 0;// index for dB
    int j, jj, func_ind, ms_ind;

    SPECIES_TYPE mu_i = 0;//TODO: multispecies
    const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i];
    ACECTildeBasisFunction *basis = basis_set->basis[mu_i];

    int AAidx = 0;
    RANK_TYPE order, t;
    SPECIES_TYPE *mus;
    NS_TYPE *ns;
    LS_TYPE *ls;
    MS_TYPE *ms;

    /* transform basis into new format:
       [A1 ... A_num1]
       [(i1,i2)(i1,i2)(...)(i1,i2,i3)(...)]
       where each ia represents an A_{ia}
    */

    /* compute max values for mu, n, l, m */
    SPECIES_TYPE maxmu = 0; //TODO: multispecies
    NS_TYPE maxn = basis_set->nradmax;
    LS_TYPE maxl = basis_set->lmax;
    RANK_TYPE maxorder = basis_set->rankmax;
    const DENSITY_TYPE ndensity = basis_set->ndensitymax;

    int num1 = 0;


    /* create a 4D lookup table for the 1-p basis
     * TODO: replace with a map??
     */
    Array4D<int> A_lookup(int(maxmu+1), int(maxn), int(maxl+1), int(2*maxl+1));
    for (int mu = 0; mu < maxmu+1; mu++)
        for (int n = 0; n < maxn; n++)
            for (int l = 0; l < maxl+1; l++)
                for (int m = 0; m < 2*maxl+1; m++)
                    A_lookup(mu, n, l, m) = -1;
    int A_idx = 0;  // linear index of A basis function (1-particle)
    for (func_ind = 0; func_ind < total_basis_size; ++func_ind) {
        ACECTildeBasisFunction *func = &basis[func_ind];
//        func->print();
        order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls;
        for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) {
            ms = &func->ms_combs[ms_ind * order];
            for (t = 0; t < order; t++) {
                int iA = A_lookup(mus[t], ns[t]-1, ls[t], ms[t]+ls[t]);
                if (iA == -1) {
                    A_lookup(mus[t], ns[t] - 1, ls[t], ms[t] + ls[t]) = A_idx;
                    A_idx += 1;
                }
            }
        }
    }

    /* create the reverse list: linear indixes to mu,l,m,n
       this keeps only the basis functions we really need */
    num1 = A_idx;
    Array2D<int> & Aspec = jl_Aspec;
    Aspec.resize(num1, 4);
    // Array2D<int> Aspec(num1, 4);
    for (int mu = 0; mu <= maxmu; mu++)
        for (int n = 1; n <= maxn; n++)
            for (int l = 0; l <= maxl; l++)
                for (int m = -l; m <= l; m++) {
                    int iA = A_lookup(mu, n-1, l, l+m);
                    if (iA != -1) {
                        Aspec(iA, 0) = mu;
                        Aspec(iA, 1) = n;
                        Aspec(iA, 2) = l;
                        Aspec(iA, 3) = m;
                    }
                }

    /* ============ HALF-BASIS TRICK START ============ */
    for (func_ind = 0; func_ind < total_basis_size; ++func_ind) {
        ACECTildeBasisFunction *func = &basis[func_ind];
        order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls;
        if (!( (mus[0] <= maxmu) && (ns[0] <= maxn) && (ls[0] <= maxl) ))
            continue;

        for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) {
            ms = &func->ms_combs[ms_ind * order];

            // find first positive and negative index
            int pos_idx = order + 1;
            int neg_idx = order + 1;
            for (t = order-1; t >= 0; t--)
                if (ms[t] > 0) pos_idx = t;
                else if (ms[t] < 0) neg_idx = t;

            // if neg_idx < pos_idx then this means that ms is non-zero
            // and that the first non-zero index is negative, hence this is
            // a negative-sign tuple which we want to combine into
            // its opposite.
            if (neg_idx < pos_idx) {
                // find the opposite tuple
                int ms_ind2 = 0;
                MS_TYPE *ms2;
                bool found_opposite = false;
                for (ms_ind2 = 0; ms_ind2 < func->num_ms_combs; ++ms_ind2) {
                    ms2 = &func->ms_combs[ms_ind2 * order];
                    bool isopposite = true;
                    for (t = 0; t < order; t++)
                        if (ms[t] != -ms2[t]) {
                            isopposite = false;
                            break;
                        }
                    if (isopposite) {
                        found_opposite = true;
                        break;
                    }
                }

                if (ms_ind == ms_ind2) {
                    cout << "WARNING - ms_ind == ms_ind2 \n";
                }

                // now we need to overwrite the coefficients
                if (found_opposite)  {
                    int sig = 1;
                    for (t = 0; t < order; t++)
                        if (ms[t] < 0)
                            sig *= -1;
                    for (int p = 0; p < ndensity; ++p) {
                        func->ctildes[ms_ind2 * ndensity + p] +=
                                func->ctildes[ms_ind * ndensity + p];
                        func->ctildes[ms_ind * ndensity + p] = 0.0;
                    }
                }
            }
        }
    }

    // /* ============ HALF-BASIS TRICK END ============ */


    /* count number of basis functions, keep only non-zero!!  */
    int num2 = 0;
    for (func_ind = 0; func_ind < total_basis_size; ++func_ind)  {
        ACECTildeBasisFunction *func = &basis[func_ind];
        for (ms_ind = 0; ms_ind < (&basis[func_ind])->num_ms_combs; ++ms_ind, ++func_ms_ind) {
            // check that the coefficients are actually non-zero
            bool isnonzero = false;
            for (DENSITY_TYPE p = 0; p < ndensity; ++p)
                if (func->ctildes[ms_ind * ndensity + p] != 0.0)
                    isnonzero = true;
            if (isnonzero)
                num2++;
        }
    }


    /* Now create the AA basis links into the A basis */
    num1 = A_idx;   // total number of A-basis functions that we keep
    // Array1D<int> AAorders(num2);
    Array1D<int> & AAorders = jl_orders;
    AAorders.resize(num2);
    // Array2D<int> AAspec(num2, maxorder);    // specs of AA basis functions
    Array2D<int> & AAspec = jl_AAspec;
    AAspec.resize(num2, maxorder);
    jl_coeffs.resize(num2, ndensity);
    AAidx = 0;                          // linear index into AA basis function
    int len_flat = 0;
    for (func_ind = 0; func_ind < total_basis_size; ++func_ind) {
        ACECTildeBasisFunction *func = &basis[func_ind];
        order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls;
        if (!((mus[0] <= maxmu) && (ns[0] <= maxn) && (ls[0] <= maxl)))        //fool-proofing of functions
            continue;

        for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) {
            ms = &func->ms_combs[ms_ind * order];

            // check that the coefficients are actually non-zero
            bool iszero = true;
            for (DENSITY_TYPE p = 0; p < ndensity; ++p)
                if (func->ctildes[ms_ind * ndensity + p] != 0.0)
                    iszero = false;
            if (iszero) continue;

            AAorders(AAidx) = order;
            for (t = 0; t < order; t++) {
                int Ait = A_lookup(int(mus[t]), int(ns[t]-1), int(ls[t]), int(ms[t])+int(ls[t]));
                AAspec(AAidx, t) = Ait;
                len_flat += 1;
            }
            for (t = order; t < maxorder; t++) AAspec(AAidx, t) = -1;
            /* copy over the coefficients */
            for (DENSITY_TYPE p = 0; p < ndensity; ++p)
                jl_coeffs(AAidx, p) = func->ctildes[ms_ind * ndensity + p];
            AAidx += 1;
        }
    }

    // flatten the AAspec array
    jl_AAspec_flat.resize(len_flat);
    int idx_spec = 0;
    for (int AAidx = 0; AAidx < jl_AAspec.get_dim(0); AAidx++)
        for (int p = 0; p < jl_orders(AAidx); p++, idx_spec++)
            jl_AAspec_flat(idx_spec) = jl_AAspec(AAidx, p);

}

void ACERecursiveEvaluator::test_acejlformat() {

    Array2D<int> AAspec = jl_AAspec;
    Array2D<int> Aspec = jl_Aspec;
    Array1D<int> AAorders = jl_orders;
    cout << "num2 = " << AAorders.get_dim(0) << "\n";
    int func_ms_ind = 0;
    int func_ms_t_ind = 0;// index for dB
    int j, jj, func_ind, ms_ind;

    SPECIES_TYPE mu_i = 0;
    const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i];
    ACECTildeBasisFunction *basis = basis_set->basis[mu_i];

    RANK_TYPE order, t;
    SPECIES_TYPE *mus;
    NS_TYPE *ns;
    LS_TYPE *ls;
    MS_TYPE *ms;

    /* ==== test by printing the basis spec ====*/
    // TODO: convert this into an automatic consistency test
    int iAA = 0;
    for (func_ind = 0; func_ind < total_basis_size; ++func_ind) {
        ACECTildeBasisFunction *func = &basis[func_ind];
        order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls;
        // func->print();
        //loop over {ms} combinations in sum
        for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) {
            ms = &func->ms_combs[ms_ind * order];


            cout << iAA << " : |";
            for (t = 0; t < order; t++)
                cout << mus[t] << ";" << ns[t] << "," << ls[t] << "," << ms[t] << "|";
            cout << "\n";

            cout << "      [";
            for (t = 0; t < AAorders(iAA); t++)
                cout << AAspec(iAA, int(t)) << ",";
            cout << "]\n";
            cout << "      |";
            for (t = 0; t < AAorders(iAA); t++)  {
                int iA = AAspec(iAA, t);
                // cout << iA << ",";
                cout << Aspec(iA, 0) << ";"
                     << Aspec(iA, 1) << ","
                     << Aspec(iA, 2) << ","
                     << Aspec(iA, 3) << "|";
            }
            cout << "\n";
            iAA += 1;
        }
    }
    /* ==== END TEST ==== */


}


/**
 * Make sure the per-neighbour caches can hold `max_jnum` neighbours.
 *
 * Nothing happens when the first dimension of R_cache is already at least
 * `max_jnum`; otherwise all neighbour caches are re-dimensioned to `max_jnum`
 * entries and refilled with zeros.
 *
 * @param max_jnum  required number of neighbour slots.
 * @throws std::invalid_argument if no basis set has been assigned.
 */
void ACERecursiveEvaluator::resize_neighbours_cache(int max_jnum) {
    if (nullptr == basis_set)
        throw std::invalid_argument("ACERecursiveEvaluator: basis set is not assigned");

    // large enough already
    if (!(R_cache.get_dim(0) < max_jnum))
        return;

    const auto n_radial = basis_set->nradmax;
    const auto n_radbase = basis_set->nradbase;
    const auto n_l = basis_set->lmax + 1;

    //TODO: implement grow
    // radial functions and their derivatives
    R_cache.resize(max_jnum, n_radial, n_l);
    R_cache.fill(0);

    DR_cache.resize(max_jnum, n_radial, n_l);
    DR_cache.fill(0);

    DG_cache.resize(max_jnum, n_radbase);
    DG_cache.fill(0);

    // spherical harmonics and their derivatives
    Y_cache.resize(max_jnum, n_l);
    Y_cache.fill({0, 0});

    DY_cache.resize(max_jnum, n_l);
    DY_cache.fill({0});

    //hard-core repulsion
    DCR_cache.init(max_jnum, "DCR_cache");
    DCR_cache.fill(0);
}


// double** r - atomic coordinates of atom I
// int* types - atomic types of atom I
// int **firstneigh -  ptr to 1st J int value of each I atom. Usage: jlist = firstneigh[i];
// Usage: j = jlist_of_i[jj];
// jnum - number of J neighbors for each I atom.  jnum = numneigh[i];

void
ACERecursiveEvaluator::compute_atom(int i, DOUBLE_TYPE **x, const SPECIES_TYPE *type, const int jnum, const int *jlist) {
    if(basis_set== nullptr) {
        throw std::invalid_argument("ACERecursiveEvaluator: basis set is not assigned");
    }
    per_atom_calc_timer.start();
#ifdef PRINT_MAIN_STEPS
    printf("\n ATOM: ind = %d r_norm=(%f, %f, %f)\n",i, x[i][0], x[i][1], x[i][2]);
#endif
    DOUBLE_TYPE evdwl = 0, evdwl_cut = 0, rho_core = 0;
    DOUBLE_TYPE r_norm;
    DOUBLE_TYPE xn, yn, zn, r_xyz;
    DOUBLE_TYPE R, GR, DGR, R_over_r, DR, DCR;
    DOUBLE_TYPE *r_hat;

    SPECIES_TYPE mu_j;
    RANK_TYPE r, rank, t;
    NS_TYPE n;
    LS_TYPE l;
    MS_TYPE m, m_t;

    SPECIES_TYPE *mus;
    NS_TYPE *ns;
    LS_TYPE *ls;
    MS_TYPE *ms;

    int j, jj, func_ind, ms_ind;
    SHORT_INT_TYPE factor;

    ACEComplex Y{0}, Y_DR{0.};
    ACEComplex B{0.};
    ACEComplex dB{0};
    ACEComplex A_cache[basis_set->rankmax];

    ACEComplex dA[basis_set->rankmax];
    int spec[basis_set->rankmax];

    dB_flatten.fill({0.});

    ACEDYcomponent grad_phi_nlm{0}, DY{0.};

    //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
    ACEComplex A_forward_prod[basis_set->rankmax + 1];
    ACEComplex A_backward_prod[basis_set->rankmax + 1];

    DOUBLE_TYPE inv_r_norm;
    DOUBLE_TYPE r_norms[jnum];
    DOUBLE_TYPE inv_r_norms[jnum];
    DOUBLE_TYPE rhats[jnum][3];//normalized vector
    SPECIES_TYPE elements[jnum];
    const DOUBLE_TYPE xtmp = x[i][0];
    const DOUBLE_TYPE ytmp = x[i][1];
    const DOUBLE_TYPE ztmp = x[i][2];
    DOUBLE_TYPE f_ji[3];

    bool is_element_mapping = element_type_mapping.get_size() > 0;
    SPECIES_TYPE mu_i;
    if (is_element_mapping)
        mu_i = element_type_mapping(type[i]);
    else
        mu_i = type[i];

    const SHORT_INT_TYPE total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu_i];
    const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i];

    ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[mu_i];
    ACECTildeBasisFunction *basis = basis_set->basis[mu_i];

    DOUBLE_TYPE rho_cut, drho_cut, fcut, dfcut;
    DOUBLE_TYPE dF_drho_core;

    //TODO: lmax -> lmaxi (get per-species type)
    const LS_TYPE lmaxi = basis_set->lmax;

    //TODO: nradmax -> nradiali (get per-species type)
    const NS_TYPE nradiali = basis_set->nradmax;

    //TODO: nradbase -> nradbasei (get per-species type)
    const NS_TYPE nradbasei = basis_set->nradbase;

    //TODO: get per-species type number of densities
    const DENSITY_TYPE ndensity= basis_set->ndensitymax;

    neighbours_forces.resize(jnum, 3);
    neighbours_forces.fill(0);

    //TODO: shift nullifications to place where arrays are used
    weights.fill({0});
    weights_rank1.fill(0);
    A.fill({0});
    A_rank1.fill(0);
    rhos.fill(0);
    dF_drho.fill(0);

#ifdef EXTRA_C_PROJECTIONS
    basis_projections_rank1.init(total_basis_size_rank1, ndensity, "c_projections_rank1");
    basis_projections.init(total_basis_size, ndensity, "c_projections");
#endif

    //proxy references to spherical harmonics and radial functions arrays
    const Array2DLM<ACEComplex> &ylm = basis_set->spherical_harmonics.ylm;
    const Array2DLM<ACEDYcomponent> &dylm = basis_set->spherical_harmonics.dylm;

    const Array2D<DOUBLE_TYPE> &fr = basis_set->radial_functions->fr;
    const Array2D<DOUBLE_TYPE> &dfr = basis_set->radial_functions->dfr;

    const Array1D<DOUBLE_TYPE> &gr = basis_set->radial_functions->gr;
    const Array1D<DOUBLE_TYPE> &dgr = basis_set->radial_functions->dgr;

    loop_over_neighbour_timer.start();

    int jj_actual = 0;
    SPECIES_TYPE type_j = 0;
    int neighbour_index_mapping[jnum]; // jj_actual -> jj
    //loop over neighbours, compute distance, consider only atoms within with r<cutoff(mu_i, mu_j)
    for (jj = 0; jj < jnum; ++jj) {

        j = jlist[jj];
        xn = x[j][0] - xtmp;
        yn = x[j][1] - ytmp;
        zn = x[j][2] - ztmp;
        type_j = type[j];
        if (is_element_mapping)
            mu_j = element_type_mapping(type_j);
        else
            mu_j = type_j;

        DOUBLE_TYPE current_cutoff = basis_set->radial_functions->cut(mu_i, mu_j);
        r_xyz = sqrt(xn * xn + yn * yn + zn * zn);

        if (r_xyz >= current_cutoff)
            continue;

        inv_r_norm = 1 / r_xyz;

        r_norms[jj_actual] = r_xyz;
        inv_r_norms[jj_actual] = inv_r_norm;
        rhats[jj_actual][0] = xn * inv_r_norm;
        rhats[jj_actual][1] = yn * inv_r_norm;
        rhats[jj_actual][2] = zn * inv_r_norm;
        elements[jj_actual] = mu_j;
        neighbour_index_mapping[jj_actual] = jj;
        jj_actual++;
    }

    int jnum_actual = jj_actual;

    //ALGORITHM 1: Atomic base A
    for (jj = 0; jj < jnum_actual; ++jj) {
        r_norm = r_norms[jj];
        mu_j = elements[jj];
        r_hat = rhats[jj];

        //proxies
        Array2DLM<ACEComplex> &Y_jj = Y_cache(jj);
        Array2DLM<ACEDYcomponent> &DY_jj = DY_cache(jj);


        basis_set->radial_functions->evaluate(r_norm, basis_set->nradbase, nradiali, mu_i, mu_j);
        basis_set->spherical_harmonics.compute_ylm(r_hat[0], r_hat[1], r_hat[2], lmaxi);
        //loop for computing A's
        //rank = 1
        for (n = 0; n < basis_set->nradbase; n++) {
            GR = gr(n);
#ifdef DEBUG_ENERGY_CALCULATIONS
            printf("-neigh atom %d\n", jj);
            printf("gr(n=%d)(r=%f) = %f\n", n, r_norm, gr(n));
            printf("dgr(n=%d)(r=%f) = %f\n", n, r_norm, dgr(n));
#endif
            DG_cache(jj, n) = dgr(n);
            A_rank1(mu_j, n) += GR * Y00;
        }
        //loop for computing A's
        // for rank > 1
        for (n = 0; n < nradiali; n++) {
            auto &A_lm = A(mu_j, n);
            for (l = 0; l <= lmaxi; l++) {
                R = fr(n, l);
#ifdef DEBUG_ENERGY_CALCULATIONS
                printf("R(nl=%d,%d)(r=%f)=%f\n", n + 1, l, r_norm, R);
#endif

                DR_cache(jj, n, l) = dfr(n, l);
                R_cache(jj, n, l) = R;

                for (m = 0; m <= l; m++) {
                    Y = ylm(l, m);
#ifdef DEBUG_ENERGY_CALCULATIONS
                    printf("Y(lm=%d,%d)=(%f, %f)\n", l, m, Y.real, Y.img);
#endif
                    A_lm(l, m) += R * Y; //accumulation sum over neighbours
                    Y_jj(l, m) = Y;
                    DY_jj(l, m) = dylm(l, m);
                }
            }
        }

        //hard-core repulsion
        rho_core += basis_set->radial_functions->cr;
        DCR_cache(jj) = basis_set->radial_functions->dcr;

    } //end loop over neighbours

    //complex conjugate A's (for NEGATIVE (-m) terms)
    // for rank > 1
    for (mu_j = 0; mu_j < basis_set->nelements; mu_j++) {
        for (n = 0; n < nradiali; n++) {
            auto &A_lm = A(mu_j, n);
            for (l = 0; l <= lmaxi; l++) {
                //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
                for (m = 1; m <= l; m++) {
                    factor = m % 2 == 0 ? 1 : -1;
                    A_lm(l, -m) = A_lm(l, m).conjugated() * factor;
                }
            }
        }
    }    //now A's are constructed
    loop_over_neighbour_timer.stop();

    // ==================== ENERGY ====================

    energy_calc_timer.start();
#ifdef EXTRA_C_PROJECTIONS
    basis_projections_rank1.fill(0);
    basis_projections.fill(0);
#endif

    //ALGORITHM 2: Basis functions B with iterative product and density rho(p) calculation
    //rank=1
    for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) {
        ACECTildeBasisFunction *func = &basis_rank1[func_rank1_ind];
//        ndensity = func->ndensity;
#ifdef PRINT_LOOPS_INDICES
        printf("Num density = %d r = 0\n",(int) ndensity );
        print_C_tilde_B_basis_function(*func);
#endif
        double A_cur = A_rank1(func->mus[0], func->ns[0] - 1);
#ifdef DEBUG_ENERGY_CALCULATIONS
        printf("A_r=1(x=%d, n=%d)=(%f)\n", func->mus[0], func->ns[0], A_cur);
        printf("     coeff[0] = %f\n", func->ctildes[0]);
#endif
        for (DENSITY_TYPE p = 0; p < ndensity; ++p) {
            //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
            rhos(p) += func->ctildes[p] * A_cur;
#ifdef EXTRA_C_PROJECTIONS
            //aggregate C-projections separately
            basis_projections_rank1(func_rank1_ind, p)+= func->ctildes[p] * A_cur;
#endif
        }
    } // end loop for rank=1

    // ================ START RECURSIVE EVALUATOR ====================
    // (rank > 1 only)

    /* STAGE 1:
     * 1-particle basis is already evaluated, so we only need to
     * copy it into the AA value buffer
     */
    int num1 = dag.get_num1();
    for (int idx = 0; idx < num1; idx++)
        dag.AAbuf(idx) = A( dag.Aspec(idx, 0),
                            dag.Aspec(idx, 1)-1,
                            dag.Aspec(idx, 2),
                            dag.Aspec(idx, 3) );


    if (recursive) {
        /* STAGE 2: FORWARD PASS
        * Forward pass: go through the dag and store all intermediate results
        */

        // rhos.fill(0); note the rhos are already reset and started filling above!
        ACEComplex AAcur{0.0};
        int i1, i2;

        int * dag_nodes = dag.nodes.get_data();
        int idx_nodes = 0;

        DOUBLE_TYPE * dag_coefs = dag.coeffs.get_data();
        int idx_coefs = 0;

        int num2_int = dag.get_num2_int();
        int num2_leaf = dag.get_num2_leaf();

        // interior nodes (save AA)
        for (int idx = num1; idx < num1+num2_int; idx++) {
            i1 = dag_nodes[idx_nodes]; idx_nodes++;
            i2 = dag_nodes[idx_nodes]; idx_nodes++;
            AAcur = dag.AAbuf(i1) * dag.AAbuf(i2);
            dag.AAbuf(idx) = AAcur;
            for (int p = 0; p < ndensity; p++, idx_coefs++)
                rhos(p) += AAcur.real_part_product(dag_coefs[idx_coefs]);
        }

        // leaf nodes -> no need to store in AAbuf
        DOUBLE_TYPE AAcur_re = 0.0;
        for (int _idx = 0; _idx < num2_leaf; _idx++) {
            i1 = dag_nodes[idx_nodes]; idx_nodes++;
            i2 = dag_nodes[idx_nodes]; idx_nodes++;
            AAcur_re = dag.AAbuf(i1).real_part_product(dag.AAbuf(i2));
            for (int p = 0; p < ndensity; p++, idx_coefs++)
                rhos(p) += AAcur_re * dag_coefs[idx_coefs];
        }

    } else {

        /* non-recursive Julia-style evaluator implementation */
        // TODO: fix array access to enable bounds checking again???
        ACEComplex AAcur{1.0};
        int *AAspec = jl_AAspec_flat.get_data();
        DOUBLE_TYPE *coeffs = jl_coeffs.get_data();
        int idx_spec = 0;
        int idx_coefs = 0;
        int order = 0;
        int max_order = jl_AAspec.get_dim(1);
        for (int iAA = 0; iAA < jl_AAspec.get_dim(0); iAA ++) {
            AAcur = 1.0;
            order = jl_orders(iAA);
            for (int r = 0; r < order; r++, idx_spec++)
                AAcur *= dag.AAbuf( AAspec[idx_spec] );
            for (int p = 0; p < ndensity; p++, idx_coefs++)
                rhos(p) += AAcur.real_part_product(coeffs[idx_coefs]);
        }
    }

    /* we now have rho and can evaluate lots of things.
       -------- this is back to the original PACE code --------- */

#ifdef DEBUG_FORCES_CALCULATIONS
    printf("rhos = ");
    for(DENSITY_TYPE p =0; p<ndensity; ++p) printf(" %.20f ",rhos(p));
    printf("\n");
#endif


    // energy cutoff
    rho_cut = basis_set->rho_core_cutoffs(mu_i);
    drho_cut = basis_set->drho_core_cutoffs(mu_i);

    basis_set->inner_cutoff(rho_core, rho_cut, drho_cut, fcut, dfcut);
    basis_set->FS_values_and_derivatives(rhos, evdwl, dF_drho, ndensity);

    dF_drho_core = evdwl * dfcut + 1;
    for (DENSITY_TYPE p = 0; p < ndensity; ++p)
        dF_drho(p) *= fcut;
    evdwl_cut = evdwl * fcut + rho_core;

    // E0 shift
    evdwl_cut += basis_set->E0vals(mu_i);

    /* I've moved this from below the weight calculation
          since I believe it only times the energy? the weights
          are only needed for the forces?
          But I believe we could add a third timer for computing just
          the weights; this will allow us to check better where the
          bottleneck is.
    */
    energy_calc_timer.stop();

    forces_calc_loop_timer.start();


#ifdef DEBUG_FORCES_CALCULATIONS
    printf("dFrhos = ");
    for(DENSITY_TYPE p =0; p<ndensity; ++p) printf(" %f ",dF_drho(p));
    printf("\n");
#endif

    //ALGORITHM 3: Weights and theta calculation
    // rank = 1
    for (int f_ind = 0; f_ind < total_basis_size_rank1; ++f_ind) {
        ACECTildeBasisFunction *func = &basis_rank1[f_ind];
//        ndensity = func->ndensity;
        for (DENSITY_TYPE p = 0; p < ndensity; ++p) {
            //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
            weights_rank1(func->mus[0], func->ns[0] - 1) += dF_drho(p) * func->ctildes[p];
        }
    }

    /* --------- we now continue with the recursive code --------- */

    if (recursive) {
        /* STAGE 2:  BACKWARD PASS */
        int i1, i2;
        ACEComplex AA1{0.0};
        ACEComplex AA2{0.0};
        ACEComplex wcur{0.0};
        int num2_int = dag.get_num2_int();
        int num2_leaf = dag.get_num2_leaf();
        /* to prepare for the backward we first need to zero the weights */
        dag.w.fill({0.0});

        int * dag_nodes = dag.nodes.get_data();
        int idx_nodes = 2 * (num2_int + num2_leaf) - 1;

        DOUBLE_TYPE * dag_coefs = dag.coeffs.get_data();
        int idx_coefs = ndensity * (num2_int + num2_leaf) - 1;

        for (int idx = num1+num2_int+num2_leaf - 1; idx >= num1; idx--) {
            i2 = dag_nodes[idx_nodes]; idx_nodes--;
            i1 = dag_nodes[idx_nodes]; idx_nodes--;
            AA1 = dag.AAbuf(i1);
            AA2 = dag.AAbuf(i2);
            wcur = dag.w(idx);   // [***]
            for (int p = ndensity-1; p >= 0; p--, idx_coefs--)
                wcur += dF_drho(p) * dag_coefs[idx_coefs];
            dag.w(i1) += wcur * AA2;   // TODO: replace with explicit muladd?
            dag.w(i2) += wcur * AA1;
        }

        /*  [***]
         * Note that these weights don't really need to be stored for the
         * leaf nodes. We tested splitting this for loop into two where
         * for the leaf nodes the weight would just be initialized to 0.0
         * instead of reading from an array. The improvement was barely
         * measurable, ca 3%, so we reverted to this simpler algorithm
         */


    } else {

        // non-recursive ACE.jl style implementation of gradients, but with
        // a backward differentiation approach to the prod-A
        // (cf. Algorithm 3 in the manuscript)

        dag.w.fill({0.0});
        ACEComplex AAf{1.0}, AAb{1.0}, theta{0.0};

        int *AAspec = jl_AAspec_flat.get_data();
        DOUBLE_TYPE *coeffs = jl_coeffs.get_data();
        int idx_spec = 0;
        int idx_coefs = 0;
        int order = 0;
        int max_order = jl_AAspec.get_dim(1);
        for (int iAA = 0; iAA < jl_AAspec.get_dim(0); iAA ++ ) {
            order = jl_orders(iAA);
            theta = 0.0;
            for (int p = 0; p < ndensity; p++, idx_coefs++)
                theta += dF_drho(p) * coeffs[idx_coefs];
            dA[0] = 1.0;
            AAf = 1.0;
            for (int t = 0; t < order-1; t++, idx_spec++) {
                spec[t] = AAspec[idx_spec];
                A_cache[t] = dag.AAbuf(spec[t]);
                AAf *= A_cache[t];
                dA[t+1] = AAf;
            }
            spec[order-1] = AAspec[idx_spec]; idx_spec++;
            A_cache[order-1] = dag.AAbuf(spec[order-1]);
            AAb = 1.0;
            for (int t = order-1; t >= 1; t--) {
                AAb *= A_cache[t];
                dA[t-1] *= AAb;
                dag.w(spec[t]) += theta * dA[t];
            }
            dag.w(spec[0]) += theta * dA[0];
        }

    }

    /* STAGE 3:
     * get the gradients from the 1-particle basis gradients and write them
     * into the dF/drho derivatives.
     */
    /* In order to reuse the original PACE code, we copy the weights back
     * into the PACE data structure. */

    for (int idx = 0; idx < num1; idx++) {
        int m = dag.Aspec(idx, 3);
        if (m >= 0) {
            weights(dag.Aspec(idx, 0),      // mu
                    dag.Aspec(idx, 1) - 1,  // n
                    dag.Aspec(idx, 2),      // l
                    m ) += dag.w(idx);
        } else {
            int factor = (m % 2 == 0 ? 1 : -1);
            weights(dag.Aspec(idx, 0),      // mu
                    dag.Aspec(idx, 1) - 1,  // n
                    dag.Aspec(idx, 2),      // l
                    -m ) += factor * dag.w(idx).conjugated();
        }
    }


    /* ------ From here we are now back to the original PACE code ---- */

// ==================== FORCES ====================
#ifdef PRINT_MAIN_STEPS
    printf("\nFORCE CALCULATION\n");
    printf("loop over neighbours\n");
#endif

// loop over neighbour atoms for force calculations
    for (jj = 0; jj < jnum_actual; ++jj) {
        mu_j = elements[jj];
        r_hat = rhats[jj];
        inv_r_norm = inv_r_norms[jj];

        Array2DLM<ACEComplex> &Y_cache_jj = Y_cache(jj);
        Array2DLM<ACEDYcomponent> &DY_cache_jj = DY_cache(jj);

#ifdef PRINT_LOOPS_INDICES
        printf("\nneighbour atom #%d\n", jj);
        printf("rhat = (%f, %f, %f)\n", r_hat[0], r_hat[1], r_hat[2]);
#endif

        forces_calc_neighbour_timer.start();

        f_ji[0] = f_ji[1] = f_ji[2] = 0;

//for rank = 1
        for (n = 0; n < nradbasei; ++n) {
            if (weights_rank1(mu_j, n) == 0)
                continue;
            auto &DG = DG_cache(jj, n);
            DGR = DG * Y00;
            DGR *= weights_rank1(mu_j, n);
#ifdef DEBUG_FORCES_CALCULATIONS
            printf("r=1: (n,l,m)=(%d, 0, 0)\n",n+1);
            printf("\tGR(n=%d, r=%f)=%f\n",n+1,r_norm, gr(n));
            printf("\tDGR(n=%d, r=%f)=%f\n",n+1,r_norm, dgr(n));
            printf("\tdF+=(%f, %f, %f)\n",DGR * r_hat[0], DGR * r_hat[1], DGR * r_hat[2]);
#endif
            f_ji[0] += DGR * r_hat[0];
            f_ji[1] += DGR * r_hat[1];
            f_ji[2] += DGR * r_hat[2];
        }

//for rank > 1
        for (n = 0; n < nradiali; n++) {
            for (l = 0; l <= lmaxi; l++) {
                R_over_r = R_cache(jj, n, l) * inv_r_norm;
                DR = DR_cache(jj, n, l);

                // for m>=0
                for (m = 0; m <= l; m++) {
                    ACEComplex w = weights(mu_j, n, l, m);
                    if (w == 0)
                        continue;
                    //counting for -m cases if m>0
                    // if (m > 0) w *= 2;  // not needed for recursive eval

                    DY = DY_cache_jj(l, m);
                    Y_DR = Y_cache_jj(l, m) * DR;

                    grad_phi_nlm.a[0] = Y_DR * r_hat[0] + DY.a[0] * R_over_r;
                    grad_phi_nlm.a[1] = Y_DR * r_hat[1] + DY.a[1] * R_over_r;
                    grad_phi_nlm.a[2] = Y_DR * r_hat[2] + DY.a[2] * R_over_r;
#ifdef DEBUG_FORCES_CALCULATIONS
                    printf("d_phi(n=%d, l=%d, m=%d) = ((%f,%f), (%f,%f), (%f,%f))\n",n+1,l,m,
                           grad_phi_nlm.a[0].real, grad_phi_nlm.a[0].img,
                           grad_phi_nlm.a[1].real, grad_phi_nlm.a[1].img,
                           grad_phi_nlm.a[2].real, grad_phi_nlm.a[2].img);

                    printf("weights(n,l,m)(%d,%d,%d) = (%f,%f)\n", n+1, l, m, w.real, w.img);
                    //if (m>0) w*=2;
                    printf("dF(n,l,m)(%d, %d, %d) += (%f, %f, %f)\n", n + 1, l, m,
                           w.real_part_product(grad_phi_nlm.a[0]),
                           w.real_part_product(grad_phi_nlm.a[1]),
                           w.real_part_product(grad_phi_nlm.a[2])
                    );
#endif
// real-part multiplication only
                    f_ji[0] += w.real_part_product(grad_phi_nlm.a[0]);
                    f_ji[1] += w.real_part_product(grad_phi_nlm.a[1]);
                    f_ji[2] += w.real_part_product(grad_phi_nlm.a[2]);
                }
            }
        }


#ifdef PRINT_INTERMEDIATE_VALUES
        printf("f_ji(jj=%d, i=%d)=(%f, %f, %f)\n", jj, i,
               f_ji[0], f_ji[1], f_ji[2]
        );
#endif

        //hard-core repulsion
        DCR = DCR_cache(jj);
#ifdef   DEBUG_FORCES_CALCULATIONS
        printf("DCR = %f\n",DCR);
#endif
        f_ji[0] += dF_drho_core * DCR * r_hat[0];
        f_ji[1] += dF_drho_core * DCR * r_hat[1];
        f_ji[2] += dF_drho_core * DCR * r_hat[2];
#ifdef PRINT_INTERMEDIATE_VALUES
        printf("with core-repulsion\n");
        printf("f_ji(jj=%d, i=%d)=(%f, %f, %f)\n", jj, i,
               f_ji[0], f_ji[1], f_ji[2]
        );
        printf("neighbour_index_mapping[jj=%d]=%d\n",jj,neighbour_index_mapping[jj]);
#endif

        neighbours_forces(neighbour_index_mapping[jj], 0) = f_ji[0];
        neighbours_forces(neighbour_index_mapping[jj], 1) = f_ji[1];
        neighbours_forces(neighbour_index_mapping[jj], 2) = f_ji[2];

        forces_calc_neighbour_timer.stop();
    }// end loop over neighbour atoms for forces

    forces_calc_loop_timer.stop();

    //now, energies and forces are ready
    //energies(i) = evdwl + rho_core;
    e_atom = evdwl_cut;

#ifdef PRINT_INTERMEDIATE_VALUES
    printf("energies(i) = FS(...rho_p_accum...) = %f\n", evdwl);
#endif
    per_atom_calc_timer.stop();
}