/*
 * Performant implementation of atomic cluster expansion and interface to LAMMPS
 *
 * Copyright 2021 (c) Yury Lysogorskiy^1, Cas van der Oord^2, Anton Bochkarev^1,
 * Sarath Menon^1, Matteo Rinaldi^1, Thomas Hammerschmidt^1, Matous Mrovec^1,
 * Aidan Thompson^3, Gabor Csanyi^2, Christoph Ortner^4, Ralf Drautz^1
 *
 * ^1: Ruhr-University Bochum, Bochum, Germany
 * ^2: University of Cambridge, Cambridge, United Kingdom
 * ^3: Sandia National Laboratories, Albuquerque, New Mexico, USA
 * ^4: University of British Columbia, Vancouver, BC, Canada
 *
 *
 * See the LICENSE file.
 * This FILENAME is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

// Created by Christoph Ortner on 20.12.2020

#include "ace_recursive.h"

#include "ace_abstract_basis.h"
#include "ace_types.h"

/* ------------------------------------------------------------
 *    ACEDAG Implementation
 *    (just the DAG construction, not the traversal)
 * ------------------------------------------------------------ */

/* Notes on different Tags:
 *   rec1 - first basic implementation
 *   rec2 - avoid index arithmetic, contiguous layout,
 *          canonical evaluator in ACE.jl format
 *   rec3 - split nodes into interior and leaf nodes
 */

void ACEDAG::init(Array2D<int> xAspec,
                  Array2D<int> AAspec,
                  Array1D<int> orders,
                  Array2D<DOUBLE_TYPE> jl_coeffs,
                  int _heuristic) {

    // remember which heuristic we want to use!
    heuristic = _heuristic;

    /* stage one of the graph is just extracting the A basis from
     * the tensor product format into the linear list; all information
     * for that is already stored in Aspec, and the only thing to do here is
     * to construct zero-coefficients. Still we have to copy Aspec, since
     * the one we have here will (may?) be deleted. */
    int num1 = xAspec.get_dim(0);
    Aspec = xAspec;   //YL: just copying the multiarray: Aspec = xAspec;

    /* fill the one-particle basis into the DAGmap
     * DAGmap[ (i1,...,in) ] = iAA index where the (i1,...,in) basis function
     * lives. */
    TDAGMAP DAGmap;
    for (int iA = 0; iA < num1; iA++) {
        vector<int> a(1);
        a[0] = iA;
        DAGmap[a] = iA;
    }

    /* For stage 2 we now want to construct the actual recursion; the recursion
       info will be stored in DAGspec, while the coefficients go into DAGcoeffs.
       These arrays are initialized to length `num2`, but they will have to grow
       as we add additional artificial nodes into the graph. Initially we treat
       all nodes as having children, but then in a second stage below we
       reorganize.
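       As a purely illustrative example (not taken from any particular
       potential): a 3-correlation AA = A_{i1} A_{i2} A_{i3} would be stored as
       two binary nodes,
           n1 = (i1, i2)   -- artificial node, zero coefficients
           n2 = (n1, i3)   -- carries the actual ctilde coefficients,
       so each node costs one complex multiplication, and n1 can be reused by
       every other basis function that contains the pair (i1, i2).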
     */
    int num2 = AAspec.get_dim(0);
    int ndensity = jl_coeffs.get_dim(1);
    nodes_pre.resize(2*num2, 2);
    coeffs_pre.resize(2*num2, ndensity);

    /* the first basis function we construct will get index num1,
     * since there are already num1 one-particle basis functions
     * to collect during stage 1 */
    dag_idx = num1;

    /* main loop over AA basis set to transform into DAG */
    for (int iAA = 0; iAA < num2; iAA++) {
        // create a vector representing the current basis function
        int ord = orders(iAA);
        vector<int> aa(ord);
        for (int t = 0; t < ord; t++) aa[t] = AAspec(iAA, t);
        vector<DOUBLE_TYPE> c(ndensity);
        for (int p = 0; p < ndensity; p++) c[p] = jl_coeffs(iAA, p);
        insert_node(DAGmap, aa, c);
    }

    /* convert to 3-stage format through reordering:
     *   interior nodes first, then leaf nodes */

    // allocate storage
    num_nodes = dag_idx;   // store total size of dag
    // num_nodes - num1 = number of many-body nodes.
    nodes.resize(num_nodes - num1, 2);
    coeffs.resize(num_nodes - num1, ndensity);

    // find out which nodes have children
    haschild.resize(num_nodes - num1);
    haschild.fill(false);
    for (int iAA = 0; iAA < num_nodes - num1; iAA++) {
        if (nodes_pre(iAA, 0) >= num1)
            haschild(nodes_pre(iAA, 0)-num1) = true;
        if (nodes_pre(iAA, 1) >= num1)
            haschild(nodes_pre(iAA, 1)-num1) = true;
    }

    // to reorder the graph we need a fresh map from preordered indices to
    // postordered indices; for the 1-particle basis the order remains the same.
    // TODO: doesn't need to be a map, could be a vector.
    map<int, int> neworder;
    for (int iA = 0; iA < num1; iA++) neworder[iA] = iA;

    // insert all interior nodes
    num2_int = 0;
    num2_leaf = 0;
    dag_idx = num1;
    int i1, i2, i1pre, i2pre;
    for (int iAA = 0; iAA < num_nodes - num1; iAA++) {
        if (haschild(iAA)) {
            num2_int += 1;
            // indices into AAbuf before reordering
            i1pre = nodes_pre(iAA, 0);
            i2pre = nodes_pre(iAA, 1);
            // indices into AAbuf after reordering
            i1 = neworder[i1pre];
            i2 = neworder[i2pre];
            // insert the current node : iAA is old order, dag_idx is new order
            neworder[num1+iAA] = dag_idx;
            nodes(dag_idx-num1, 0) = i1;
            nodes(dag_idx-num1, 1) = i2;
            for (int t = 0; t < ndensity; t++)
                coeffs(dag_idx-num1, t) = coeffs_pre(iAA, t);
            dag_idx++;
        }
    }

    // insert all leaf nodes
    for (int iAA = 0; iAA < num_nodes - num1; iAA++) {
        if (!haschild(iAA)) {
            num2_leaf += 1;
            // indices into AAbuf before reordering
            i1pre = nodes_pre(iAA, 0);
            i2pre = nodes_pre(iAA, 1);
            // insert the current node : no need to remember the new order now
            nodes(dag_idx-num1, 0) = neworder[i1pre];
            nodes(dag_idx-num1, 1) = neworder[i2pre];
            for (int t = 0; t < ndensity; t++)
                coeffs(dag_idx-num1, t) = coeffs_pre(iAA, t);
            dag_idx++;
        }
    }

#ifdef DEBUG
    cout << "num2_int = " << num2_int << "; num2_leaf = " << num2_leaf << "\n";
#endif

    // free up memory that is no longer needed
    nodes_pre.resize(0, 0);
    coeffs_pre.resize(0, 0);
    haschild.resize(0);

    /* finalize dag: allocate buffer storage */
    AAbuf.resize(num1 + num2_int);
    w.resize(num_nodes);
    // TODO: technically only need num1 + num2_int for w, this can save one
    //       memory access later, probably not worth the crazy code duplication.
}

void ACEDAG::insert_node(TDAGMAP &DAGmap, vector<int> a, vector<DOUBLE_TYPE> c) {
    /* start with a list of all possible partitions into 2 groups
     * and check whether any of these nodes are already in the dag */
    auto partitions = find_2partitions(a);
    int ndensity = c.size();
    int num1 = get_num1();

    // TODO: first try to find partitions into nodes that are already parents;
    //       that way we will get more leaf nodes!
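    /* Illustrative example (hypothetical indices): for a = (1, 2, 7) the call
     * find_2partitions(a) returns pairs such as
     *     ((1,), (2,7)),  ((2,), (1,7)),  ((7,), (1,2)),  ((1,2), (7,)), ...
     * If, say, (1,2) is already a DAG node, a pair like ((1,2), (7,)) can be
     * used directly in the loop below; otherwise an auxiliary parent node is
     * first inserted recursively, following one of the heuristics further down. */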
    for (TPARTITION const& p : partitions) {
        /* this is the good case; the parent nodes are both already in the
         * graph; add the new node and return. This is also the only place in
         * the code where an actual insert happens. */
        if (DAGmap.count(p.first) && DAGmap.count(p.second)) {
            if (nodes_pre.get_dim(0) < dag_idx + 1) {   //check if array is sufficiently large
                int newsize = (dag_idx * 3) / 2;
                nodes_pre.resize(newsize, 2);           // grow arrays if necessary
                coeffs_pre.resize(newsize, ndensity);
            }
            int i1 = DAGmap[p.first];
            int i2 = DAGmap[p.second];
            nodes_pre(dag_idx - num1, 0) = i1;
            nodes_pre(dag_idx - num1, 1) = i2;
            DAGmap[a] = dag_idx;
            for (int p = 0; p < ndensity; p++)
                coeffs_pre(dag_idx - num1, p) = c[p];
            dag_idx += 1;
            return;
        }
    }

    /* if we are here, then this means the new node cannot yet be inserted.
     * We first need to insert some intermediate auxiliary nodes. For this
     * we use a simple heuristic:
     *    try to find a partition where one of the two nodes is already
     *    in the graph; if there are several, then we remember the longest
     *    (this is a very greedy heuristic!!)
     * .... (continue below) ....
     */
    TPARTITION longest;
    int longest_length = 0;
    for (auto const& p : partitions) {
        int len = 0;
        if (DAGmap.count(p.first)) { len = p.first.size(); }
        else if (DAGmap.count(p.second)) { len = p.second.size(); }
        if ((len > 0) && (len > longest_length)) {
            longest_length = len;
            longest = p;
        }
    }

    /* sanity check */
    if (longest_length == 0) {
        std::stringstream error_message;
        error_message << "WARNING : something has gone horribly wrong! `longest_length == 0`! \n";
        error_message << "a = [";
        for (int t = 0; t < a.size(); t++) error_message << a[t] << ", ";
        error_message << "]\n";
        throw std::logic_error(error_message.str());
        // return;
    }

    /* If there is a partition with one component already in the graph,
     * then we only need to add in the other component. Note that there will
     * always be at least one such partition, namely all those containing
     * a one-element node, e.g. (1,2,3,4) -> (1,) (2,3,4); then (1,) is
     * a one-particle basis function and hence always in the graph.
     * If heuristic == 0, then we just take one of those partitions and move on.
     *
     * We also accept the found partition if longest_length > 1.
     * And we also accept it if we have a 2- or 3-correlation.
     */
    if ( (heuristic == 0) || (longest_length > 1) || (a.size() <= 3)) {
        /* insert the other node that isn't in the DAG yet;
         * this is an artificial node so it gets zero-coefficients.
         * This step is recursive, so more than one node might be inserted here */
        vector<DOUBLE_TYPE> cz(ndensity);
        for (int i = 0; i < ndensity; i++) cz[i] = 0.0;
        TPARTITION p = longest;
        if (DAGmap.count(p.first))
            insert_node(DAGmap, p.second, cz);
        else
            insert_node(DAGmap, p.first, cz);
    }
    /* Second heuristic : heuristic == 1
     *   Focus on inserting artificial 2-correlations */
    else if (heuristic == 1) {
        // and we also know that longest_length == 1 and nu = a.size >= 4.
        int nu = a.size();
        // generate an artificial partition
        vector<int> a1(2);
        for (int i = 0; i < 2; i++) a1[i] = a[i];
        vector<int> a2(nu - 2);
        for (int i = 0; i < nu - 2; i++) a2[i] = a[2 + i];
        vector<DOUBLE_TYPE> cz(ndensity);
        for (int i = 0; i < cz.size(); i++) cz[i] = 0.0;
        // and insert both (we know neither are in the DAG yet)
        insert_node(DAGmap, a1, cz);
        insert_node(DAGmap, a2, cz);
    } else {
        cout << "WARNING : something has gone horribly wrong! \n";
        // TODO: Throw an error here?!?
        return;
    }

    /* now we should be ready to insert the entire tuple `a` since there is now
     * an eligible parent pair.
     * Here we recompute the partition of `a`, but
     * that's a small price to pay for a clearer code. Maybe this can be
     * optimized a bit by wrapping it all into a while loop or having a second
     * version of `insert_node` ... */
    insert_node(DAGmap, a, c);
}

TPARTITIONS ACEDAG::find_2partitions(vector<int> v) {
    int N = v.size();
    int zo;
    TPARTITIONS partitions;
    TPARTITION part;
    /* This is a fun little hack to extract all subsets of the indices 1:N;
     * the number i will have binary representation with each digit indicating
     * whether or not that index belongs to the selected subset */
    for (int i = 1; i < (1 << N) - 1; i++) {
        /* sizes of the two components encoded by the bits of i */
        int N1 = 0, N2 = 0;
        int p = 1;
        for (int n = 0; n < N; n++) {
            zo = ((i / p) % 2);
            p *= 2;
            N1 += zo;
            N2 += 1 - zo;
        }
        /* create the two vectors representing the partition */
        vector<int> v1(N1);
        vector<int> v2(N2);
        int i1 = 0, i2 = 0;
        p = 1;
        for (int n = 0; n < N; n++) {
            zo = ((i / p) % 2);
            p *= 2;
            if (zo == 1) {
                v1[i1] = v[n];
                i1 += 1;
            } else {
                v2[i2] = v[n];
                i2 += 1;
            }
        }
        part = make_pair(v1, v2);
        partitions.push_back(part);
    }
    return partitions;
}

void ACEDAG::print() {
    cout << "DAG Specification: \n" ;
    cout << " n1 : " << get_num1() << "\n";
    cout << " n2 : " << get_num2() << "\n";
    cout << " num_nodes : " << num_nodes << "\n";

    cout << "--------------------\n";
    cout << "A-spec: \n";
    for (int iA = 0; iA < get_num1(); iA++) {
        cout << iA << " : "
             << Aspec(iA, 0) << Aspec(iA, 1) << Aspec(iA, 2) << Aspec(iA, 3) << "\n";
    }
    cout << "-----------\n";
    cout << "AA-tree\n";
    for (int iAA = 0; iAA < get_num2(); iAA++) {
        cout << iAA + get_num1() << " : "
             << nodes(iAA, 0) << ", " << nodes(iAA, 1) << "\n";
    }
}

/* ------------------------------------------------------------
 *    ACERecursiveEvaluator
 * ------------------------------------------------------------ */

void ACERecursiveEvaluator::set_basis(ACECTildeBasisSet &bas, int heuristic) {
    basis_set = &bas;
    init(basis_set, heuristic);
}

void ACERecursiveEvaluator::init(ACECTildeBasisSet *basis_set, int heuristic) {

    ACEEvaluator::init(basis_set);

    weights.init(basis_set->nelements, basis_set->nradmax + 1, basis_set->lmax + 1,
                 "weights");
    weights_rank1.init(basis_set->nelements, basis_set->nradbase, "weights_rank1");

    DG_cache.init(1, basis_set->nradbase, "DG_cache");
    DG_cache.fill(0);

    R_cache.init(1, basis_set->nradmax, basis_set->lmax + 1, "R_cache");
    R_cache.fill(0);

    DR_cache.init(1, basis_set->nradmax, basis_set->lmax + 1, "DR_cache");
    DR_cache.fill(0);

    Y_cache.init(1, basis_set->lmax + 1, "Y_cache");
    Y_cache.fill({0, 0});

    DY_cache.init(1, basis_set->lmax + 1, "dY_dense_cache");
    DY_cache.fill({0.});

    //hard-core repulsion
    DCR_cache.init(1, "DCR_cache");
    DCR_cache.fill(0);
    dB_flatten.init(basis_set->max_dB_array_size, "dB_flatten");

    /* convert to ACE.jl format to prepare for construction of DAG.
     * This will fill the arrays jl_Aspec, jl_AAspec, jl_orders */
    acejlformat();
    // test_acejlformat();

    // now pass this info into the DAG
    dag.init(jl_Aspec, jl_AAspec, jl_orders, jl_coeffs, heuristic);

    // finally empty the temporary arrays to clear up the memory...
    // TODO
}

void ACERecursiveEvaluator::acejlformat() {

    int func_ms_ind = 0;
    int func_ms_t_ind = 0;   // index for dB
    int j, jj, func_ind, ms_ind;
    SPECIES_TYPE mu_i = 0;   //TODO: multispecies

    const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i];
    ACECTildeBasisFunction *basis = basis_set->basis[mu_i];

    int AAidx = 0;

    RANK_TYPE order, t;
    SPECIES_TYPE *mus;
    NS_TYPE *ns;
    LS_TYPE *ls;
    MS_TYPE *ms;

    /* transform basis into new format: [A1 ...
A_num1] [(i1,i2)(i1,i2)(...)(i1,i2,i3)(...)] where each ia represents an A_{ia} */ /* compute max values for mu, n, l, m */ SPECIES_TYPE maxmu = 0; //TODO: multispecies NS_TYPE maxn = basis_set->nradmax; LS_TYPE maxl = basis_set->lmax; RANK_TYPE maxorder = basis_set->rankmax; const DENSITY_TYPE ndensity = basis_set->ndensitymax; int num1 = 0; /* create a 4D lookup table for the 1-p basis * TODO: replace with a map?? */ Array4D A_lookup(int(maxmu+1), int(maxn), int(maxl+1), int(2*maxl+1)); for (int mu = 0; mu < maxmu+1; mu++) for (int n = 0; n < maxn; n++) for (int l = 0; l < maxl+1; l++) for (int m = 0; m < 2*maxl+1; m++) A_lookup(mu, n, l, m) = -1; int A_idx = 0; // linear index of A basis function (1-particle) for (func_ind = 0; func_ind < total_basis_size; ++func_ind) { ACECTildeBasisFunction *func = &basis[func_ind]; // func->print(); order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls; for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) { ms = &func->ms_combs[ms_ind * order]; for (t = 0; t < order; t++) { int iA = A_lookup(mus[t], ns[t]-1, ls[t], ms[t]+ls[t]); if (iA == -1) { A_lookup(mus[t], ns[t] - 1, ls[t], ms[t] + ls[t]) = A_idx; A_idx += 1; } } } } /* create the reverse list: linear indixes to mu,l,m,n this keeps only the basis functions we really need */ num1 = A_idx; Array2D & Aspec = jl_Aspec; Aspec.resize(num1, 4); // Array2D Aspec(num1, 4); for (int mu = 0; mu <= maxmu; mu++) for (int n = 1; n <= maxn; n++) for (int l = 0; l <= maxl; l++) for (int m = -l; m <= l; m++) { int iA = A_lookup(mu, n-1, l, l+m); if (iA != -1) { Aspec(iA, 0) = mu; Aspec(iA, 1) = n; Aspec(iA, 2) = l; Aspec(iA, 3) = m; } } /* ============ HALF-BASIS TRICK START ============ */ for (func_ind = 0; func_ind < total_basis_size; ++func_ind) { ACECTildeBasisFunction *func = &basis[func_ind]; order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls; if (!( (mus[0] <= maxmu) && (ns[0] <= maxn) && (ls[0] <= maxl) )) continue; for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) { ms = &func->ms_combs[ms_ind * order]; // find first positive and negative index int pos_idx = order + 1; int neg_idx = order + 1; for (t = order-1; t >= 0; t--) if (ms[t] > 0) pos_idx = t; else if (ms[t] < 0) neg_idx = t; // if neg_idx < pos_idx then this means that ms is non-zero // and that the first non-zero index is negative, hence this is // a negative-sign tuple which we want to combine into // its opposite. if (neg_idx < pos_idx) { // find the opposite tuple int ms_ind2 = 0; MS_TYPE *ms2; bool found_opposite = false; for (ms_ind2 = 0; ms_ind2 < func->num_ms_combs; ++ms_ind2) { ms2 = &func->ms_combs[ms_ind2 * order]; bool isopposite = true; for (t = 0; t < order; t++) if (ms[t] != -ms2[t]) { isopposite = false; break; } if (isopposite) { found_opposite = true; break; } } if (ms_ind == ms_ind2) { cout << "WARNING - ms_ind == ms_ind2 \n"; } // now we need to overwrite the coefficients if (found_opposite) { int sig = 1; for (t = 0; t < order; t++) if (ms[t] < 0) sig *= -1; for (int p = 0; p < ndensity; ++p) { func->ctildes[ms_ind2 * ndensity + p] += func->ctildes[ms_ind * ndensity + p]; func->ctildes[ms_ind * ndensity + p] = 0.0; } } } } } // /* ============ HALF-BASIS TRICK END ============ */ /* count number of basis functions, keep only non-zero!! 
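       (the half-basis trick above has added each negative-m-leading ms-tuple's
        coefficients onto its opposite tuple and zeroed the original, so the
        zero-coefficient combinations can simply be skipped here)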
*/ int num2 = 0; for (func_ind = 0; func_ind < total_basis_size; ++func_ind) { ACECTildeBasisFunction *func = &basis[func_ind]; for (ms_ind = 0; ms_ind < (&basis[func_ind])->num_ms_combs; ++ms_ind, ++func_ms_ind) { // check that the coefficients are actually non-zero bool isnonzero = false; for (DENSITY_TYPE p = 0; p < ndensity; ++p) if (func->ctildes[ms_ind * ndensity + p] != 0.0) isnonzero = true; if (isnonzero) num2++; } } /* Now create the AA basis links into the A basis */ num1 = A_idx; // total number of A-basis functions that we keep // Array1D AAorders(num2); Array1D & AAorders = jl_orders; AAorders.resize(num2); // Array2D AAspec(num2, maxorder); // specs of AA basis functions Array2D & AAspec = jl_AAspec; AAspec.resize(num2, maxorder); jl_coeffs.resize(num2, ndensity); AAidx = 0; // linear index into AA basis function int len_flat = 0; for (func_ind = 0; func_ind < total_basis_size; ++func_ind) { ACECTildeBasisFunction *func = &basis[func_ind]; order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls; if (!((mus[0] <= maxmu) && (ns[0] <= maxn) && (ls[0] <= maxl))) //fool-proofing of functions continue; for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) { ms = &func->ms_combs[ms_ind * order]; // check that the coefficients are actually non-zero bool iszero = true; for (DENSITY_TYPE p = 0; p < ndensity; ++p) if (func->ctildes[ms_ind * ndensity + p] != 0.0) iszero = false; if (iszero) continue; AAorders(AAidx) = order; for (t = 0; t < order; t++) { int Ait = A_lookup(int(mus[t]), int(ns[t]-1), int(ls[t]), int(ms[t])+int(ls[t])); AAspec(AAidx, t) = Ait; len_flat += 1; } for (t = order; t < maxorder; t++) AAspec(AAidx, t) = -1; /* copy over the coefficients */ for (DENSITY_TYPE p = 0; p < ndensity; ++p) jl_coeffs(AAidx, p) = func->ctildes[ms_ind * ndensity + p]; AAidx += 1; } } // flatten the AAspec array jl_AAspec_flat.resize(len_flat); int idx_spec = 0; for (int AAidx = 0; AAidx < jl_AAspec.get_dim(0); AAidx++) for (int p = 0; p < jl_orders(AAidx); p++, idx_spec++) jl_AAspec_flat(idx_spec) = jl_AAspec(AAidx, p); } void ACERecursiveEvaluator::test_acejlformat() { Array2D AAspec = jl_AAspec; Array2D Aspec = jl_Aspec; Array1D AAorders = jl_orders; cout << "num2 = " << AAorders.get_dim(0) << "\n"; int func_ms_ind = 0; int func_ms_t_ind = 0;// index for dB int j, jj, func_ind, ms_ind; SPECIES_TYPE mu_i = 0; const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i]; ACECTildeBasisFunction *basis = basis_set->basis[mu_i]; RANK_TYPE order, t; SPECIES_TYPE *mus; NS_TYPE *ns; LS_TYPE *ls; MS_TYPE *ms; /* ==== test by printing the basis spec ====*/ // TODO: convert this into an automatic consistency test int iAA = 0; for (func_ind = 0; func_ind < total_basis_size; ++func_ind) { ACECTildeBasisFunction *func = &basis[func_ind]; order = func->rank; mus = func->mus; ns = func->ns; ls = func->ls; // func->print(); //loop over {ms} combinations in sum for (ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind, ++func_ms_ind) { ms = &func->ms_combs[ms_ind * order]; cout << iAA << " : |"; for (t = 0; t < order; t++) cout << mus[t] << ";" << ns[t] << "," << ls[t] << "," << ms[t] << "|"; cout << "\n"; cout << " ["; for (t = 0; t < AAorders(iAA); t++) cout << AAspec(iAA, int(t)) << ","; cout << "]\n"; cout << " |"; for (t = 0; t < AAorders(iAA); t++) { int iA = AAspec(iAA, t); // cout << iA << ","; cout << Aspec(iA, 0) << ";" << Aspec(iA, 1) << "," << Aspec(iA, 2) << "," << Aspec(iA, 3) << "|"; } cout << "\n"; iAA += 1; } } /* ==== END TEST ==== */ } void 
ACERecursiveEvaluator::resize_neighbours_cache(int max_jnum) { if(basis_set== nullptr) { throw std::invalid_argument("ACERecursiveEvaluator: basis set is not assigned"); } if (R_cache.get_dim(0) < max_jnum) { //TODO: implement grow R_cache.resize(max_jnum, basis_set->nradmax, basis_set->lmax + 1); R_cache.fill(0); DR_cache.resize(max_jnum, basis_set->nradmax, basis_set->lmax + 1); DR_cache.fill(0); DG_cache.resize(max_jnum, basis_set->nradbase); DG_cache.fill(0); Y_cache.resize(max_jnum, basis_set->lmax + 1); Y_cache.fill({0, 0}); DY_cache.resize(max_jnum, basis_set->lmax + 1); DY_cache.fill({0}); //hard-core repulsion DCR_cache.init(max_jnum, "DCR_cache"); DCR_cache.fill(0); } } // double** r - atomic coordinates of atom I // int* types - atomic types if atom I // int **firstneigh - ptr to 1st J int value of each I atom. Usage: jlist = firstneigh[i]; // Usage: j = jlist_of_i[jj]; // jnum - number of J neighbors for each I atom. jnum = numneigh[i]; void ACERecursiveEvaluator::compute_atom(int i, DOUBLE_TYPE **x, const SPECIES_TYPE *type, const int jnum, const int *jlist) { if(basis_set== nullptr) { throw std::invalid_argument("ACERecursiveEvaluator: basis set is not assigned"); } per_atom_calc_timer.start(); #ifdef PRINT_MAIN_STEPS printf("\n ATOM: ind = %d r_norm=(%f, %f, %f)\n",i, x[i][0], x[i][1], x[i][2]); #endif DOUBLE_TYPE evdwl = 0, evdwl_cut = 0, rho_core = 0; DOUBLE_TYPE r_norm; DOUBLE_TYPE xn, yn, zn, r_xyz; DOUBLE_TYPE R, GR, DGR, R_over_r, DR, DCR; DOUBLE_TYPE *r_hat; SPECIES_TYPE mu_j; RANK_TYPE r, rank, t; NS_TYPE n; LS_TYPE l; MS_TYPE m, m_t; SPECIES_TYPE *mus; NS_TYPE *ns; LS_TYPE *ls; MS_TYPE *ms; int j, jj, func_ind, ms_ind; SHORT_INT_TYPE factor; ACEComplex Y{0}, Y_DR{0.}; ACEComplex B{0.}; ACEComplex dB{0}; ACEComplex A_cache[basis_set->rankmax]; ACEComplex dA[basis_set->rankmax]; int spec[basis_set->rankmax]; dB_flatten.fill({0.}); ACEDYcomponent grad_phi_nlm{0}, DY{0.}; //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme ACEComplex A_forward_prod[basis_set->rankmax + 1]; ACEComplex A_backward_prod[basis_set->rankmax + 1]; DOUBLE_TYPE inv_r_norm; DOUBLE_TYPE r_norms[jnum]; DOUBLE_TYPE inv_r_norms[jnum]; DOUBLE_TYPE rhats[jnum][3];//normalized vector SPECIES_TYPE elements[jnum]; const DOUBLE_TYPE xtmp = x[i][0]; const DOUBLE_TYPE ytmp = x[i][1]; const DOUBLE_TYPE ztmp = x[i][2]; DOUBLE_TYPE f_ji[3]; bool is_element_mapping = element_type_mapping.get_size() > 0; SPECIES_TYPE mu_i; if (is_element_mapping) mu_i = element_type_mapping(type[i]); else mu_i = type[i]; const SHORT_INT_TYPE total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu_i]; const SHORT_INT_TYPE total_basis_size = basis_set->total_basis_size[mu_i]; ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[mu_i]; ACECTildeBasisFunction *basis = basis_set->basis[mu_i]; DOUBLE_TYPE rho_cut, drho_cut, fcut, dfcut; DOUBLE_TYPE dF_drho_core; //TODO: lmax -> lmaxi (get per-species type) const LS_TYPE lmaxi = basis_set->lmax; //TODO: nradmax -> nradiali (get per-species type) const NS_TYPE nradiali = basis_set->nradmax; //TODO: nradbase -> nradbasei (get per-species type) const NS_TYPE nradbasei = basis_set->nradbase; //TODO: get per-species type number of densities const DENSITY_TYPE ndensity= basis_set->ndensitymax; neighbours_forces.resize(jnum, 3); neighbours_forces.fill(0); //TODO: shift nullifications to place where arrays are used weights.fill({0}); weights_rank1.fill(0); A.fill({0}); A_rank1.fill(0); rhos.fill(0); dF_drho.fill(0); #ifdef EXTRA_C_PROJECTIONS 
    basis_projections_rank1.init(total_basis_size_rank1, ndensity, "c_projections_rank1");
    basis_projections.init(total_basis_size, ndensity, "c_projections");
#endif

    //proxy references to spherical harmonics and radial functions arrays
    const Array2DLM<ACEComplex> &ylm = basis_set->spherical_harmonics.ylm;
    const Array2DLM<ACEDYcomponent> &dylm = basis_set->spherical_harmonics.dylm;

    const Array2D<DOUBLE_TYPE> &fr = basis_set->radial_functions->fr;
    const Array2D<DOUBLE_TYPE> &dfr = basis_set->radial_functions->dfr;

    const Array1D<DOUBLE_TYPE> &gr = basis_set->radial_functions->gr;
    const Array1D<DOUBLE_TYPE> &dgr = basis_set->radial_functions->dgr;

    loop_over_neighbour_timer.start();

    int jj_actual = 0;
    SPECIES_TYPE type_j = 0;
    int neighbour_index_mapping[jnum]; // jj_actual -> jj
    //loop over neighbours, compute distance, consider only atoms within with r < cutoff(mu_i, mu_j)
    for (jj = 0; jj < jnum; ++jj) {

        j = jlist[jj];
        xn = x[j][0] - xtmp;
        yn = x[j][1] - ytmp;
        zn = x[j][2] - ztmp;
        type_j = type[j];
        if (is_element_mapping)
            mu_j = element_type_mapping(type_j);
        else
            mu_j = type_j;

        DOUBLE_TYPE current_cutoff = basis_set->radial_functions->cut(mu_i, mu_j);
        r_xyz = sqrt(xn * xn + yn * yn + zn * zn);

        if (r_xyz >= current_cutoff)
            continue;

        inv_r_norm = 1 / r_xyz;

        r_norms[jj_actual] = r_xyz;
        inv_r_norms[jj_actual] = inv_r_norm;
        rhats[jj_actual][0] = xn * inv_r_norm;
        rhats[jj_actual][1] = yn * inv_r_norm;
        rhats[jj_actual][2] = zn * inv_r_norm;
        elements[jj_actual] = mu_j;
        neighbour_index_mapping[jj_actual] = jj;
        jj_actual++;
    }
    int jnum_actual = jj_actual;

    //ALGORITHM 1: Atomic base A
    for (jj = 0; jj < jnum_actual; ++jj) {
        r_norm = r_norms[jj];
        mu_j = elements[jj];
        r_hat = rhats[jj];

        //proxies
        Array2DLM<ACEComplex> &Y_jj = Y_cache(jj);
        Array2DLM<ACEDYcomponent> &DY_jj = DY_cache(jj);

        basis_set->radial_functions->evaluate(r_norm, basis_set->nradbase, nradiali, mu_i, mu_j);
        basis_set->spherical_harmonics.compute_ylm(r_hat[0], r_hat[1], r_hat[2], lmaxi);
        //loop for computing A's
        //rank = 1
        for (n = 0; n < basis_set->nradbase; n++) {
            GR = gr(n);
#ifdef DEBUG_ENERGY_CALCULATIONS
            printf("-neigh atom %d\n", jj);
            printf("gr(n=%d)(r=%f) = %f\n", n, r_norm, gr(n));
            printf("dgr(n=%d)(r=%f) = %f\n", n, r_norm, dgr(n));
#endif
            DG_cache(jj, n) = dgr(n);
            A_rank1(mu_j, n) += GR * Y00;
        }
        //loop for computing A's
        // for rank > 1
        for (n = 0; n < nradiali; n++) {
            auto &A_lm = A(mu_j, n);
            for (l = 0; l <= lmaxi; l++) {
                R = fr(n, l);
#ifdef DEBUG_ENERGY_CALCULATIONS
                printf("R(nl=%d,%d)(r=%f)=%f\n", n + 1, l, r_norm, R);
#endif
                DR_cache(jj, n, l) = dfr(n, l);
                R_cache(jj, n, l) = R;

                for (m = 0; m <= l; m++) {
                    Y = ylm(l, m);
#ifdef DEBUG_ENERGY_CALCULATIONS
                    printf("Y(lm=%d,%d)=(%f, %f)\n", l, m, Y.real, Y.img);
#endif
                    A_lm(l, m) += R * Y; //accumulation sum over neighbours
                    Y_jj(l, m) = Y;
                    DY_jj(l, m) = dylm(l, m);
                }
            }
        }

        //hard-core repulsion
        rho_core += basis_set->radial_functions->cr;
        DCR_cache(jj) = basis_set->radial_functions->dcr;

    } //end loop over neighbours

    //complex conjugate A's (for NEGATIVE (-m) terms)
    // for rank > 1
    for (mu_j = 0; mu_j < basis_set->nelements; mu_j++) {
        for (n = 0; n < nradiali; n++) {
            auto &A_lm = A(mu_j, n);
            for (l = 0; l <= lmaxi; l++) {
                //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
                for (m = 1; m <= l; m++) {
                    factor = m % 2 == 0 ?
1 : -1; A_lm(l, -m) = A_lm(l, m).conjugated() * factor; } } } } //now A's are constructed loop_over_neighbour_timer.stop(); // ==================== ENERGY ==================== energy_calc_timer.start(); #ifdef EXTRA_C_PROJECTIONS basis_projections_rank1.fill(0); basis_projections.fill(0); #endif //ALGORITHM 2: Basis functions B with iterative product and density rho(p) calculation //rank=1 for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) { ACECTildeBasisFunction *func = &basis_rank1[func_rank1_ind]; // ndensity = func->ndensity; #ifdef PRINT_LOOPS_INDICES printf("Num density = %d r = 0\n",(int) ndensity ); print_C_tilde_B_basis_function(*func); #endif double A_cur = A_rank1(func->mus[0], func->ns[0] - 1); #ifdef DEBUG_ENERGY_CALCULATIONS printf("A_r=1(x=%d, n=%d)=(%f)\n", func->mus[0], func->ns[0], A_cur); printf(" coeff[0] = %f\n", func->ctildes[0]); #endif for (DENSITY_TYPE p = 0; p < ndensity; ++p) { //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 rhos(p) += func->ctildes[p] * A_cur; #ifdef EXTRA_C_PROJECTIONS //aggregate C-projections separately basis_projections_rank1(func_rank1_ind, p)+= func->ctildes[p] * A_cur; #endif } } // end loop for rank=1 // ================ START RECURSIVE EVALUATOR ==================== // (rank > 1 only) /* STAGE 1: * 1-particle basis is already evaluated, so we only need to * copy it into the AA value buffer */ int num1 = dag.get_num1(); for (int idx = 0; idx < num1; idx++) dag.AAbuf(idx) = A( dag.Aspec(idx, 0), dag.Aspec(idx, 1)-1, dag.Aspec(idx, 2), dag.Aspec(idx, 3) ); if (recursive) { /* STAGE 2: FORWARD PASS * Forward pass: go through the dag and store all intermediate results */ // rhos.fill(0); note the rhos are already reset and started filling above! ACEComplex AAcur{0.0}; int i1, i2; int * dag_nodes = dag.nodes.get_data(); int idx_nodes = 0; DOUBLE_TYPE * dag_coefs = dag.coeffs.get_data(); int idx_coefs = 0; int num2_int = dag.get_num2_int(); int num2_leaf = dag.get_num2_leaf(); // interior nodes (save AA) for (int idx = num1; idx < num1+num2_int; idx++) { i1 = dag_nodes[idx_nodes]; idx_nodes++; i2 = dag_nodes[idx_nodes]; idx_nodes++; AAcur = dag.AAbuf(i1) * dag.AAbuf(i2); dag.AAbuf(idx) = AAcur; for (int p = 0; p < ndensity; p++, idx_coefs++) rhos(p) += AAcur.real_part_product(dag_coefs[idx_coefs]); } // leaf nodes -> no need to store in AAbuf DOUBLE_TYPE AAcur_re = 0.0; for (int _idx = 0; _idx < num2_leaf; _idx++) { i1 = dag_nodes[idx_nodes]; idx_nodes++; i2 = dag_nodes[idx_nodes]; idx_nodes++; AAcur_re = dag.AAbuf(i1).real_part_product(dag.AAbuf(i2)); for (int p = 0; p < ndensity; p++, idx_coefs++) rhos(p) += AAcur_re * dag_coefs[idx_coefs]; } } else { /* non-recursive Julia-style evaluator implementation */ // TODO: fix array access to enable bounds checking again??? ACEComplex AAcur{1.0}; int *AAspec = jl_AAspec_flat.get_data(); DOUBLE_TYPE *coeffs = jl_coeffs.get_data(); int idx_spec = 0; int idx_coefs = 0; int order = 0; int max_order = jl_AAspec.get_dim(1); for (int iAA = 0; iAA < jl_AAspec.get_dim(0); iAA ++) { AAcur = 1.0; order = jl_orders(iAA); for (int r = 0; r < order; r++, idx_spec++) AAcur *= dag.AAbuf( AAspec[idx_spec] ); for (int p = 0; p < ndensity; p++, idx_coefs++) rhos(p) += AAcur.real_part_product(coeffs[idx_coefs]); } } /* we now have rho and can evaluate lots of things. 
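       In outline (notation only): with the densities rho_p accumulated above,
       the embedding F gives E_i = F(rho_1, ..., rho_P) and dF_drho(p) = dF/drho_p;
       the core repulsion rho_core enters through the inner cutoff fcut, so that
       below evdwl_cut = F * fcut + rho_core (plus the E0 shift).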
       -------- this is back to the original PACE code --------- */

#ifdef DEBUG_FORCES_CALCULATIONS
    printf("rhos = ");
    for (DENSITY_TYPE p = 0; p < ndensity; ++p) printf(" %f ", rhos(p));
    printf("\n");
#endif

    rho_cut = basis_set->rho_core_cutoffs(mu_i);
    drho_cut = basis_set->drho_core_cutoffs(mu_i);

    basis_set->inner_cutoff(rho_core, rho_cut, drho_cut, fcut, dfcut);
    basis_set->FS_values_and_derivatives(rhos, evdwl, dF_drho, ndensity);

    dF_drho_core = evdwl * dfcut + 1;
    for (DENSITY_TYPE p = 0; p < ndensity; ++p)
        dF_drho(p) *= fcut;
    evdwl_cut = evdwl * fcut + rho_core;

    // E0 shift
    evdwl_cut += basis_set->E0vals(mu_i);

    /* I've moved this from below the weight calculation since I believe it only
       times the energy? the weights are only needed for the forces? But I
       believe we could add a third timer for computing just the weights; this
       will allow us to check better where the bottleneck is. */
    energy_calc_timer.stop();

    forces_calc_loop_timer.start();

#ifdef DEBUG_FORCES_CALCULATIONS
    printf("dFrhos = ");
    for (DENSITY_TYPE p = 0; p < ndensity; ++p) printf(" %f ", dF_drho(p));
    printf("\n");
#endif

    // weights for rank = 1
    for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) {
        ACECTildeBasisFunction *func = &basis_rank1[func_rank1_ind];
        // ndensity = func->ndensity;
        for (DENSITY_TYPE p = 0; p < ndensity; ++p) {
            //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
            weights_rank1(func->mus[0], func->ns[0] - 1) += dF_drho(p) * func->ctildes[p];
        }
    }

    /* --------- we now continue with the recursive code --------- */

    if (recursive) {
        /* STAGE 2: BACKWARD PASS */

        int i1, i2;
        ACEComplex AA1{0.0};
        ACEComplex AA2{0.0};
        ACEComplex wcur{0.0};
        int num2_int = dag.get_num2_int();
        int num2_leaf = dag.get_num2_leaf();

        /* to prepare for the backward we first need to zero the weights */
        dag.w.fill({0.0});

        int * dag_nodes = dag.nodes.get_data();
        int idx_nodes = 2 * (num2_int + num2_leaf) - 1;
        DOUBLE_TYPE * dag_coefs = dag.coeffs.get_data();
        int idx_coefs = ndensity * (num2_int + num2_leaf) - 1;

        for (int idx = num1+num2_int+num2_leaf - 1; idx >= num1; idx--) {
            i2 = dag_nodes[idx_nodes]; idx_nodes--;
            i1 = dag_nodes[idx_nodes]; idx_nodes--;
            AA1 = dag.AAbuf(i1);
            AA2 = dag.AAbuf(i2);
            wcur = dag.w(idx);   // [***]
            for (int p = ndensity-1; p >= 0; p--, idx_coefs--)
                wcur += dF_drho(p) * dag_coefs[idx_coefs];
            dag.w(i1) += wcur * AA2;   // TODO: replace with explicit muladd?
            dag.w(i2) += wcur * AA1;
        }

        /* [***]
         * Note that these weights don't really need to be stored for the
         * leaf nodes. We tested splitting this for loop into two where
         * for the leaf nodes the weight would just be initialized to 0.0
         * instead of reading from an array. The improvement was barely
         * measurable, ca 3%, so we reverted to this simpler algorithm */

    } else {
        // non-recursive ACE.jl style implementation of gradients, but with
        // a backward differentiation approach to the prod-A
        // (cf.
Algorithm 3 in the manuscript) dag.w.fill({0.0}); ACEComplex AAf{1.0}, AAb{1.0}, theta{0.0}; int *AAspec = jl_AAspec_flat.get_data(); DOUBLE_TYPE *coeffs = jl_coeffs.get_data(); int idx_spec = 0; int idx_coefs = 0; int order = 0; int max_order = jl_AAspec.get_dim(1); for (int iAA = 0; iAA < jl_AAspec.get_dim(0); iAA ++ ) { order = jl_orders(iAA); theta = 0.0; for (int p = 0; p < ndensity; p++, idx_coefs++) theta += dF_drho(p) * coeffs[idx_coefs]; dA[0] = 1.0; AAf = 1.0; for (int t = 0; t < order-1; t++, idx_spec++) { spec[t] = AAspec[idx_spec]; A_cache[t] = dag.AAbuf(spec[t]); AAf *= A_cache[t]; dA[t+1] = AAf; } spec[order-1] = AAspec[idx_spec]; idx_spec++; A_cache[order-1] = dag.AAbuf(spec[order-1]); AAb = 1.0; for (int t = order-1; t >= 1; t--) { AAb *= A_cache[t]; dA[t-1] *= AAb; dag.w(spec[t]) += theta * dA[t]; } dag.w(spec[0]) += theta * dA[0]; } } /* STAGE 3: * get the gradients from the 1-particle basis gradients and write them * into the dF/drho derivatives. */ /* In order to reuse the original PACE code, we copy the weights back * into the the PACE datastructure. */ for (int idx = 0; idx < num1; idx++) { int m = dag.Aspec(idx, 3); if (m >= 0) { weights(dag.Aspec(idx, 0), // mu dag.Aspec(idx, 1) - 1, // n dag.Aspec(idx, 2), // l m ) += dag.w(idx); } else { int factor = (m % 2 == 0 ? 1 : -1); weights(dag.Aspec(idx, 0), // mu dag.Aspec(idx, 1) - 1, // n dag.Aspec(idx, 2), // l -m ) += factor * dag.w(idx).conjugated(); } } /* ------ From here we are now back to the original PACE code ---- */ // ==================== FORCES ==================== #ifdef PRINT_MAIN_STEPS printf("\nFORCE CALCULATION\n"); printf("loop over neighbours\n"); #endif // loop over neighbour atoms for force calculations for (jj = 0; jj < jnum_actual; ++jj) { mu_j = elements[jj]; r_hat = rhats[jj]; inv_r_norm = inv_r_norms[jj]; Array2DLM &Y_cache_jj = Y_cache(jj); Array2DLM &DY_cache_jj = DY_cache(jj); #ifdef PRINT_LOOPS_INDICES printf("\nneighbour atom #%d\n", jj); printf("rhat = (%f, %f, %f)\n", r_hat[0], r_hat[1], r_hat[2]); #endif forces_calc_neighbour_timer.start(); f_ji[0] = f_ji[1] = f_ji[2] = 0; //for rank = 1 for (n = 0; n < nradbasei; ++n) { if (weights_rank1(mu_j, n) == 0) continue; auto &DG = DG_cache(jj, n); DGR = DG * Y00; DGR *= weights_rank1(mu_j, n); #ifdef DEBUG_FORCES_CALCULATIONS printf("r=1: (n,l,m)=(%d, 0, 0)\n",n+1); printf("\tGR(n=%d, r=%f)=%f\n",n+1,r_norm, gr(n)); printf("\tDGR(n=%d, r=%f)=%f\n",n+1,r_norm, dgr(n)); printf("\tdF+=(%f, %f, %f)\n",DGR * r_hat[0], DGR * r_hat[1], DGR * r_hat[2]); #endif f_ji[0] += DGR * r_hat[0]; f_ji[1] += DGR * r_hat[1]; f_ji[2] += DGR * r_hat[2]; } //for rank > 1 for (n = 0; n < nradiali; n++) { for (l = 0; l <= lmaxi; l++) { R_over_r = R_cache(jj, n, l) * inv_r_norm; DR = DR_cache(jj, n, l); // for m>=0 for (m = 0; m <= l; m++) { ACEComplex w = weights(mu_j, n, l, m); if (w == 0) continue; //counting for -m cases if m>0 // if (m > 0) w *= 2; // not needed for recursive eval DY = DY_cache_jj(l, m); Y_DR = Y_cache_jj(l, m) * DR; grad_phi_nlm.a[0] = Y_DR * r_hat[0] + DY.a[0] * R_over_r; grad_phi_nlm.a[1] = Y_DR * r_hat[1] + DY.a[1] * R_over_r; grad_phi_nlm.a[2] = Y_DR * r_hat[2] + DY.a[2] * R_over_r; #ifdef DEBUG_FORCES_CALCULATIONS printf("d_phi(n=%d, l=%d, m=%d) = ((%f,%f), (%f,%f), (%f,%f))\n",n+1,l,m, grad_phi_nlm.a[0].real, grad_phi_nlm.a[0].img, grad_phi_nlm.a[1].real, grad_phi_nlm.a[1].img, grad_phi_nlm.a[2].real, grad_phi_nlm.a[2].img); printf("weights(n,l,m)(%d,%d,%d) = (%f,%f)\n", n+1, l, m, w.real, w.img); //if (m>0) w*=2; printf("dF(n,l,m)(%d, %d, 
%d) += (%f, %f, %f)\n", n + 1, l, m, w.real_part_product(grad_phi_nlm.a[0]), w.real_part_product(grad_phi_nlm.a[1]), w.real_part_product(grad_phi_nlm.a[2]) ); #endif // real-part multiplication only f_ji[0] += w.real_part_product(grad_phi_nlm.a[0]); f_ji[1] += w.real_part_product(grad_phi_nlm.a[1]); f_ji[2] += w.real_part_product(grad_phi_nlm.a[2]); } } } #ifdef PRINT_INTERMEDIATE_VALUES printf("f_ji(jj=%d, i=%d)=(%f, %f, %f)\n", jj, i, f_ji[0], f_ji[1], f_ji[2] ); #endif //hard-core repulsion DCR = DCR_cache(jj); #ifdef DEBUG_FORCES_CALCULATIONS printf("DCR = %f\n",DCR); #endif f_ji[0] += dF_drho_core * DCR * r_hat[0]; f_ji[1] += dF_drho_core * DCR * r_hat[1]; f_ji[2] += dF_drho_core * DCR * r_hat[2]; #ifdef PRINT_INTERMEDIATE_VALUES printf("with core-repulsion\n"); printf("f_ji(jj=%d, i=%d)=(%f, %f, %f)\n", jj, i, f_ji[0], f_ji[1], f_ji[2] ); printf("neighbour_index_mapping[jj=%d]=%d\n",jj,neighbour_index_mapping[jj]); #endif neighbours_forces(neighbour_index_mapping[jj], 0) = f_ji[0]; neighbours_forces(neighbour_index_mapping[jj], 1) = f_ji[1]; neighbours_forces(neighbour_index_mapping[jj], 2) = f_ji[2]; forces_calc_neighbour_timer.stop(); }// end loop over neighbour atoms for forces forces_calc_loop_timer.stop(); //now, energies and forces are ready //energies(i) = evdwl + rho_core; e_atom = evdwl_cut; #ifdef PRINT_INTERMEDIATE_VALUES printf("energies(i) = FS(...rho_p_accum...) = %f\n", evdwl); #endif per_atom_calc_timer.stop(); }
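
/* ------------------------------------------------------------
 * Illustrative usage sketch (comment only, not part of the library; the
 * construction of the ACECTildeBasisSet `bas` and of the neighbour lists is
 * assumed to happen elsewhere):
 *
 *     ACERecursiveEvaluator ace;
 *     ace.set_basis(bas, 1);                   // converts to ACE.jl format and builds the DAG
 *     ace.resize_neighbours_cache(max_jnum);   // size per-neighbour caches
 *     ace.compute_atom(i, x, type, jnum, jlist);
 *     // per-atom energy is then in e_atom, per-neighbour force contributions
 *     // in neighbours_forces(jj, 0..2)
 * ------------------------------------------------------------ */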