Merge branch 'master' into eam-gpu

2021-05-04 23:05:29 -05:00
parent dca9cd9c1c 2e01bb8f62
commit 7c3a7cb2e4
1203 changed files with 39864 additions and 13735 deletions
--- a/lib/gpu/Makefile.hip
+++ b/lib/gpu/Makefile.hip
@ -1,6 +1,6 @@
 # /* ----------------------------------------------------------------------
 #  Generic Linux Makefile for HIP
-#     - export HIP_PLATFORM=hcc (or nvcc) before execution
+#     - export HIP_PLATFORM=amd (or nvcc) before execution
 #     - change HIP_ARCH for your GPU
 # ------------------------------------------------------------------------- */

@ -42,6 +42,10 @@ ifeq (hcc,$(HIP_PLATFORM))
 	HIP_OPTS  += -ffast-math
 	# possible values: gfx803,gfx900,gfx906
 	HIP_ARCH = gfx906
+else ifeq (amd,$(HIP_PLATFORM))
+	HIP_OPTS  += -ffast-math
+	# possible values: gfx803,gfx900,gfx906
+	HIP_ARCH = gfx906
 else ifeq (nvcc,$(HIP_PLATFORM))
 	HIP_OPTS  += --use_fast_math
 	HIP_ARCH = -gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_32,code=[sm_32,compute_32] -gencode arch=compute_35,code=[sm_35,compute_35] \
--- a/lib/gpu/README
+++ b/lib/gpu/README
@ -212,8 +212,8 @@ additionally requires cub (https://nvlabs.github.io/cub). Download and
 extract the cub directory to lammps/lib/gpu/ or specify an appropriate
 path in lammps/lib/gpu/Makefile.hip.
 2. In Makefile.hip it is possible to specify the target platform via
-export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
-architecture (gfx803, gfx900, gfx906 etc.)
+export HIP_PLATFORM=amd (ROCm >= 4.1), HIP_PLATFORM=hcc (ROCm <= 4.0)
+or HIP_PLATFORM=nvcc as well as the target architecture (gfx803, gfx900, gfx906 etc.)
 3. If your MPI implementation does not support `mpicxx --showme` command,
 it is required to specify the corresponding MPI compiler and linker flags
 in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
@ -278,4 +278,3 @@ and
 Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
 Performance Computers - Three-Body Potentials. Computer Physics Communications.
 2013. 184: p. 2785–2793.
-
--- a/lib/gpu/lal_charmm.cpp
+++ b/lib/gpu/lal_charmm.cpp
@ -150,7 +150,7 @@ int CHARMMT::loop(const int eflag, const int vflag) {
                          &_cut_coul_innersq, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
-    this->k_pair.run(&this->atom->x, &ljd, &sp_lj,
+    this->k_pair.run(&this->atom->x, &lj1, &_lj_types, &sp_lj,
                     &this->nbor->dev_nbor, this->_nbor_data,
                     &this->ans->force, &this->ans->engv, &eflag,
                     &vflag, &ainum, &nbor_pitch, &this->atom->q,
--- a/lib/gpu/lal_charmm.cu
+++ b/lib/gpu/lal_charmm.cu
@ -29,7 +29,8 @@ _texture(q_tex, int2);
 #endif

 __kernel void k_charmm(const __global numtyp4 *restrict x_,
-                       const __global numtyp2 *restrict ljd,
+                       const __global numtyp4 *restrict lj1,
+                       const int lj_types,
                       const __global numtyp *restrict sp_lj,
                       const __global int *dev_nbor,
                       const __global int *dev_packed,
@ -88,20 +89,14 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
      numtyp delz = ix.z-jx.z;
      numtyp rsq = delx*delx+dely*dely+delz*delz;

+      int mtype=itype*lj_types+jtype;
      if (rsq<cut_bothsq) {
        numtyp r2inv=ucl_recip(rsq);
-        numtyp forcecoul, force_lj, force, switch1;
-        numtyp lj3, lj4;
+        numtyp forcecoul, force_lj, force, r6inv, switch1;

        if (rsq < cut_ljsq) {
-          numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
-          numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);
-
-          numtyp sig_r_6 = sig6*sig6*r2inv;
-          sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
-          lj4 = (numtyp)4.0*eps*sig_r_6;
-          lj3 = lj4*sig_r_6;
-          force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
+          r6inv = r2inv*r2inv*r2inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
          if (rsq > cut_lj_innersq) {
            switch1 = (cut_ljsq-rsq);
            numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
@ -109,7 +104,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
            switch1 *= switch1;
            switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
                       denom_lj;
-            switch2 *= lj3-lj4;
+            switch2 *= r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
            force_lj = force_lj*switch1+switch2;
          }
        } else
@ -137,7 +132,7 @@ __kernel void k_charmm(const __global numtyp4 *restrict x_,
        if (EVFLAG && eflag) {
          e_coul += forcecoul;
          if (rsq < cut_ljsq) {
-            numtyp e=lj3-lj4;
+            numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
            if (rsq > cut_lj_innersq)
              e *= switch1;
            energy+=factor_lj*e;
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -1061,7 +1061,7 @@ bool lmp_gpu_config(const std::string &category, const std::string &setting)
    return setting == "opencl";
 #elif defined(USE_HIP)
    return setting == "hip";
-#elif defined(USE_CUDA)
+#elif defined(USE_CUDA) || defined(USE_CUDART)
    return setting == "cuda";
 #endif
    return false;
--- a/lib/gpu/lal_lj_cubic.cu
+++ b/lib/gpu/lal_lj_cubic.cu
@ -26,10 +26,10 @@ _texture_2d( pos_tex,int4);

 // LJ quantities scaled by epsilon and rmin = sigma*2^1/6 (see src/pair_lj_cubic.h)

-#define _RT6TWO (numtyp)1.1224621
-#define _PHIS (numtyp)-0.7869823  /* energy at s */
-#define _DPHIDS (numtyp)2.6899009 /* gradient at s */
-#define _A3 (numtyp)27.93357 /* cubic coefficient */
+#define _RT6TWO (numtyp)1.1224620483093730  /* 2^1/6 */
+#define _PHIS (numtyp)-0.7869822485207097   /* energy at s */
+#define _DPHIDS (numtyp)2.6899008972047196  /* gradient at s */
+#define _A3 (numtyp)27.9335700460986445     /* cubic coefficient */

 __kernel void k_lj_cubic(const __global numtyp4 *restrict x_,
                         const __global numtyp4 *restrict lj1,
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -740,6 +740,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,

  // If binning on GPU, do this now
  if (_gpu_nbor==1) {
+    mn = _max_nbors;
    const numtyp i_cell_size=static_cast<numtyp>(1.0/_cell_size);
    const int neigh_block=_block_cell_id;
    const int GX=(int)ceil((float)nall/neigh_block);
--- a/lib/gpu/lal_pre_cuda_hip.h
+++ b/lib/gpu/lal_pre_cuda_hip.h
@ -30,7 +30,7 @@
 // -------------------------------------------------------------------------


-#ifdef __HIP_PLATFORM_HCC__
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
 #define CONFIG_ID 303
 #define SIMD_SIZE 64
 #else
@ -161,7 +161,7 @@
 //                         KERNEL MACROS - TEXTURES
 // -------------------------------------------------------------------------

-#ifdef __HIP_PLATFORM_HCC__
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)
 #define _texture(name, type)  __device__ type* name
 #define _texture_2d(name, type)  __device__ type* name
 #else
@ -201,7 +201,7 @@
  #define mu_tex mu_
 #endif

-#ifdef __HIP_PLATFORM_HCC__
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)

 #undef fetch4
 #undef fetch
@ -266,7 +266,7 @@ typedef struct _double4 double4;
 #endif
 #endif

-#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
+#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__)

  #ifdef _SINGLE_SINGLE
    #define shfl_down __shfl_down
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@ -108,10 +108,7 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
  _nparams = nparams;
  _nelements = nelements;

-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
  for (int ii=1; ii<ntypes; ii++) {
    const int i=host_map[ii];
    for (int jj=1; jj<ntypes; jj++) {
@ -120,12 +117,10 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
        const int k=host_map[kk];
        if (i<0 || j<0 || k<0) continue;
        const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
      }
    }
  }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);

  // --------------------------------------------------------------------
  UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
@ -235,7 +230,6 @@ void TersoffT::clear() {
  ts3.clear();
  ts4.clear();
  ts5.clear();
-  cutsq_pair.clear();
  map.clear();
  elem2param.clear();
  _zetaij.clear();
@ -286,7 +280,7 @@ int TersoffT::loop(const int eflag, const int vflag, const int evatom,
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
  this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                         &this->nbor->dev_nbor, &this->nbor->dev_packed,
                         &ainum, &nbor_pitch, &this->_threads_per_atom);

--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@ -226,17 +226,13 @@ _texture_2d( pos_tex,int4);
 #endif

 __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
+                                   const numtyp cutsq, const int ntypes,
+                                   __global int * dev_nbor,
                                   const __global int * dev_packed,
                                   const int inum, const int nbor_pitch,
                                   const int t_per_atom_in) {
  const int ii=GLOBAL_ID_X;

-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
  if (ii<inum) {
    const int i=dev_packed[ii];
    int nbor=ii+nbor_pitch;
@ -245,9 +241,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
    const int nbor_end=nbor+fast_mul(numj,nbor_pitch);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
    int newj=0;

    __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@ -258,11 +251,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
--- a/lib/gpu/lal_tersoff.h
+++ b/lib/gpu/lal_tersoff.h
@ -73,7 +73,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
  /// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb
  UCL_D_Vec<numtyp4> ts5;

-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;

  UCL_D_Vec<int> elem2param;
  UCL_D_Vec<int> map;
--- a/lib/gpu/lal_tersoff_extra.h
+++ b/lib/gpu/lal_tersoff_extra.h
@ -142,7 +142,10 @@ ucl_inline numtyp ters_fa_d(const numtyp r,
                            numtyp *ans_d)
 {
  #ifndef ONETYPE
-  if (r > param_bigr + param_bigd) return (numtyp)0.0;
+  if (r > param_bigr + param_bigd) {
+    *ans_d = (numtyp)0.0;
+    return (numtyp)0.0;
+  }
  #endif
  numtyp dfc;
  const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc);
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@ -105,10 +105,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
  _nparams = nparams;
  _nelements = nelements;

-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
  for (int ii=1; ii<ntypes; ii++) {
    const int i=host_map[ii];
    for (int jj=1; jj<ntypes; jj++) {
@ -117,12 +114,10 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
        const int k=host_map[kk];
        if (i<0 || j<0 || k<0) continue;
        const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
      }
    }
  }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);

  UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
                           UCL_WRITE_ONLY);
@ -229,7 +224,6 @@ void TersoffMT::clear() {
  ts3.clear();
  ts4.clear();
  ts5.clear();
-  cutsq_pair.clear();
  map.clear();
  elem2param.clear();
  _zetaij.clear();
@ -275,7 +269,7 @@ int TersoffMT::loop(const int eflag, const int vflag, const int evatom,
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
  this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                         &this->nbor->dev_nbor, &this->nbor->dev_packed,
                         &ainum, &nbor_pitch, &this->_threads_per_atom);

--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@ -220,17 +220,13 @@ _texture_2d( pos_tex,int4);
 #endif

 __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
+                                       const numtyp cutsq, const int ntypes,
+                                       __global int * dev_nbor,
+                                       const __global int * dev_packed,
+                                       const int inum, const int nbor_pitch,
+                                       const int t_per_atom) {
  const int ii=GLOBAL_ID_X;

-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
  if (ii<inum) {
    const int i=dev_packed[ii];
    int nbor=ii+nbor_pitch;
@ -239,9 +235,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
    const int nbor_end=nbor+fast_mul(numj,nbor_pitch);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
    int newj=0;

    __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@ -252,11 +245,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
--- a/lib/gpu/lal_tersoff_mod.h
+++ b/lib/gpu/lal_tersoff_mod.h
@ -76,7 +76,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
  /// ts5.x = c5, ts5.y = h
  UCL_D_Vec<numtyp4> ts5;

-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;

  UCL_D_Vec<int> elem2param;
  UCL_D_Vec<int> map;
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@ -112,10 +112,7 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
  _nparams = nparams;
  _nelements = nelements;

-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
  for (int ii=1; ii<ntypes; ii++) {
    const int i=host_map[ii];
    for (int jj=1; jj<ntypes; jj++) {
@ -124,12 +121,10 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
        const int k=host_map[kk];
        if (i<0 || j<0 || k<0) continue;
        const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
      }
    }
  }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);

  UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
                           UCL_WRITE_ONLY);
@ -253,7 +248,6 @@ void TersoffZT::clear() {
  ts4.clear();
  ts5.clear();
  ts6.clear();
-  cutsq_pair.clear();
  map.clear();
  elem2param.clear();
  _zetaij.clear();
@ -299,7 +293,7 @@ int TersoffZT::loop(const int eflag, const int vflag, const int evatom,
  int BX=this->block_pair();
  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
  this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                         &this->nbor->dev_nbor, &this->nbor->dev_packed,
                         &ainum, &nbor_pitch, &this->_threads_per_atom);

--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@ -238,17 +238,13 @@ _texture( ts6_tex,int4);
 #endif

 __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
+                                       const numtyp cutsq, const int ntypes,
+                                       __global int * dev_nbor,
+                                       const __global int * dev_packed,
+                                       const int inum, const int nbor_pitch,
+                                       const int t_per_atom) {
  const int ii=GLOBAL_ID_X;

-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
  if (ii<inum) {
    const int i=dev_packed[ii];
    int nbor=ii+nbor_pitch;
@ -257,9 +253,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
    const int nbor_end=nbor+fast_mul(numj,nbor_pitch);

    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
    int newj=0;

    __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@ -270,11 +263,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
      int j = sj & NEIGHMASK;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];

-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
--- a/lib/gpu/lal_tersoff_zbl.h
+++ b/lib/gpu/lal_tersoff_zbl.h
@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
  /// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale
  UCL_D_Vec<numtyp4> ts6;

-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;

  UCL_D_Vec<int> elem2param;
  UCL_D_Vec<int> map;