From 3d5897c9263ec2e4ecf39970409e705b2536a6cc Mon Sep 17 00:00:00 2001
From: Michael Brown <michael.w.brown@intel.com>
Date: Sun, 18 Apr 2021 01:40:33 -0700
Subject: [PATCH] Fixes issue from Feb 2021 GPU package update for tersoff
 styles using multiple types.

---
 lib/gpu/lal_tersoff.cpp     | 12 +++---------
 lib/gpu/lal_tersoff.cu      | 16 ++--------------
 lib/gpu/lal_tersoff.h       |  2 +-
 lib/gpu/lal_tersoff_extra.h |  5 ++++-
 lib/gpu/lal_tersoff_mod.cpp | 12 +++---------
 lib/gpu/lal_tersoff_mod.cu  | 22 +++++-----------------
 lib/gpu/lal_tersoff_mod.h   |  2 +-
 lib/gpu/lal_tersoff_zbl.cpp | 12 +++---------
 lib/gpu/lal_tersoff_zbl.cu  | 22 +++++-----------------
 lib/gpu/lal_tersoff_zbl.h   |  2 +-
 10 files changed, 28 insertions(+), 79 deletions(-)
diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp
index e0e87d9148..ac5e5bc600 100644
--- a/lib/gpu/lal_tersoff.cpp
+++ b/lib/gpu/lal_tersoff.cpp
@@ -108,10 +108,7 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
   _nparams = nparams;
   _nelements = nelements;
 
-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
   for (int ii=1; ii<ntypes; ii++) {
     const int i=host_map[ii];
     for (int jj=1; jj<ntypes; jj++) {
@@ -120,12 +117,10 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
         const int k=host_map[kk];
         if (i<0 || j<0 || k<0) continue;
         const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
       }
     }
   }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
 
   // --------------------------------------------------------------------
   UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
@@ -235,7 +230,6 @@ void TersoffT::clear() {
   ts3.clear();
   ts4.clear();
   ts5.clear();
-  cutsq_pair.clear();
   map.clear();
   elem2param.clear();
   _zetaij.clear();
@@ -286,7 +280,7 @@ int TersoffT::loop(const int eflag, const int vflag, const int evatom,
   int BX=this->block_pair();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                          &this->nbor->dev_nbor, &this->nbor->dev_packed,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
 
diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu
index 03ce68be77..8baa5ce12a 100644
--- a/lib/gpu/lal_tersoff.cu
+++ b/lib/gpu/lal_tersoff.cu
@@ -226,17 +226,13 @@ _texture_2d( pos_tex,int4);
 #endif
 
 __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
+                                   const numtyp cutsq, const int ntypes,
+                                   __global int * dev_nbor,
                                    const __global int * dev_packed,
                                    const int inum, const int nbor_pitch,
                                    const int t_per_atom_in) {
   const int ii=GLOBAL_ID_X;
 
-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
   if (ii<inum) {
     const int i=dev_packed[ii];
     int nbor=ii+nbor_pitch;
@@ -245,9 +241,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
     const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
     int newj=0;
 
     __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@@ -258,11 +251,6 @@ __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_,
       int j = sj & NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h
index 8f99569162..f38932d049 100644
--- a/lib/gpu/lal_tersoff.h
+++ b/lib/gpu/lal_tersoff.h
@@ -73,7 +73,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> {
   /// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb
   UCL_D_Vec<numtyp4> ts5;
 
-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;
 
   UCL_D_Vec<int> elem2param;
   UCL_D_Vec<int> map;
diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h
index da2568aa1b..9fe2c63176 100644
--- a/lib/gpu/lal_tersoff_extra.h
+++ b/lib/gpu/lal_tersoff_extra.h
@@ -142,7 +142,10 @@ ucl_inline numtyp ters_fa_d(const numtyp r,
                             numtyp *ans_d)
 {
   #ifndef ONETYPE
-  if (r > param_bigr + param_bigd) return (numtyp)0.0;
+  if (r > param_bigr + param_bigd) {
+    *ans_d = (numtyp)0.0;
+    return (numtyp)0.0;
+  }
   #endif
   numtyp dfc;
   const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc);
diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp
index b7b0fff1b9..347feab06f 100644
--- a/lib/gpu/lal_tersoff_mod.cpp
+++ b/lib/gpu/lal_tersoff_mod.cpp
@@ -105,10 +105,7 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
   _nparams = nparams;
   _nelements = nelements;
 
-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
   for (int ii=1; ii<ntypes; ii++) {
     const int i=host_map[ii];
     for (int jj=1; jj<ntypes; jj++) {
@@ -117,12 +114,10 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in
         const int k=host_map[kk];
         if (i<0 || j<0 || k<0) continue;
         const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
       }
     }
   }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
 
   UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
                            UCL_WRITE_ONLY);
@@ -229,7 +224,6 @@ void TersoffMT::clear() {
   ts3.clear();
   ts4.clear();
   ts5.clear();
-  cutsq_pair.clear();
   map.clear();
   elem2param.clear();
   _zetaij.clear();
@@ -275,7 +269,7 @@ int TersoffMT::loop(const int eflag, const int vflag, const int evatom,
   int BX=this->block_pair();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                          &this->nbor->dev_nbor, &this->nbor->dev_packed,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
 
diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu
index 44b04c6933..1eb57683d5 100644
--- a/lib/gpu/lal_tersoff_mod.cu
+++ b/lib/gpu/lal_tersoff_mod.cu
@@ -220,17 +220,13 @@ _texture_2d( pos_tex,int4);
 #endif
 
 __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
+                                       const numtyp cutsq, const int ntypes,
+                                       __global int * dev_nbor,
+                                       const __global int * dev_packed,
+                                       const int inum, const int nbor_pitch,
+                                       const int t_per_atom) {
   const int ii=GLOBAL_ID_X;
 
-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
   if (ii<inum) {
     const int i=dev_packed[ii];
     int nbor=ii+nbor_pitch;
@@ -239,9 +235,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
     const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
     int newj=0;
 
     __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@@ -252,11 +245,6 @@ __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_,
       int j = sj & NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h
index 0baa1307cb..bd158481fd 100644
--- a/lib/gpu/lal_tersoff_mod.h
+++ b/lib/gpu/lal_tersoff_mod.h
@@ -76,7 +76,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> {
   /// ts5.x = c5, ts5.y = h
   UCL_D_Vec<numtyp4> ts5;
 
-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;
 
   UCL_D_Vec<int> elem2param;
   UCL_D_Vec<int> map;
diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp
index 4456712b0a..4fba97606c 100644
--- a/lib/gpu/lal_tersoff_zbl.cpp
+++ b/lib/gpu/lal_tersoff_zbl.cpp
@@ -112,10 +112,7 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
   _nparams = nparams;
   _nelements = nelements;
 
-  UCL_H_Vec<numtyp> host_write(ntypes*ntypes,*(this->ucl_device),
-                               UCL_READ_WRITE);
-  host_write.zero();
-  cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY);
+  _cutsq_max=0.0;
   for (int ii=1; ii<ntypes; ii++) {
     const int i=host_map[ii];
     for (int jj=1; jj<ntypes; jj++) {
@@ -124,12 +121,10 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall,
         const int k=host_map[kk];
         if (i<0 || j<0 || k<0) continue;
         const int ijkparam = host_elem2param[i][j][k];
-        if (host_cutsq[ijkparam]>host_write[ii*ntypes+jj])
-          host_write[ii*ntypes+jj]=host_cutsq[ijkparam];
+        if (host_cutsq[ijkparam]>_cutsq_max) _cutsq_max=host_cutsq[ijkparam];
       }
     }
   }
-  ucl_copy(cutsq_pair,host_write,ntypes*ntypes);
 
   UCL_H_Vec<numtyp4> dview(nparams,*(this->ucl_device),
                            UCL_WRITE_ONLY);
@@ -253,7 +248,6 @@ void TersoffZT::clear() {
   ts4.clear();
   ts5.clear();
   ts6.clear();
-  cutsq_pair.clear();
   map.clear();
   elem2param.clear();
   _zetaij.clear();
@@ -299,7 +293,7 @@ int TersoffZT::loop(const int eflag, const int vflag, const int evatom,
   int BX=this->block_pair();
   int GX=static_cast<int>(ceil(static_cast<double>(ainum)/BX));
   this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes,
+  this->k_short_nbor.run(&this->atom->x, &_cutsq_max, &_ntypes,
                          &this->nbor->dev_nbor, &this->nbor->dev_packed,
                          &ainum, &nbor_pitch, &this->_threads_per_atom);
 
diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu
index fce1ccc406..6250fa55de 100644
--- a/lib/gpu/lal_tersoff_zbl.cu
+++ b/lib/gpu/lal_tersoff_zbl.cu
@@ -238,17 +238,13 @@ _texture( ts6_tex,int4);
 #endif
 
 __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
-                                   const __global numtyp *restrict cutsq_pair,
-                                   const int ntypes, __global int * dev_nbor,
-                                   const __global int * dev_packed,
-                                   const int inum, const int nbor_pitch,
-                                   const int t_per_atom) {
+                                       const numtyp cutsq, const int ntypes,
+                                       __global int * dev_nbor,
+                                       const __global int * dev_packed,
+                                       const int inum, const int nbor_pitch,
+                                       const int t_per_atom) {
   const int ii=GLOBAL_ID_X;
 
-  #ifdef ONETYPE
-  const numtyp cutsq=cutsq_pair[ONETYPE];
-  #endif
-
   if (ii<inum) {
     const int i=dev_packed[ii];
     int nbor=ii+nbor_pitch;
@@ -257,9 +253,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
     const int nbor_end=nbor+fast_mul(numj,nbor_pitch);
 
     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
-    #ifndef ONETYPE
-    const int itype=ix.w*ntypes;
-    #endif
     int newj=0;
 
     __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom;
@@ -270,11 +263,6 @@ __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_,
       int j = sj & NEIGHMASK;
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
 
-      #ifndef ONETYPE
-      const int mtype=jx.w+itype;
-      const numtyp cutsq=cutsq_pair[mtype];
-      #endif
-
       // Compute r12
       numtyp delx = ix.x-jx.x;
       numtyp dely = ix.y-jx.y;
diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h
index b82b391765..468006e499 100644
--- a/lib/gpu/lal_tersoff_zbl.h
+++ b/lib/gpu/lal_tersoff_zbl.h
@@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> {
   /// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale
   UCL_D_Vec<numtyp4> ts6;
 
-  UCL_D_Vec<numtyp> cutsq_pair;
+  numtyp _cutsq_max;
 
   UCL_D_Vec<int> elem2param;
   UCL_D_Vec<int> map;