From 3825fee8e9f25684da2baaebf3575f141c4fd580 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Aug 2021 22:57:37 -0500 Subject: [PATCH 001/181] Added work on amoeba/gpu, some minor changes to PairAmoeba to allow function overriding in PairAmoebaGPU, added the package AMOEBA to cmake/CMakeLists.txt --- cmake/CMakeLists.txt | 2 + lib/gpu/lal_amoeba.cpp | 155 ++++++++ lib/gpu/lal_amoeba.cu | 684 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 87 +++++ lib/gpu/lal_amoeba_ext.cpp | 142 ++++++++ lib/gpu/lal_base_amoeba.cpp | 516 +++++++++++++++++++++++++++ lib/gpu/lal_base_amoeba.h | 225 ++++++++++++ lib/gpu/lal_base_atomic.cpp | 4 +- lib/gpu/lal_base_charge.cpp | 4 +- lib/gpu/lal_base_dipole.cpp | 4 +- lib/gpu/lal_base_dpd.cpp | 5 +- lib/gpu/lal_base_three.cpp | 4 +- lib/gpu/lal_neighbor.cpp | 17 + lib/gpu/lal_neighbor.h | 4 + lib/gpu/lal_neighbor_gpu.cu | 15 + src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/Install.sh | 2 + src/GPU/pair_amoeba_gpu.cpp | 299 ++++++++++++++++ src/GPU/pair_amoeba_gpu.h | 63 ++++ 19 files changed, 2228 insertions(+), 6 deletions(-) create mode 100644 lib/gpu/lal_amoeba.cpp create mode 100644 lib/gpu/lal_amoeba.cu create mode 100644 lib/gpu/lal_amoeba.h create mode 100644 lib/gpu/lal_amoeba_ext.cpp create mode 100644 lib/gpu/lal_base_amoeba.cpp create mode 100644 lib/gpu/lal_base_amoeba.h create mode 100644 src/GPU/pair_amoeba_gpu.cpp create mode 100644 src/GPU/pair_amoeba_gpu.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 06297ca919..ccc9902778 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -140,6 +140,7 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF) set(STANDARD_PACKAGES ADIOS + AMOEBA ASPHERE ATC AWPMD @@ -308,6 +309,7 @@ endif() pkg_depends(ML-IAP ML-SNAP) pkg_depends(MPIIO MPI) pkg_depends(ATC MANYBODY) +pkg_depends(AMOEBA KSPACE) pkg_depends(LATBOLTZ MPI) pkg_depends(PHONON KSPACE) pkg_depends(SCAFACOS MPI) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp new file mode 100644 index 0000000000..67f0877e1a --- /dev/null +++ b/lib/gpu/lal_amoeba.cpp @@ -0,0 +1,155 @@ +/*************************************************************************** + amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the amoeba pair style. 
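  Computes the real-space induced-dipole (polar) interactions of the AMOEBA
  force field on the device via the k_amoeba_polar kernel.  Per-type Thole
  damping parameters (pdamp, thole) and the special polar scale factors are
  packed into the damping and sp_polar arrays; the per-atom tep accumulators
  filled by the kernel are copied back to the host by the base class.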
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "amoeba_cl.h" +#elif defined(USE_CUDART) +const char *amoeba=0; +#else +#include "amoeba_cubin.h" +#endif + +#include "lal_amoeba.h" +#include +namespace LAMMPS_AL { +#define AmoebaT Amoeba + +extern Device device; + +template +AmoebaT::Amoeba() : BaseAmoeba(), + _allocated(false) { +} + +template +AmoebaT::~Amoeba() { + clear(); +} + +template +int AmoebaT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, + const double *host_thole, const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = (numtyp)0; + host_write[i].w = (numtyp)0; + } + + damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(damping,host_write,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_polar,dview,5,false); + + _aewald = aewald; + _felec = felec; + _off2 = off2; + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=damping.row_bytes() + + sp_polar.row_bytes() + + this->_tep.row_bytes(); + return 0; +} + +template +void AmoebaT::clear() { + if (!_allocated) + return; + _allocated=false; + + damping.clear(); + sp_polar.clear(); + + this->clear_atomic(); +} + +template +double AmoebaT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Amoeba); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int AmoebaT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + this->k_polar.set_size(GX,BX); + + this->k_polar.run(&this->atom->x, &this->atom->extra, + &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, + &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + return GX; +} + +template class Amoeba; +} diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu new file mode 100644 index 0000000000..fbda1e0787 --- /dev/null +++ b/lib/gpu/lal_amoeba.cu @@ -0,0 +1,684 @@ +// ************************************************************************** +// amoeba.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the amoeba pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset, \ + i, tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && 
iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + numtyp uir = uix*xr + uiy*yr + uiz*zr; + numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; + numtyp ukr = ukx*xr + uky*yr + ukz*zr; + numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 
= (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? + } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = 
(bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = 
term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + 
tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Amoeba : public BaseAmoeba { + public: + Amoeba(); + ~Amoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const double *host_pdamp, + const double *host_thole, const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = damping.x; thole = damping.y + UCL_D_Vec damping; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + UCL_D_Vec sp_polar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp new file mode 100644 index 0000000000..27c35a810f --- /dev/null +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -0,0 +1,142 @@ +/*************************************************************************** + amoeba_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to amoeba acceleration routines. 
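  The C-style wrappers below (amoeba_gpu_init, amoeba_gpu_clear,
  amoeba_gpu_compute_n, amoeba_gpu_compute, amoeba_gpu_bytes) forward to a
  single static Amoeba instance (AMOEBAMF) and are intended to be called
  from the pair style in src/GPU/pair_amoeba_gpu.cpp.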
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_amoeba.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Amoeba AMOEBAMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int amoeba_gpu_init(const int ntypes, const int max_amtype, + const double *host_pdamp, const double *host_thole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale, int& tep_size) { + AMOEBAMF.clear(); + gpu_mode=AMOEBAMF.device->gpu_mode(); + double gpu_split=AMOEBAMF.device->particle_split(); + int first_gpu=AMOEBAMF.device->first_device(); + int last_gpu=AMOEBAMF.device->last_device(); + int world_me=AMOEBAMF.device->world_me(); + int gpu_rank=AMOEBAMF.device->gpu_rank(); + int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); + + tep_size=sizeof(PRECISION); + + AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); + + bool message=false; + if (AMOEBAMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + host_special_polar_wscale, host_special_polar_piscale, + host_special_polar_pscale, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, screen, + aewald, felec, off2, polar_dscale, polar_uscale); + + AMOEBAMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + AMOEBAMF.estimate_gpu_overhead(); + return init_ok; +} + +void amoeba_gpu_clear() { + AMOEBAMF.clear(); +} + +int** amoeba_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd, tep_ptr); +} + +void amoeba_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, 
int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd, void **tep_ptr) { + AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + ilist, numj, firstneigh, eflag, vflag, eatom, vatom, + host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr); +} + +double amoeba_gpu_bytes() { + return AMOEBAMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp new file mode 100644 index 0000000000..c5f4a01222 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.cpp @@ -0,0 +1,516 @@ +/*************************************************************************** + base_amoeba.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include "lal_base_amoeba.h" +namespace LAMMPS_AL { +#define BaseAmoebaT BaseAmoeba + +extern Device global_device; + +template +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseAmoebaT::~BaseAmoeba() { + delete ans; + delete nbor; + k_polar.clear(); + k_special15.clear(); + if (pair_program) delete pair_program; +} + +template +int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseAmoebaT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const int maxspecial15, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_charge(); + + bool charge = true; + bool rot = false; + bool vel = false; + _extra_fields = 24; // round up to accomodate quadruples of numtyp values + // rpole 13; uind 3; uinp 3; amtype, amgroup + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); + compile_kernels(*ucl_device,pair_program,k_name); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = 
device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + _maxspecial=maxspecial; + _maxspecial15=maxspecial15; + + // allocate per-atom array tep + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + + return success; +} + +template +void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); +} + +template +void BaseAmoebaT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); + + _tep.clear(); + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + // add one-five neighbors + + if (_maxspecial15>0) { + UCL_H_Vec view_nspecial15; + UCL_H_Vec view_special15; + view_nspecial15.view(nspecial15,nall,*ucl_device); + view_special15.view(special15[0],nall*_maxspecial15,*ucl_device); + ucl_copy(dev_nspecial15,view_nspecial15,nall,false); + ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false); + 
nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall); + + add_onefive_neighbors(); + } + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + *tep_ptr=_tep.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + // packing host arrays into host_extra + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + atom->add_extra_data(); + + device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, + boxlo, prd); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void 
**tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + *tep_ptr=_tep.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseAmoebaT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseAmoeba); +} + +template +void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp) { + int _nall=atom->nall(); + numtyp *pextra=reinterpret_cast(&(atom->extra[0])); + + int n = 0; + int nstride = 4; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][0]; + pextra[idx+1] = rpole[i][1]; + pextra[idx+2] = rpole[i][2]; + pextra[idx+3] = rpole[i][3]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][4]; + pextra[idx+1] = rpole[i][5]; + pextra[idx+2] = rpole[i][6]; + pextra[idx+3] = rpole[i][8]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][9]; + pextra[idx+1] = rpole[i][12]; + pextra[idx+2] = (numtyp)amtype[i]; + pextra[idx+3] = (numtyp)amgroup[i]; + } + + n += nstride*_nall; + for (int i = 
0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uind[i][0]; + pextra[idx+1] = uind[i][1]; + pextra[idx+2] = uind[i][2]; + } + + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uinp[i][0]; + pextra[idx+1] = uinp[i][1]; + pextra[idx+2] = uinp[i][2]; + } +} + +template +void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname) { + if (_compiled) + return; + + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + + k_polar.set_function(*pair_program,kname); + k_special15.set_function(*pair_program,"k_special15"); + pos_tex.get_texture(*pair_program,"pos_tex"); + q_tex.get_texture(*pair_program,"q_tex"); + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +template +int BaseAmoebaT::add_onefive_neighbors() { + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + int _nall=atom->nall(); + int ainum=ans->inum(); + int nbor_pitch=nbor->nbor_pitch(); + + k_special15.set_size(GX,BX); + k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); + + return GX; +} + +template class BaseAmoeba; +} diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h new file mode 100644 index 0000000000..ac9c23e8a9 --- /dev/null +++ b/lib/gpu/lal_base_amoeba.h @@ -0,0 +1,225 @@ +/*************************************************************************** + base_amoeba.h + ------------------- + Trung Dac Nguyen (Northwestern) + + Base class for pair styles needing per-particle data for position, + charge, and type. 
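  In addition, per-atom AMOEBA data are packed by cast_extra_data() into the
  atom->extra array as five nall-long blocks of numtyp4 values:
    block 1: rpole[0..3]                            (charge and dipole)
    block 2: rpole[4], rpole[5], rpole[6], rpole[8] (quadrupole components)
    block 3: rpole[9], rpole[12], amtype, amgroup
    block 4: uind[0..2]   (fourth component unused)
    block 5: uinp[0..2]   (fourth component unused)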
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#ifndef LAL_BASE_AMOEBA_H +#define LAL_BASE_AMOEBA_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#if defined(USE_OPENCL) +#include "geryon/ocl_texture.h" +#elif defined(USE_CUDART) +#include "geryon/nvc_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseAmoeba { + public: + BaseAmoeba(); + virtual ~BaseAmoeba(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(const int add_kernels=0); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + q_tex.bind_float(atom->q,1); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + 
atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint **special15, + bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, void **tep_ptr); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **tep_ptr); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + UCL_Vector polar1, polar2, polar3, polar4, polar5; + + /// cast host arrays into a single array for atom->extra + void cast_extra_data(int* amtype, int* amgroup, double** rpole, + double** uind, double** uinp); + + /// Per-atom arrays + UCL_Vector _tep; + int _max_tep_size; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + /// Device storage for 1-5 special neighbor counts + UCL_D_Vec dev_nspecial15; + /// Device storage for special neighbors + UCL_D_Vec dev_special15, dev_special15_t; + + int add_onefive_neighbors(); + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program; + UCL_Kernel k_polar,k_special15; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) {} + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture q_tex; + + protected: + bool _compiled; + int _block_size, _block_bio_size, _threads_per_atom; + int _extra_fields; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + + virtual int loop(const int 
eflag, const int vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index 6aad138aa1..bda9441c5b 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 9045420425..5c236873d0 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + bool charge = true; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 439637cbde..71650ebf7e 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_charge(); - int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + bool charge = true; + bool rot = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index d3c3353415..07c11caf8f 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + bool charge = false; + bool rot = false; + bool vel = true; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); if (success!=0) return success; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index 15ef20230d..9dfee9b8c9 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, else _threads_per_atom=device->threads_per_three(); - int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + bool charge = false; + bool rot = false; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial); if (success!=0) return success; diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index a0d2eaa8c3..4e65a58003 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -579,6 +579,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); if (_time_device) time_nbor.add_to_total(); + + // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial) + // on the device, transpose the matrix (1-d array) for coalesced reads + // dev_special[i][j] = the special i neighbor of atom j + time_transpose.start(); const int b2x=_block_cell_2d; const int b2y=_block_cell_2d; @@ -682,6 +687,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); 
if (!success) @@ -834,6 +840,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in) +{ + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; + const int g2x=static_cast(ceil(static_cast(columns_in)/b2x)); + const int g2y=static_cast(ceil(static_cast(rows_in)/b2y)); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&out, &in, &columns_in, &rows_in); +} + template void Neighbor::build_nbor_list (double **x, const int inum, const int host_inum, const int nall, Atom &atom, double *sublo, double *subhi, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index c1e1a87ef4..97aec4e280 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -260,6 +260,10 @@ class Neighbor { return o.str(); } + /// Helper function + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + const int columns_in, const int rows_in); + private: NeighborShared *_shared; UCL_Device *dev; diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index 6fd724b494..144e9fa284 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -44,6 +44,19 @@ _texture_2d( pos_tex,int4); #define LAL_USE_OLD_NEIGHBOR #endif +/* + compute the id of the cell where the atoms belong to +x: atom coordinates +cell_id: cell ids +particle_id: +boxlo[0-2]: the lower left corner of the local box +ncell[xyz]: the number of cells in xyz dims +i_cell_size is the inverse cell size +inum = the number of the local atoms that are ported to the device +nall = the number of the local+ghost atoms that are ported to the device +cells_in_cutoff = the number of cells that are within the cutoff +*/ + __kernel void calc_cell_id(const numtyp4 *restrict x_, unsigned *restrict cell_id, int *restrict particle_id, @@ -86,6 +99,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_, } } +// compute the number of atoms in each cell + __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id, int *restrict cell_counts, int nall, int ncell) { diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index b28a00fb84..4644d4a137 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -357,7 +357,7 @@ class PairAmoeba : public Pair { void polar(); void polar_energy(); - void polar_real(); + virtual void polar_real(); void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index a87d2165d9..9e231663c0 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -41,6 +41,8 @@ action fix_npt_gpu.cpp action fix_nve_asphere_gpu.h fix_nve_asphere.h action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h +action pair_amoeba_gpu.cpp pair_amoeba.cpp +action pair_amoeba_gpu.h pair_amoeba.h action pair_beck_gpu.cpp pair_beck.cpp action pair_beck_gpu.h pair_beck.h action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp new file mode 100644 index 0000000000..4f1b20d364 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -0,0 +1,299 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern) +------------------------------------------------------------------------- */ + +#include "pair_amoeba_gpu.h" + +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int amoeba_gpu_init(const int ntypes, const int max_amtype, + const double *host_pdamp, const double *host_thole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double aewald, const double felec, + const double off2, const double polar_dscale, + const double polar_uscale, int& tep_size); +void amoeba_gpu_clear(); + +int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **tep_ptr); +void amoeba_gpu_compute(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd, void **tep_ptr); + +double amoeba_gpu_bytes(); + +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; + +/* ---------------------------------------------------------------------- */ + +PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairAmoebaGPU::~PairAmoebaGPU() +{ + amoeba_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_real() +{ + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + 
sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+    }
+    inum = atom->nlocal;
+
+    firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+                                      atom->type, amtype, amgroup,
+                                      rpole, uind, uinp, sublo, subhi,
+                                      atom->tag, atom->nspecial, atom->special,
+                                      atom->nspecial15, atom->special15,
+                                      eflag, vflag, eflag_atom, vflag_atom,
+                                      host_start, &ilist, &numneigh, cpu_time,
+                                      success, atom->q, domain->boxlo,
+                                      domain->prd, &tep_pinned);
+
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+    amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                       amtype, amgroup, rpole, uind, uinp,
+                       ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                       vflag_atom, host_start, cpu_time, success, atom->q,
+                       atom->nlocal, domain->boxlo, domain->prd, &tep_pinned);
+  }
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // reference to the tep array from GPU lib
+
+  if (tep_single) {
+    float *tep_ptr = (float *)tep_pinned;
+    compute_force_from_tep(tep_ptr);
+  } else {
+    double *tep_ptr = (double *)tep_pinned;
+    compute_force_from_tep(tep_ptr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute atom forces and virial from the torque (tep) array returned by the GPU library
+------------------------------------------------------------------------- */
+
+template <class numtyp>
+void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
+{
+  int i,ix,iy,iz;
+  double ci,dix,diy,diz;
+  double qixx,qixy,qixz;
+  double qiyy,qiyz,qizz;
+  double xix,yix,zix;
+  double xiy,yiy,ziy;
+  double xiz,yiz,ziz;
+  double vxx,vyy,vzz;
+  double vxy,vxz,vyz;
+  double fix[3],fiy[3],fiz[3],tep[4];
+
+  double** x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    dix = rpole[i][1];
+    diy = rpole[i][2];
+    diz = rpole[i][3];
+    qixx = rpole[i][4];
+    qixy = rpole[i][5];
+    qixz = rpole[i][6];
+    qiyy = rpole[i][8];
+    qiyz = rpole[i][9];
+    qizz = rpole[i][12];
+
+    tep[0] = tep_ptr[4*i];
+    tep[1] = tep_ptr[4*i+1];
+    tep[2] = tep_ptr[4*i+2];
+    torque2force(i,tep,fix,fiy,fiz,fpolar);
+
+    iz = zaxis2local[i];
+    ix = xaxis2local[i];
+    iy = yaxis2local[i];
+
+    xiz = x[iz][0] - x[i][0];
+    yiz = x[iz][1] - x[i][1];
+    ziz = x[iz][2] - x[i][2];
+    xix = x[ix][0] - x[i][0];
+    yix = x[ix][1] - x[i][1];
+    zix = x[ix][2] - x[i][2];
+    xiy = x[iy][0] - x[i][0];
+    yiy = x[iy][1] - x[i][1];
+    ziy = x[iy][2] - x[i][2];
+
+    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
+    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
+    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
+    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
+                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
+    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
+                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
+    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
+                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
+
+    virpolar[0] += vxx;
+    virpolar[1] += vyy;
+    virpolar[2] += vzz;
+    virpolar[3] += vxy;
+    virpolar[4] += vxz;
+    virpolar[5] += vyz;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairAmoebaGPU::init_style()
+{
+  PairAmoeba::init_style();
+
+  if (gpu_mode == GPU_FORCE) {
+    if (comm->me == 0)
+      error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no "
+                     "for now, 
automatically switching to neigh yes"); + gpu_mode = GPU_NEIGH; + } + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + // select the cutoff (off2) for neighbor list builds (the polar term for now) + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + double cell_size = sqrt(off2) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int tep_size; + int mnf = 5e-2 * neighbor->oneatom; + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + aewald, felec, off2, polar_dscale, polar_uscale, + tep_size); + GPU_EXTRA::check_flag(success,error,world); + + if (tep_size == sizeof(double)) + tep_single = false; + else + tep_single = true; +} + +/* ---------------------------------------------------------------------- */ + +double PairAmoebaGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + amoeba_gpu_bytes(); +} diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h new file mode 100644 index 0000000000..4d29bfaf34 --- /dev/null +++ b/src/GPU/pair_amoeba_gpu.h @@ -0,0 +1,63 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(amoeba/gpu,PairAmoebaGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_AMOEBA_GPU_H +#define LMP_PAIR_AMOEBA_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairAmoebaGPU : public PairAmoeba { + public: + PairAmoebaGPU(LAMMPS *lmp); + ~PairAmoebaGPU(); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + virtual void polar_real(); + + private: + int gpu_mode; + double cpu_time; + void *tep_pinned; + bool tep_single; + + template + void compute_force_from_tep(const numtyp*); +}; + +} // namespace LAMMPS_NS +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style amoeba/gpu requires atom attribute q + +The atom style defined does not have this attribute. 
+ +*/ From db92844228b555938a85ceb2d6f893010e5c5954 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Aug 2021 23:22:23 -0500 Subject: [PATCH 002/181] Added recent changes to FixGPU to enable newton_pair on --- src/GPU/fix_gpu.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 71ab3f4cb4..66b938c577 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -127,7 +127,7 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : _gpu_mode = GPU_NEIGH; _particle_split = 1.0; int nthreads = 0; - int newtonflag = 0; + int newtonflag = force->newton_pair; int threads_per_atom = -1; double binsize = 0.0; char *opencl_args = nullptr; @@ -211,14 +211,16 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : #endif // set newton pair flag - // require newtonflag = 0 since currently required by all GPU pair styles - - if (newtonflag == 1) error->all(FLERR,"Illegal package gpu command"); force->newton_pair = newtonflag; if (force->newton_pair || force->newton_bond) force->newton = 1; else force->newton = 0; + // require newton pair off if _particle_split < 1 + + if (force->newton_pair == 1 && _particle_split < 1) + error->all(FLERR,"Cannot use newton pair on for split less than 1"); + if (pair_only_flag) { lmp->suffixp = lmp->suffix; lmp->suffix = nullptr; @@ -341,7 +343,23 @@ void FixGPU::post_force(int /* vflag */) force->pair->virial[4] += lvirial[4]; force->pair->virial[5] += lvirial[5]; - if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute(); + // for newton pair off: force->pair->vflag_fdotr = 0 + // which has been the case so far, virial_fdotr_compute() is never called + // for newton pair on: force->pair->vflag_fdotr = 1 + // for neigh yes: full neighbor lists are built on the device + // for neigh no: full neighbor lists are built on the host + // either way the virial is tallied to force->pair->virial as above + // so as long as _particle_split == 1 + // no need to call force->pair->virial_fdotr_compute(); + // If _particle_split < 1, the local atom forces computed by + // the gpu pair styles on the host (cpu_compute()) got tallied + // by comm->reverse_comm() (which is done before this post_force() function). + // A call to force->pair->virial_fdotr_compute() would double count + // the virial from the local atoms on the host. + // Here a possible workaround is to comment out the below command + // while enforcing newton pair off for _particle_split < 1. 
+ + //if (force->pair->vflag_fdotr) force->pair->virial_fdotr_compute(); timer->stamp(Timer::PAIR); } From 91317b2879e72589a4b62868f05fceb98b48f3b7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 09:33:20 -0500 Subject: [PATCH 003/181] Added changes to Atom and Device classes for allocation of extra fields and SBBITS15 and NEIGHMASK15 --- lib/gpu/lal_atom.cpp | 36 ++++++++++++++++++++++++++--- lib/gpu/lal_atom.h | 46 ++++++++++++++++++++++++++++++++++---- lib/gpu/lal_device.cpp | 11 ++++++--- lib/gpu/lal_device.h | 3 ++- lib/gpu/lal_preprocessor.h | 4 ++++ 5 files changed, 89 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index cda4d383b5..618ffb0106 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -48,6 +48,8 @@ int AtomT::bytes_per_atom() const { bytes+=sizeof(numtyp); if (_vel) bytes+=4*sizeof(numtyp); + if (_extra_fields>0) + bytes+=_extra_fields*sizeof(numtyp); return bytes; } @@ -122,6 +124,11 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } + if (_extra_fields>0 && _host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } if (_gpu_nbor>0) { if (_bonds) { @@ -156,7 +163,8 @@ bool AtomT::alloc(const int nall) { template bool AtomT::add_fields(const bool charge, const bool rot, - const int gpu_nbor, const bool bonds, const bool vel) { + const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { bool success=true; // Ignore host/device transfers? int gpu_bytes=0; @@ -191,6 +199,16 @@ bool AtomT::add_fields(const bool charge, const bool rot, } } + if (extra_fields > 0 && _extra_fields==0) { + _extra_fields=extra_fields; + _other=true; + if (_host_view==false) { + success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, + UCL_READ_ONLY)==UCL_SUCCESS); + gpu_bytes+=extra.device.row_bytes(); + } + } + if (bonds && _bonds==false) { _bonds=true; if (_bonds && _gpu_nbor>0) { @@ -254,7 +272,8 @@ bool AtomT::add_fields(const bool charge, const bool rot, template bool AtomT::init(const int nall, const bool charge, const bool rot, - UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel) { + UCL_Device &devi, const int gpu_nbor, const bool bonds, const bool vel, + const int extra_fields) { clear(); bool success=true; @@ -262,13 +281,15 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, _q_avail=false; _quat_avail=false; _v_avail=false; + _extra_avail=false; _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; _rot=rot; _vel=vel; - _other=_charge || _rot || _vel; + _extra_fields=extra_fields; + _other=_charge || _rot || _vel || (extra_fields>0); dev=&devi; _time_transfer=0; @@ -282,10 +303,14 @@ bool AtomT::init(const int nall, const bool charge, const bool rot, time_q.init(*dev); time_quat.init(*dev); time_vel.init(*dev); + time_extra.init(*dev); + time_pos.zero(); time_q.zero(); time_quat.zero(); time_vel.zero(); + time_extra.zero(); + _time_cast=0.0; #ifdef GPU_CAST @@ -308,6 +333,8 @@ void AtomT::clear_resize() { quat.clear(); if (_vel) v.clear(); + if (_extra_fields>0) + extra.clear(); dev_cell_id.clear(); dev_particle_id.clear(); @@ -350,6 +377,7 @@ void AtomT::clear() { time_q.clear(); time_quat.clear(); time_vel.clear(); + time_extra.clear(); clear_resize(); #ifdef GPU_CAST @@ -370,6 +398,8 @@ double AtomT::host_memory_usage() const { atom_bytes+=4; if 
(_vel) atom_bytes+=4; + if (_extra_fields>0) + atom_bytes+=_extra_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 3cf97d94a0..ff335fffa9 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -76,7 +76,7 @@ class Atom { * gpu_nbor 2 if binning on host and neighboring on device **/ bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const int gpu_nbor=0, const bool bonds=false, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Check if we have enough device storage and realloc if not /** Returns true if resized with any call during this timestep **/ @@ -96,7 +96,7 @@ class Atom { * gpu_nbor 1 if neighboring will be performed on device * gpu_nbor 2 if binning on host and neighboring on device **/ bool add_fields(const bool charge, const bool rot, const int gpu_nbor, - const bool bonds, const bool vel=false); + const bool bonds, const bool vel=false, const int extra_fields=0); /// Returns true if GPU is using charges bool charge() { return _charge; } @@ -107,6 +107,9 @@ class Atom { /// Returns true if GPU is using velocities bool velocity() { return _vel; } + /// Returns true if GPU is using extra fields + bool using_extra() { return _extra_fields; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -450,6 +453,38 @@ class Atom { add_v_data(host_ptr,host_tag); } + // Cast extras to write buffer + template + inline void cast_extra_data(cpytyp *host_ptr) { + if (_extra_avail==false) { + double t=MPI_Wtime(); + if (_host_view) { + extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev); + extra.device.view(extra.host); + } else if (sizeof(numtyp)==sizeof(double)) + memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp)); + else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i]; + _time_cast+=MPI_Wtime()-t; + } + } + + // Copy extras to device + /** Copies nall()*_extra elements **/ + inline void add_extra_data() { + time_extra.start(); + if (_extra_avail==false) { + extra.update_device(_nall*_extra_fields,true); + _extra_avail=true; + } + time_extra.stop(); + } + /// Add in casting time from additional data (seconds) inline void add_cast_time(double t) { _time_cast+=t; } @@ -473,6 +508,8 @@ class Atom { UCL_Vector quat; /// Velocities UCL_Vector v; + /// Extras + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; @@ -493,7 +530,7 @@ class Atom { UCL_H_Vec host_particle_id; /// Device timers - UCL_Timer time_pos, time_q, time_quat, time_vel; + UCL_Timer time_pos, time_q, time_quat, time_vel, time_extra; /// Geryon device UCL_Device *dev; @@ -508,11 +545,12 @@ class Atom { bool _compiled; // True if data has been copied to device already - bool _x_avail, _q_avail, _quat_avail, _v_avail, _resized; + bool _x_avail, _q_avail, _quat_avail, _v_avail, _extra_avail, _resized; bool alloc(const int nall); bool _allocated, _rot, _charge, _bonds, _vel, _other; + int _extra_fields; int _max_atoms, _nall, _gpu_nbor; bool _host_view; double _time_cast, _time_transfer; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..8908f3aff7 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -424,7 +424,7 @@ template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int 
nall, const int maxspecial, - const bool vel) { + const bool vel, const int extra_fields) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) @@ -453,7 +453,7 @@ int DeviceT::init(Answer &ans, const bool charge, if (_init_count==0) { // Initialize atom and nbor data - if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel)) + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor>0 && maxspecial>0,vel,extra_fields)) return -3; _data_in_estimate++; @@ -463,6 +463,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (vel) _data_in_estimate++; + if (extra_fields>0) + _data_in_estimate++; + } else { if (atom.charge()==false && charge) _data_in_estimate++; @@ -470,7 +473,9 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (atom.velocity()==false && vel) _data_in_estimate++; - if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel)) + if (atom.using_extra()==false && extra_fields>0) + _data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; } diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 1db6ae3127..01d3d64627 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -61,6 +61,7 @@ class Device { * \param nall Total number of local+ghost particles * \param maxspecial Maximum mumber of special bonded atoms per atom * \param vel True if velocities need to be stored + * \param extra_fields Nonzero if extra fields need to be stored * * Returns: * - 0 if successful @@ -70,7 +71,7 @@ class Device { * - -5 Double precision is not supported on card **/ int init(Answer &ans, const bool charge, const bool rot, const int nlocal, const int nall, const int maxspecial, - const bool vel=false); + const bool vel=false, const int extra_fields=0); /// Initialize the device for Atom storage only /** \param nlocal Total number of local particles to allocate memory for diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 12cf6345c2..2ef8af0911 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -330,6 +330,10 @@ #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; +#define SBBITS15 29 +#define NEIGHMASK15 0x1FFFFFFF +ucl_inline int sbmask15(int j) { return j >> SBBITS15 & 7; }; + // default to 32-bit smallint and other ints, 64-bit bigint: // same as defined in src/lmptype.h #if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ From 88f3dd334c6d8d99fa1b35492f2a945637dfdce7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 09:35:43 -0500 Subject: [PATCH 004/181] Some changes in PPPMGPU due to the API changes in the GridComm class --- src/GPU/pppm_gpu.cpp | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 8e3ec2ace8..476e54f8ca 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -106,8 +106,6 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp) : PPPM(lmp) PPPMGPU::~PPPMGPU() { PPPM_GPU_API(clear)(poisson_time); - destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); - destroy_3d_offset(vd_brick,nzlo_out,nylo_out); } /* ---------------------------------------------------------------------- @@ -257,12 +255,12 @@ void PPPMGPU::compute(int eflag, int vflag) // remap from 3d decomposition to FFT decomposition if (triclinic == 0) { - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU, - 
gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO_GPU, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft_gpu(); } else { - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); PPPM::brick2fft(); } @@ -276,21 +274,21 @@ void PPPMGPU::compute(int eflag, int vflag) // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) - gc->forward_comm_kspace(this,1,sizeof(FFT_SCALAR),FORWARD_AD, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else - gc->forward_comm_kspace(this,3,sizeof(FFT_SCALAR),FORWARD_IK, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) - gc->forward_comm_kspace(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); else if (differentiation_flag == 0) - gc->forward_comm_kspace(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); } poisson_time += MPI_Wtime()-t3; @@ -833,8 +831,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_A_brick; density_fft = density_A_fft; - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // group B @@ -842,8 +840,8 @@ void PPPMGPU::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) density_brick = density_B_brick; density_fft = density_B_fft; - gc->reverse_comm_kspace(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, - gc_buf1,gc_buf2,MPI_FFT_SCALAR); + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); brick2fft(); // switch back pointers From 6a998fcb8e0bbea758ac5738f98ed11891db4f5c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 11:17:49 -0500 Subject: [PATCH 005/181] Added fix store/state commands to the example input scripts --- examples/amoeba/in.ubiquitin | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index e6f9893e41..acb8b7fcb6 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -23,6 +23,15 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes +# setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) + +#fix fhal all store/state 0 fx fy fz +#fix frepulse all store/state 0 fx fy fz +#fix fdisp all store/state 0 fx fy fz +#fix fpolar all store/state 0 fx fy fz +#fix fmpole all store/state 0 fx fy fz +#fix fqxfer all store/state 0 fx fy fz + # thermo output compute virial all pressure NULL virial From 42048ee73fa6ae32fdfda7a47f6db3020691fd74 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 26 Aug 2021 11:23:21 -0500 Subject: [PATCH 006/181] Activated the fix store/state commands in one of the example input scripts --- examples/amoeba/in.ubiquitin | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index acb8b7fcb6..7f0b653350 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -25,12 +25,12 @@ special_bonds lj/coul 0.5 0.5 0.5 one/five yes # setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) -#fix fhal all store/state 0 fx fy fz -#fix frepulse all store/state 0 fx fy fz -#fix fdisp all store/state 0 fx fy fz -#fix fpolar all store/state 0 fx fy fz -#fix fmpole all store/state 0 fx fy fz -#fix fqxfer all store/state 0 fx fy fz +fix fhal all store/state 0 fx fy fz +fix frepulse all store/state 0 fx fy fz +fix fdisp all store/state 0 fx fy fz +fix fpolar all store/state 0 fx fy fz +fix fmpole all store/state 0 fx fy fz +fix fqxfer all store/state 0 fx fy fz # thermo output From 5ffae6ed23171e4bd9f366c0511575904b4558bd Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 30 Aug 2021 09:14:46 -0500 Subject: [PATCH 007/181] Limited to neigh yes for amoeba/gpu for now --- src/GPU/pair_amoeba_gpu.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4f1b20d364..09ba100e4e 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -230,13 +230,6 @@ void PairAmoebaGPU::init_style() { PairAmoeba::init_style(); - if (gpu_mode == GPU_FORCE) { - if (comm->me == 0) - error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no " - "for now, automatically switching to neigh yes"); - gpu_mode = GPU_NEIGH; - } - // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -284,6 +277,9 @@ void PairAmoebaGPU::init_style() tep_size); GPU_EXTRA::check_flag(success,error,world); + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); + if (tep_size == sizeof(double)) tep_single = false; else From 07b60827c459e7ea57b7e351d1e064e27090c9ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 1 Sep 2021 12:30:41 -0500 Subject: [PATCH 008/181] Working on the udirect2b kernel for the induce real space term, need to add the API for the GPU library --- lib/gpu/lal_amoeba.cpp | 5 +- lib/gpu/lal_amoeba.cu | 258 +++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 3 +- lib/gpu/lal_amoeba_ext.cpp | 5 +- 4 files changed, 266 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 67f0877e1a..a3bd653efd 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { template int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, @@ -76,7 +77,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda for (int i = 0; i < max_amtype; i++) { host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; - host_write[i].z = (numtyp)0; + host_write[i].z = host_dirdamp[i]; host_write[i].w = (numtyp)0; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fbda1e0787..1f5fb42438 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,6 +91,37 @@ 
_texture( q_tex,int2); tep[i]=t; \ } +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ + i, field, fieldp) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + 
numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + + } // nbor + + } // ii { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_special_polar_wscale, + const double *host_thole, const double *host_dirdamp, + const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const int nlocal, const int nall, const int max_nbors, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 27c35a810f..a7959ed93e 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -29,6 +29,7 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -62,7 +63,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, + init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, @@ -82,7 +83,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if 
(gpu_rank==i && world_me!=0)
-    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
+    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
                           host_special_polar_wscale, host_special_polar_piscale,
                           host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial,
                           maxspecial15, cell_size, gpu_split, screen,

From 785a794d3933c56d495d2970518ab653c8d1ba6c Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Wed, 1 Sep 2021 14:37:11 -0500
Subject: [PATCH 009/181] Added and renamed API to make room for additional
 kernels (udirect2b only computes the field and fieldp, not accumulating
 forces, energies, nor virials)

---
 lib/gpu/lal_amoeba.cpp      |  35 ++++-
 lib/gpu/lal_amoeba.cu       |  20 +--
 lib/gpu/lal_amoeba.h        |   5 +-
 lib/gpu/lal_amoeba_ext.cpp  |  31 ++--
 lib/gpu/lal_base_amoeba.cpp | 155 ++++++++++++++----
 lib/gpu/lal_base_amoeba.h   |  24 ++-
 src/AMOEBA/pair_amoeba.h    |   2 +-
 src/GPU/pair_amoeba_gpu.cpp | 283 ++++++++++++++++++++++++++++++------
 src/GPU/pair_amoeba_gpu.h   |   4 +
 9 files changed, 448 insertions(+), 111 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index a3bd653efd..c7b4872db0 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -125,10 +125,10 @@ double AmoebaT::host_memory_usage() const {
 }
 
 // ---------------------------------------------------------------------------
-// Calculate energies, forces, and torques
+// Calculate the polar real-space term, returning tep
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-int AmoebaT::loop(const int eflag, const int vflag) {
+int AmoebaT::polar_real(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
@@ -140,9 +140,7 @@ int AmoebaT::loop(const int eflag, const int vflag) {
 
   this->time_pair.start();
   this->k_polar.set_size(GX,BX);
-
-  this->k_polar.run(&this->atom->x, &this->atom->extra,
-                    &damping, &sp_polar,
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->ans->force, &this->ans->engv, &this->_tep,
                     &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
@@ -152,5 +150,32 @@ int AmoebaT::loop(const int eflag, const int vflag) {
   return GX;
 }
 
+// ---------------------------------------------------------------------------
+// Compute the real-space permanent field (udirect2b), returning field/fieldp
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::udirect2b(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int _nall=this->atom->nall();
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+/*
+  this->k_polar.set_size(GX,BX);
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,
+                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
+*/
+  this->time_pair.stop();
+  return GX;
+}
+
 template class Amoeba<PRECISION,ACC_PRECISION>;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 1f5fb42438..3d28939d42 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -715,11 +715,6 @@ 
__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + numtyp aesq2 = (numtyp)2.0 * aewald*aewald; + numtyp aesq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); for ( ; nbor { numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; numtyp _qqrd2e; - private: + protected: bool _allocated; - int loop(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index a7959ed93e..9fa3c7f75b 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -105,7 +105,7 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } -int** amoeba_gpu_compute_n(const int ago, const int inum_full, +int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -116,7 +116,7 @@ int** amoeba_gpu_compute_n(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type, + return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, @@ -124,18 +124,21 @@ int** amoeba_gpu_compute_n(const int ago, const int inum_full, host_q, boxlo, prd, tep_ptr); } -void amoeba_gpu_compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd, void **tep_ptr) { - AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, - ilist, numj, firstneigh, eflag, vflag, eatom, vatom, - host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr); +int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **fieldp_ptr) { + return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, host_q, boxlo, prd, fieldp_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c5f4a01222..0c9a422cec 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -118,8 +118,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) 
ef_nall=2000; - _max_tep_size=static_cast(static_cast(ef_nall)*1.10); - _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_alloc_size=static_cast(static_cast(ef_nall)*1.10); + _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ -149,6 +150,7 @@ void BaseAmoebaT::clear_atomic() { ans->clear(); _tep.clear(); + _fieldp.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -250,9 +252,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_alloc_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -296,17 +298,17 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - const int red_blocks=loop(eflag,vflag); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies +// Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -336,9 +338,9 @@ int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_alloc_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -388,16 +390,16 @@ int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - const int red_blocks=loop(eflag,vflag); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); // copy tep from device to host - _tep.update_host(_max_tep_size*4,false); + _tep.update_host(_max_alloc_size*4,false); /* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); @@ -406,6 +408,101 @@ int** BaseAmoebaT::compute(const 
int ago, const int inum_full, const int nall, return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void** fieldp_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // ------------------- Resize _fieldp array ------------------------ + + if (nall>_max_alloc_size) { + _max_alloc_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_alloc_size*8); + + dev_nspecial15.clear(); + dev_special15.clear(); + dev_special15_t.clear(); + dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + *fieldp_ptr=_fieldp.host.begin(); + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_q_data(); + atom->add_extra_data(); + + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=udirect2b(eflag,vflag); + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy field and fieldp from device to host + + //_fieldp.update_host(_max_field_size*8,false); + + return nbor->host_jlist.begin()-host_start; +} + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -446,20 +543,24 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+3] = (numtyp)amgroup[i]; } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = uind[i][0]; - pextra[idx+1] = uind[i][1]; - pextra[idx+2] = uind[i][2]; + if (uind) { + n += 
nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uind[i][0]; + pextra[idx+1] = uind[i][1]; + pextra[idx+2] = uind[i][2]; + } } - - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = uinp[i][0]; - pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + + if (uinp) { + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = uinp[i][0]; + pextra[idx+1] = uinp[i][1]; + pextra[idx+2] = uinp[i][2]; + } } } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index ac9c23e8a9..7ef94c776e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -128,7 +128,7 @@ class BaseAmoeba { tagint **special, int *nspecial15, tagint **special15, bool &success); - /// Pair loop with host neighboring + /// Compute polar real-space with host neighboring (not active for now) void compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, @@ -138,8 +138,8 @@ class BaseAmoeba { const double cpu_time, bool &success, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); - /// Pair loop with device neighboring - int** compute(const int ago, const int inum_full, const int nall, + /// Compute polar real-space with device neighboring + int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, @@ -150,6 +150,17 @@ class BaseAmoeba { int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring + int** compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage @@ -179,8 +190,8 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep; - int _max_tep_size; + UCL_Vector _tep,_fieldp; + int _max_alloc_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -217,7 +228,8 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual int loop(const int eflag, const int vflag) = 0; + virtual int polar_real(const int eflag, const int vflag) = 0; + virtual int udirect2b(const int eflag, const int vflag) = 0; }; } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 4644d4a137..9d23fccdd8 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -369,7 +369,7 @@ class PairAmoeba : public Pair { void umutual1(double **, double **); void umutual2b(double **, double **); void udirect1(double **); - void udirect2b(double **, double **); + virtual void udirect2b(double **, double **); void dampmut(double, 
double, double, double *); void dampdir(double, double, double, double *, double *); void cholesky(int, double *, double *); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 09ba100e4e..3f4e72c0af 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -24,19 +24,24 @@ #include "error.h" #include "force.h" #include "gpu_extra.h" +#include "math_const.h" +#include "my_page.h" #include "neigh_list.h" #include "neigh_request.h" #include "neighbor.h" #include "suffix.h" - #include using namespace LAMMPS_NS; +using namespace MathConst; + +enum{MUTUAL,OPT,TCG,DIRECT}; // External functions from cuda library for atom decomposition int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -48,7 +53,17 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); -int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, +int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); + +int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -58,15 +73,6 @@ int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **tep_ptr); -void amoeba_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -80,6 +86,8 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) reinitflag = 0; cpu_time = 0.0; suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tep_pinned = nullptr; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -102,42 +110,31 @@ void PairAmoebaGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - if (gpu_mode != GPU_FORCE) { - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, 
uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, - domain->prd, &tep_pinned); - + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; } else { - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - - amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, - amtype, amgroup, rpole, uind, uinp, - ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, atom->q, - atom->nlocal, domain->boxlo, domain->prd, &tep_pinned); + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -248,6 +245,7 @@ void PairAmoebaGPU::init_style() } // select the cutoff (off2) for neighbor list builds (the polar term for now) + // NOTE: induce and polar terms are using the same flags here if (use_ewald) choose(POLAR_LONG); else choose(POLAR); @@ -268,7 +266,7 @@ void PairAmoebaGPU::init_style() double felec = 0.5 * electric / am_dielectric; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, @@ -286,6 +284,199 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b(double **field, double **fieldp) +{ + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, sublo, + subhi, atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // rebuild dipole-dipole pair list and store pairwise 
dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + udirect2b_cpu(); +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::udirect2b_cpu() +{ + int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti,ddi; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scale7,scalek; + double bn[4],bcn[3]; + double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + // launching the kernel to compute field and fieldp + + // amoeba_gpu_compute_field(...); + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? + + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + ddi = dirdamp[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + factor_wscale = special_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = special_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = special_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = 1.0; + } + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + 
bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 4d29bfaf34..e5d4aab176 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -34,13 +34,17 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; virtual void polar_real(); + virtual void udirect2b(double **, double **); private: int gpu_mode; double cpu_time; void *tep_pinned; + void *fieldp_pinned; bool tep_single; + void udirect2b_cpu(); + template void compute_force_from_tep(const numtyp*); }; From 7e0c77f1cbb38d98d03423a3d3ff0efb8ccd7b41 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 1 Sep 2021 14:51:36 -0500 Subject: [PATCH 010/181] Added fallback flags to indicate which terms are ready from the GPU lib --- src/GPU/pair_amoeba_gpu.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3f4e72c0af..3cdaa25633 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,6 +104,12 @@ PairAmoebaGPU::~PairAmoebaGPU() void PairAmoebaGPU::polar_real() { + bool gpu_polar_real_ready = true; + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -292,6 +298,12 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { + bool gpu_udirect2b_ready = false; + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -354,10 +366,6 @@ void PairAmoebaGPU::udirect2b_cpu() int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; - // launching the kernel to compute field and fieldp - - // amoeba_gpu_compute_field(...); - double **x = atom->x; // neigh list From 745c7089f0cf40162e6b790726f1766cd588379f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 01:00:29 -0500 Subject: [PATCH 011/181] Temporarily commented out the section in the Atom class where FixGPU finds the optimal bin size. This section makes ev_tally4() in Angle different from CPU-only runs, even with a single command "package gpu 1" without any gpu pair style. Need more effort to understand why. 
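A plausible mechanism: FixGPU::binsize() changes the spatial-sort bin geometry, which reorders the owned atoms and therefore the floating-point summation order inside per-atom tallies such as ev_tally4(). Until the root cause is understood, CPU-only and GPU runs can be forced to bin identically by pinning the sort parameters in the input script; this is the atom_modify workaround toggled in the ubiquitin example over the next few patches:

    atom_modify sort 1000 7.0    # spatially sort atoms every 1000 steps using 7.0-Angstrom bins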
--- src/GPU/fix_gpu.cpp | 2 ++ src/atom.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 66b938c577..51f36defdc 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -386,6 +386,8 @@ double FixGPU::memory_usage() return bytes; } +/* ---------------------------------------------------------------------- */ + double FixGPU::binsize(const double subx, const double suby, const double subz, const int nlocal, const double cut) { diff --git a/src/atom.cpp b/src/atom.cpp index 86e2b1151b..4ad5110ec9 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2274,6 +2274,7 @@ void Atom::setup_sort_bins() #ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); +/* if (ifix >= 0) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; @@ -2297,6 +2298,7 @@ void Atom::setup_sort_bins() bininvy = bininv; bininvz = bininv; } +*/ } #endif From 7d69a870a4f4bfcab8dce6b0d6460ac1cde41a5d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 13:43:22 -0500 Subject: [PATCH 012/181] Reverted the binsize function call from the GPU package in Atom, instead added atom_modify sort with a binsize to ensure matching virial values, enabled the udirect2b kernel, need more work to override dfield0c, and induce() to bypass reverse_comm() for field and fieldp (line amoeba_induce.cpp:111-112) --- examples/amoeba/in.ubiquitin | 2 +- lib/gpu/lal_amoeba.cpp | 20 +++++------ lib/gpu/lal_amoeba.cu | 55 ++++++++++++++++++------------- lib/gpu/lal_base_amoeba.cpp | 64 +++++++++++++++++++++++------------- lib/gpu/lal_base_amoeba.h | 15 +++++---- src/GPU/pair_amoeba_gpu.cpp | 25 ++++++++++++-- src/atom.cpp | 2 -- 7 files changed, 115 insertions(+), 68 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index e02d849ba4..2491493c45 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba - +#atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index c7b4872db0..0d78a8618a 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -57,7 +57,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, - cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar"); + cell_size,gpu_split,_screen,amoeba, + "k_amoeba_polar", "k_amoeba_udirect2b"); if (success!=0) return success; @@ -164,15 +165,14 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { int ainum=this->ans->inum(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); -/* - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); -*/ + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); return GX; } diff 
--git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3d28939d42..adcff0e648 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -91,8 +91,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ red_acc[0][tid]=_fieldp[0]; \ red_acc[1][tid]=_fieldp[1]; \ @@ -118,8 +118,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #else @@ -152,8 +152,8 @@ _texture( q_tex,int2); tep[i]=t; \ } -#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, \ - i, field, fieldp) \ +#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i, \ + fieldp) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ @@ -168,8 +168,8 @@ _texture( q_tex,int2); numtyp4 f, fp; \ f.x = _fieldp[0]; f.y = _fieldp[0]; f.z = _fieldp[2]; \ fp.x = _fieldp[3]; fp.y = _fieldp[4]; fp.z = _fieldp[5]; \ - field[i] = f; \ - fieldp[i] = fp; \ + fieldp[ii] = f; \ + fieldp[ii+inum] = fp; \ } #endif @@ -177,6 +177,11 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ + __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict damping, @@ -468,7 +473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; numtyp tixx = ci*term3 + dix*term4 + dir*term5 + - (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6; + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; @@ -684,19 +689,23 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - __global numtyp4 *restrict field, - __global numtyp4 *restrict fieldp, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, 
+ const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -771,7 +780,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; - numtyp rr1 = felec * rinv; + numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; @@ -888,7 +897,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // accumulate field and fieldp - store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,field,fieldp); + store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } /* ---------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 0c9a422cec..a1cf516777 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; k_polar.clear(); + k_udirect2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -53,7 +54,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name_polar, + const char *k_name_udirect2b) { screen=_screen; int gpu_nbor=0; @@ -85,7 +87,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -118,9 +120,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; - _max_alloc_size=static_cast(static_cast(ef_nall)*1.10); - _fieldp.alloc(_max_alloc_size*6,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _tep.alloc(_max_alloc_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _max_fieldp_size = _max_tep_size; + _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ -224,7 +227,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template -void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -252,9 +255,9 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -302,6 +305,10 @@ void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); } // --------------------------------------------------------------------------- @@ -338,9 +345,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // ------------------- Resize _tep array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_alloc_size*4); + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); dev_nspecial15.clear(); dev_special15.clear(); @@ -397,9 +404,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const // copy tep from device to host - _tep.update_host(_max_alloc_size*4,false); + _tep.update_host(_max_tep_size*4,false); /* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_alloc_size); + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); @@ -442,9 +449,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_alloc_size) { - _max_alloc_size=static_cast(static_cast(nall)*1.10); - _fieldp.resize(_max_alloc_size*8); + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); dev_nspecial15.clear(); dev_special15.clear(); @@ -492,13 +499,18 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i *jnum=nbor->host_acc.begin(); const int red_blocks=udirect2b(eflag,vflag); - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); hd_balancer.stop_timer(); - // copy field and fieldp from device to host + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - //_fieldp.update_host(_max_field_size*8,false); + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_field.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ return nbor->host_jlist.begin()-host_start; } @@ -566,7 +578,8 @@ void 
BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname_polar, + const char *kname_udirect2b) { if (_compiled) return; @@ -575,7 +588,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_polar.set_function(*pair_program,kname); + k_polar.set_function(*pair_program,kname_polar); + k_udirect2b.set_function(*pair_program,kname_udirect2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); @@ -593,6 +607,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, } +// --------------------------------------------------------------------------- +// Specify 1-5 neighbors from the current neighbor list +// --------------------------------------------------------------------------- + template int BaseAmoebaT::add_onefive_neighbors() { // Compute the block size and grid size to keep all cores busy diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7ef94c776e..ae0f33ef29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -53,8 +53,8 @@ class BaseAmoeba { * - -5 Double precision is not supported on card **/ int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, - const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const double gpu_split, FILE *screen, const void *pair_program, + const char *kname_polar, const char *kname_udirect2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -129,7 +129,7 @@ class BaseAmoeba { bool &success); /// Compute polar real-space with host neighboring (not active for now) - void compute(const int f_ago, const int inum_full, const int nall, + void compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, @@ -190,8 +190,8 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep,_fieldp; - int _max_alloc_size; + UCL_Vector _tep, _fieldp; + int _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -210,7 +210,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar,k_special15; + UCL_Kernel k_polar, k_udirect2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -226,7 +226,8 @@ class BaseAmoeba { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *kname_polar, const char *kname_udirect2b); virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3cdaa25633..a5cc86e39d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ 
b/src/GPU/pair_amoeba_gpu.cpp @@ -298,7 +298,7 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = false; + bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; @@ -334,7 +334,28 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); - + + // get field and fieldp values from the GPU lib + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] = field_ptr[idx]; + field[i][1] = field_ptr[idx+1]; + field[i][2] = field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] = fieldp_ptr[idx]; + fieldp[i][1] = fieldp_ptr[idx+1]; + fieldp[i][2] = fieldp_ptr[idx+2]; + } + // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs diff --git a/src/atom.cpp b/src/atom.cpp index 4ad5110ec9..86e2b1151b 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2274,7 +2274,6 @@ void Atom::setup_sort_bins() #ifdef LMP_GPU if (userbinsize == 0.0) { int ifix = modify->find_fix("package_gpu"); -/* if (ifix >= 0) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; @@ -2298,7 +2297,6 @@ void Atom::setup_sort_bins() bininvy = bininv; bininvz = bininv; } -*/ } #endif From 8f5f65e68da92c7649a7d0444b6630816db37ff5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 16:42:58 -0500 Subject: [PATCH 013/181] Declared virtual to relevant functions in PairAmoeba, added the overridden versions in PairAmoebaGPU --- examples/amoeba/in.ubiquitin | 2 +- src/AMOEBA/pair_amoeba.h | 4 +- src/GPU/pair_amoeba_gpu.cpp | 529 +++++++++++++++++++++++++++++++++-- src/GPU/pair_amoeba_gpu.h | 2 + 4 files changed, 514 insertions(+), 23 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index 2491493c45..f017d8f122 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba -#atom_modify sort 1000 7.0 +atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 9d23fccdd8..0ec601de47 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -361,11 +361,11 @@ class PairAmoeba : public Pair { void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); - void induce(); + virtual void induce(); void ulspred(); void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); - void dfield0c(double **, double **); + virtual void dfield0c(double **, double **); void umutual1(double **, double **); void umutual2b(double **, double **); void udirect1(double **); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index a5cc86e39d..f2ba3acceb 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -18,13 +18,16 @@ #include "pair_amoeba_gpu.h" +#include "amoeba_convolution.h" #include "atom.h" #include "comm.h" #include "domain.h" #include "error.h" +#include "fix_store.h" #include "force.h" #include "gpu_extra.h" #include "math_const.h" 
+#include "memory.h" #include "my_page.h" #include "neigh_list.h" #include "neigh_request.h" @@ -35,7 +38,15 @@ using namespace LAMMPS_NS; using namespace MathConst; +enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye // External functions from cuda library for atom decomposition @@ -54,30 +65,28 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, void amoeba_gpu_clear(); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **tep_ptr); + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **tep_ptr); double amoeba_gpu_bytes(); -enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; - /* ---------------------------------------------------------------------- */ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) @@ -290,6 +299,486 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::induce() +{ + bool done; + int i,j,m,ii,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; 
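  // The MUTUAL branch below is a standard preconditioned conjugate-gradient
  // solve of (alpha^-1 - T) u = E_perm, with poli playing the role of alpha,
  // ufield0c() supplying the T*u contribution, and uscale0b() applied as the
  // preconditioner when pcgprec is set.  Each iteration computes
  //   Ap = conj/poli - ufield(conj)
  //   a  = (rsd.zrsd)/(conj.Ap),   uind += a*conj,   rsd -= a*Ap
  //   b  = (rsd'.zrsd')/(rsd.zrsd),   conj = zrsd' + b*conj
  // with the same recurrence carried in parallel for the primed (*p) arrays.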
+ + double *poli; + double **conj,**conjp; + double **vec,**vecp; + double **udir,**usum,**usump; + + int debug = 1; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) { + choose(POLAR_LONG); + int nmine = p_kspace->nfft_owned; + memory->create(qfac,nmine,"ameoba/induce:qfac"); + } else choose(POLAR); + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // allocation of arrays + // NOTE: not all are used by all methods + // NOTE: could be re-allocated dynamically + + memory->create(poli,nlocal,"ameoba/induce:poli"); + memory->create(conj,nlocal,3,"ameoba/induce:conj"); + memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); + memory->create(vec,nlocal,3,"ameoba/induce:vec"); + memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); + memory->create(udir,nlocal,3,"ameoba/induce:udir"); + memory->create(usum,nlocal,3,"ameoba/induce:usum"); + memory->create(usump,nlocal,3,"ameoba/induce:usump"); + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // reverse comm to sum field,fieldp from ghost atoms to owned atoms + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + // DEBUG statements + + /* + for (i = 0; i < nlocal; i++) + if (atom->tag[i] == 1) + printf("AAA FIELD atom %d: field %g %g %g: fieldp %g %g %g\n", + atom->tag[i], + field[i][0],field[i][1],field[i][2], + fieldp[i][0],fieldp[i][1],fieldp[i][2]); + */ + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } + + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
+ // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + //error->all(FLERR,"STOP"); + + crstyle = FIELD; + comm->reverse_comm_pair(this); + + for 
(i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + // NOTE: comp of b,bp and allreduce only needed if pcgprec ? + + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (me == 0) + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + } + + // DEBUG output to dump file + + if (uind_flag) + dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); + + // deallocation of arrays + + memory->destroy(poli); + memory->destroy(conj); + memory->destroy(conjp); + memory->destroy(vec); + memory->destroy(vecp); + memory->destroy(udir); + memory->destroy(usum); + memory->destroy(usump); + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] 
= upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + dfield0c = direct induction via Ewald sum + dfield0c computes the mutual electrostatic field due to + permanent multipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::dfield0c(double **field, double **fieldp) +{ + int i,j,ii; + double term; + + int inum; + int *ilist; + + // zero out field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the reciprocal space part of the permanent field + + if (kspace_flag) udirect1(field); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fieldp[i][j] = field[i][j]; + } + } + + // get the real space portion of the permanent field + + if (rspace_flag) udirect2b(field,fieldp); + + // get the self-energy portion of the permanent field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*rpole[i][j+1]; + fieldp[i][j] += term*rpole[i][j+1]; + } + } +} + /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent @@ -298,7 +787,7 @@ void PairAmoebaGPU::init_style() void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = true; + bool gpu_udirect2b_ready = false; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e5d4aab176..9f538ca903 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -34,6 +34,8 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; virtual void polar_real(); + virtual void induce(); + virtual void dfield0c(double **, double **); virtual void udirect2b(double **, double **); private: From be5aa46df82b1aaa78250f3228465c6f7260c17a Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 3 Sep 2021 17:32:41 -0500 Subject: [PATCH 014/181] Re-arranged the binsize call from the GPU lib in Atom so that the box bounds and bininv[xyz] are computed on the CPU side intact --- examples/amoeba/in.ubiquitin | 2 +- src/atom.cpp | 44 ++++++++++++------------------------ 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index f017d8f122..2491493c45 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -4,7 +4,7 @@ units real boundary p p p atom_style amoeba -atom_modify sort 1000 7.0 +#atom_modify sort 1000 7.0 bond_style class2 angle_style amoeba dihedral_style none diff --git a/src/atom.cpp b/src/atom.cpp index 86e2b1151b..71cb2e9f31 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2195,6 +2195,21 @@ void Atom::setup_sort_bins() return; } +#ifdef LMP_GPU + if (userbinsize == 0.0) { + int ifix = modify->find_fix("package_gpu"); + if (ifix >= 0) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + + FixGPU *fix = static_cast(modify->fix[ifix]); + binsize = fix->binsize(subx, suby, subz, atom->nlocal, + 
0.5 * neighbor->cutneighmax); + } + } +#endif + double bininv = 1.0/binsize; // nbin xyz = local bins @@ -2271,35 +2286,6 @@ void Atom::setup_sort_bins() } #endif -#ifdef LMP_GPU - if (userbinsize == 0.0) { - int ifix = modify->find_fix("package_gpu"); - if (ifix >= 0) { - const double subx = domain->subhi[0] - domain->sublo[0]; - const double suby = domain->subhi[1] - domain->sublo[1]; - const double subz = domain->subhi[2] - domain->sublo[2]; - - FixGPU *fix = static_cast(modify->fix[ifix]); - binsize = fix->binsize(subx, suby, subz, atom->nlocal, - neighbor->cutneighmax); - bininv = 1.0 / binsize; - - nbinx = static_cast (ceil(subx * bininv)); - nbiny = static_cast (ceil(suby * bininv)); - nbinz = static_cast (ceil(subz * bininv)); - if (domain->dimension == 2) nbinz = 1; - - if (nbinx == 0) nbinx = 1; - if (nbiny == 0) nbiny = 1; - if (nbinz == 0) nbinz = 1; - - bininvx = bininv; - bininvy = bininv; - bininvz = bininv; - } - } -#endif - if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); From 4e346c2de637d32f33d6b59c798b22b12b1e56df Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 7 Sep 2021 13:05:57 -0500 Subject: [PATCH 015/181] Refactored neighbor list builds and per-atom reallocation parts --- lib/gpu/lal_amoeba_ext.cpp | 5 +- lib/gpu/lal_base_amoeba.cpp | 141 +++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.h | 20 ++++- src/GPU/pair_amoeba_gpu.cpp | 19 ++--- 4 files changed, 107 insertions(+), 78 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 9fa3c7f75b..59739f9f2a 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -127,6 +127,7 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, @@ -135,8 +136,8 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd, fieldp_ptr); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a1cf516777..88caec3972 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -121,9 +121,12 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, ef_nall=2000; _max_tep_size=static_cast(static_cast(ef_nall)*1.10); + _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); @@ 
-312,10 +315,12 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - build the full neighbor lists for use by different kernels // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, +int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -324,9 +329,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int *nspecial15, tagint **special15, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, + int **&ilist, int **&jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + double *prd) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -343,12 +348,10 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const set_kernel(eflag,vflag); - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); + // ------------------- Resize 1-5 neighbor arrays ------------------------ + if (nall>_nmax) { + _nmax = nall; dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -356,7 +359,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } - *tep_ptr=_tep.host.begin(); if (inum_full==0) { host_start=0; @@ -397,6 +399,60 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); + return nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays and build the neighbor lists if needed + + int** firstneigh = nullptr; + 
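  // precompute(), factored out in this patch, grows the 1-5 special-neighbor
  // device arrays when nall exceeds _nmax, rebuilds the full GPU neighbor list
  // when ago == 0, and casts/copies the coordinates, charges and per-atom
  // multipole data; its return value is the host jlist pointer that the
  // compute_* entry points hand back to the pair style.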
firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -412,7 +468,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } */ - return nbor->host_jlist.begin()-host_start; + return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -423,6 +479,7 @@ template int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -447,59 +504,26 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i set_kernel(eflag,vflag); + // reallocate per-atom arrays and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + // ------------------- Resize _fieldp array ------------------------ if (nall>_max_fieldp_size) { _max_fieldp_size=static_cast(static_cast(nall)*1.10); _fieldp.resize(_max_fieldp_size*8); - - dev_nspecial15.clear(); - dev_special15.clear(); - dev_special15_t.clear(); - dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } *fieldp_ptr=_fieldp.host.begin(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return nullptr; - } - - hd_balancer.balance(cpu_time); - int inum=hd_balancer.get_gpu_count(ago,inum_full); - ans->inum(inum); - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - success); - if (!success) - return nullptr; - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); - hd_balancer.start_timer(); - } else { - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, nullptr, nullptr); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - } - atom->add_q_data(); - atom->add_extra_data(); - - *ilist=nbor->host_ilist.begin(); - *jnum=nbor->host_acc.begin(); - const int red_blocks=udirect2b(eflag,vflag); - hd_balancer.stop_timer(); // copy field and fieldp from 
device to host (_fieldp store both arrays, one after another) @@ -510,9 +534,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } -*/ - - return nbor->host_jlist.begin()-host_start; +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index ae0f33ef29..7d4f4c00b5 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -128,6 +128,18 @@ class BaseAmoeba { tagint **special, int *nspecial15, tagint **special15, bool &success); + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -153,7 +165,9 @@ class BaseAmoeba { /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -191,7 +205,7 @@ class BaseAmoeba { /// Per-atom arrays UCL_Vector _tep, _fieldp; - int _max_tep_size, _max_fieldp_size; + int _nmax, _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -222,7 +236,7 @@ class BaseAmoeba { bool _compiled; int _block_size, _block_bio_size, _threads_per_atom; int _extra_fields; - double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f2ba3acceb..d87e35cdf8 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -66,7 +66,8 @@ void amoeba_gpu_clear(); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -370,17 +371,7 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - // DEBUG statements - /* - for (i = 0; i < nlocal; i++) - if (atom->tag[i] == 1) - printf("AAA FIELD atom 
%d: field %g %g %g: fieldp %g %g %g\n", - atom->tag[i], - field[i][0],field[i][1],field[i][2], - fieldp[i][0],fieldp[i][1],fieldp[i][2]); - */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -799,7 +790,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -814,8 +805,8 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) inum = atom->nlocal; firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, sublo, - subhi, atom->tag, atom->nspecial, atom->special, + atom->type, amtype, amgroup, rpole, uind, uinp, + sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, From 1c5d235f12799f6ce3b68dfbe5903fcd84840cc1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 7 Sep 2021 16:15:08 -0500 Subject: [PATCH 016/181] Working on the field and fieldp values from GPU back to the host for dfield0c --- lib/gpu/lal_amoeba.cu | 22 ++++++++++++++-------- lib/gpu/lal_base_amoeba.cpp | 7 ++++--- src/GPU/pair_amoeba_gpu.cpp | 32 +++++++++++++++++--------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index adcff0e648..c4f146a7c9 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -116,9 +116,13 @@ _texture( q_tex,int2); } \ if (offset==0 && ii1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ @@ -166,9 +170,13 @@ _texture( q_tex,int2); } \ if (offset==0 && ii_field.cols(), _max_fieldp_size); + + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d87e35cdf8..6501376dfa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -367,10 +367,10 @@ void PairAmoebaGPU::induce() dfield0c(field,fieldp); // reverse comm to sum field,fieldp from ghost atoms to owned atoms - +/* crstyle = FIELD; comm->reverse_comm_pair(this); - +*/ // set induced dipoles to polarizability times direct field @@ -778,7 +778,7 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = false; + bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; @@ -815,31 +815,33 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) if (!success) error->one(FLERR,"Insufficient memory on accelerator"); - // get field and fieldp values from the GPU lib + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - field[i][0] = field_ptr[idx]; - field[i][1] = field_ptr[idx+1]; - field[i][2] = 
field_ptr[idx+2]; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; fieldp_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] = fieldp_ptr[idx]; - fieldp[i][1] = fieldp_ptr[idx+1]; - fieldp[i][2] = fieldp_ptr[idx+2]; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; } - - // rebuild dipole-dipole pair list and store pairwise dipole matrices - // done one atom at a time in real-space double loop over atoms & neighs - - udirect2b_cpu(); + } /* ---------------------------------------------------------------------- From 8c5a116d30d391f9dac1c03f6be225332b956ad1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 8 Sep 2021 16:43:33 -0500 Subject: [PATCH 017/181] Made dfield0c work to compute uind and uinp correctly; need to make sure they are correct for polar_real() --- lib/gpu/lal_base_amoeba.cpp | 4 +-- src/AMOEBA/amoeba_induce.cpp | 63 ++++++++++++++++++++++++++++++--- src/GPU/pair_amoeba_gpu.cpp | 67 ++++++++++++++++++++++++++++++------ 3 files changed, 117 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 6800288093..26af83ab25 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -528,14 +528,14 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - +/* printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", this->_fieldp.cols(), _max_fieldp_size); for (int i = 0; i < 10; i++) { numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); } - +*/ return firstneigh; //nbor->host_jlist.begin()-host_start; } diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index c8f361053c..b1e6fa3f5d 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -25,6 +25,7 @@ #include "my_page.h" #include "math_const.h" #include "memory.h" +#include "neighbor.h" #include "error.h" using namespace LAMMPS_NS; @@ -103,14 +104,21 @@ void PairAmoeba::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // reverse comm to sum field,fieldp from ghost atoms to owned atoms crstyle = FIELD; comm->reverse_comm_pair(this); - +/* + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } +*/ // DEBUG statements /* @@ -135,7 +143,14 @@ void PairAmoeba::induce() } } } - +/* + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ // DEBUG statements /* @@ -250,12 +265,30 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - +/* + if (comm->me == 0) { + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ ufield0c(field,fieldp); crstyle = FIELD; 
comm->reverse_comm_pair(this); - +/* + if (comm->me == 0) { + printf("CPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // DEBUG statements /* @@ -342,6 +375,16 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); +/* + if (comm->me == 0) { + printf("CPU: iter = %d\n", iter); + for (i = 0; i < 10; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // DEBUG statements @@ -537,6 +580,7 @@ void PairAmoeba::induce() error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } + // DEBUG output to dump file if (uind_flag) @@ -553,6 +597,15 @@ void PairAmoeba::induce() memory->destroy(usum); memory->destroy(usump); + if (comm->me == 0) { + printf("CPU: iter = %d\n", iter); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6501376dfa..cd577af912 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -260,7 +260,7 @@ void PairAmoebaGPU::init_style() } } - // select the cutoff (off2) for neighbor list builds (the polar term for now) + // select the squared cutoff (off2) for neighbor list builds (the polar term for now) // NOTE: induce and polar terms are using the same flags here if (use_ewald) choose(POLAR_LONG); @@ -365,13 +365,10 @@ void PairAmoebaGPU::induce() // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); - - // reverse comm to sum field,fieldp from ghost atoms to owned atoms /* crstyle = FIELD; comm->reverse_comm_pair(this); */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -385,7 +382,14 @@ void PairAmoebaGPU::induce() } } } - +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
@@ -489,13 +493,30 @@ void PairAmoebaGPU::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - +/* + if (comm->me == 0) { + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ ufield0c(field,fieldp); crstyle = FIELD; comm->reverse_comm_pair(this); - - +/* + if (comm->me == 0) { + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -554,7 +575,16 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - +/* + if (comm->me == 0) { + printf("GPU: iter = %d\n", iter); + for (i = 0; i < 10; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } +*/ for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -697,6 +727,15 @@ void PairAmoebaGPU::induce() memory->destroy(usum); memory->destroy(usump); + if (comm->me == 0) { + printf("GPU: iter = %d\n", iter); + for (i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -758,7 +797,7 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (rspace_flag) udirect2b(field,fieldp); - + // get the self-energy portion of the permanent field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; @@ -768,6 +807,14 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < nlocal; i++) { + printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", + i, field[i][0], field[i][1], field[i][2], + fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } +*/ } /* ---------------------------------------------------------------------- From 6f6fd0999c324f679b263330ea9fad1aad725c10 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 00:57:21 -0500 Subject: [PATCH 018/181] Both udirect2b and polar_real are working correctly on the GPU --- lib/gpu/lal_atom.h | 6 +- lib/gpu/lal_base_amoeba.cpp | 158 +++++++++++++++++++---------------- src/AMOEBA/amoeba_induce.cpp | 9 -- src/GPU/pair_amoeba_gpu.cpp | 34 ++++++-- src/GPU/pair_amoeba_gpu.h | 3 + 5 files changed, 121 insertions(+), 89 deletions(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index ff335fffa9..842257a592 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -284,7 +284,11 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() - { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } + + /// Signal that we need to transfer atom extra data for next kernel call + inline void extra_data_unavail() + { _extra_avail=false; } typedef struct { double x,y,z; } 
vec3d; typedef struct { numtyp x,y,z,w; } vec4d_t; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 26af83ab25..9baa7b30d3 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -317,8 +317,10 @@ void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const // --------------------------------------------------------------------------- // Prepare for multiple kernel calls in a time step: // - reallocate per-atom arrays, if needed +// - transfer extra data from host to device // - build the full neighbor lists for use by different kernels // --------------------------------------------------------------------------- + template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -402,75 +404,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space -// --------------------------------------------------------------------------- -template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // reallocate per-atom arrays and build the neighbor lists if needed - - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); - - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); - } - *tep_ptr=_tep.host.begin(); - - const int red_blocks=polar_real(eflag,vflag); - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - // copy tep from device to host - - _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ - return firstneigh; // nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute the direct real space part // of the permanent field @@ -504,7 +437,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i 
set_kernel(eflag,vflag); - // reallocate per-atom arrays and build the neighbor lists if needed + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, @@ -539,6 +473,85 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i return firstneigh; //nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
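+  //
+  //   As a rough illustration (not active code), the eventual once-per-step
+  //   flow would look like:
+  //     precompute(...);          // realloc per-atom arrays, cast host data, build neighbor lists once
+  //     compute_udirect2b(...);   // real-space permanent field
+  //     ...                       // other real-space field kernels as they come online
+  //     compute_polar_real(...);  // polar real-space forces and torques
+  //   with only uind/uinp needing to be re-cast before the later calls.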
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (nall>_max_tep_size) { + _max_tep_size=static_cast(static_cast(nall)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + const int red_blocks=polar_real(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -548,6 +561,11 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, double** uind, double** uinp) { + + // signal that we need to transfer extra data from the host + + atom->extra_data_unavail(); + int _nall=atom->nall(); numtyp *pextra=reinterpret_cast(&(atom->extra[0])); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index b1e6fa3f5d..2ffd4d275b 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -597,15 +597,6 @@ void PairAmoeba::induce() memory->destroy(usum); memory->destroy(usump); - if (comm->me == 0) { - printf("CPU: iter = %d\n", iter); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index cd577af912..0c9ff4c780 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -98,6 +98,10 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) suffix_flag |= Suffix::GPU; fieldp_pinned = nullptr; tep_pinned = nullptr; + + gpu_udirect2b_ready = true; + gpu_polar_real_ready = true; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -114,7 +118,6 @@ PairAmoebaGPU::~PairAmoebaGPU() void PairAmoebaGPU::polar_real() { - bool gpu_polar_real_ready = true; if (!gpu_polar_real_ready) { PairAmoeba::polar_real(); return; @@ -139,7 +142,16 @@ void PairAmoebaGPU::polar_real() domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; - +/* + if (comm->me == 0) { + printf("GPU: polar real\n"); + for (int i = 0; i < 20; i++) { + printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", + i, uind[i][0], uind[i][1], uind[i][2], + uinp[i][0], uinp[i][1], uinp[i][2]); + } + } +*/ firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -200,6 +212,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr) tep[0] = tep_ptr[4*i]; 
tep[1] = tep_ptr[4*i+1]; tep[2] = tep_ptr[4*i+2]; + torque2force(i,tep,fix,fiy,fiz,fpolar); iz = zaxis2local[i]; @@ -365,10 +378,14 @@ void PairAmoebaGPU::induce() // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); -/* - crstyle = FIELD; - comm->reverse_comm_pair(this); -*/ + + // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -726,7 +743,7 @@ void PairAmoebaGPU::induce() memory->destroy(udir); memory->destroy(usum); memory->destroy(usump); - +/* if (comm->me == 0) { printf("GPU: iter = %d\n", iter); for (i = 0; i < 20; i++) { @@ -735,7 +752,7 @@ void PairAmoebaGPU::induce() uinp[i][0], uinp[i][1], uinp[i][2]); } } - +*/ // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -825,7 +842,6 @@ void PairAmoebaGPU::dfield0c(double **field, double **fieldp) void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { - bool gpu_udirect2b_ready = true; if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 9f538ca903..d4ab9bcdfd 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -45,6 +45,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; + bool gpu_polar_real_ready; + bool gpu_udirect2b_ready; + void udirect2b_cpu(); template From 4a75a9bdd2b38f70b2e0da8b3e7054b46082efdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 14:47:29 -0500 Subject: [PATCH 019/181] Removed dfield0c from ameoba/gpu (no need to override this one) --- src/GPU/pair_amoeba_gpu.cpp | 60 +------------------------------------ src/GPU/pair_amoeba_gpu.h | 4 +-- 2 files changed, 3 insertions(+), 61 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 0c9ff4c780..3280e7b093 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -100,7 +100,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -775,64 +775,6 @@ void PairAmoebaGPU::induce() } } -/* ---------------------------------------------------------------------- - dfield0c = direct induction via Ewald sum - dfield0c computes the mutual electrostatic field due to - permanent multipole moments via Ewald summation -------------------------------------------------------------------------- */ - -void PairAmoebaGPU::dfield0c(double **field, double **fieldp) -{ - int i,j,ii; - double term; - - int inum; - int *ilist; - - // zero out field,fieldp for owned and ghost atoms - - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } - - // get the reciprocal space part of the permanent field - - if (kspace_flag) udirect1(field); - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fieldp[i][j] = field[i][j]; - } - } - - // get the real space portion of the permanent field - - if (rspace_flag) udirect2b(field,fieldp); - - // get the self-energy portion of the permanent field - - term = (4.0/3.0) * 
aewald*aewald*aewald / MY_PIS; - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*rpole[i][j+1]; - fieldp[i][j] += term*rpole[i][j+1]; - } - } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < nlocal; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } -*/ -} /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d4ab9bcdfd..d0cbad90a2 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -33,9 +33,9 @@ class PairAmoebaGPU : public PairAmoeba { enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void polar_real(); virtual void induce(); - virtual void dfield0c(double **, double **); + + virtual void polar_real(); virtual void udirect2b(double **, double **); private: From efe0bf593f531721f0a7eb00c570a2f4663db94e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 15:19:43 -0500 Subject: [PATCH 020/181] Adding the umutual2b kernel, need to create another array for tdipdip on the GPU --- lib/gpu/lal_amoeba.cpp | 31 +++++- lib/gpu/lal_amoeba.cu | 210 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 4 +- lib/gpu/lal_base_amoeba.cpp | 82 +++++++++++++- lib/gpu/lal_base_amoeba.h | 67 +++++++----- src/AMOEBA/pair_amoeba.h | 4 +- src/GPU/pair_amoeba_gpu.cpp | 93 ++++++++++++++-- src/GPU/pair_amoeba_gpu.h | 4 +- 8 files changed, 448 insertions(+), 47 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 0d78a8618a..8bcbd6c4cb 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -58,7 +58,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_polar", "k_amoeba_udirect2b"); + "k_amoeba_polar", "k_amoeba_udirect2b", + "k_amoeba_umutual2b"); if (success!=0) return success; @@ -152,7 +153,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -177,5 +178,31 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Calculate the real-space induced field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int AmoebaT::umutual2b(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, 
&_aewald, &_off2, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index c4f146a7c9..192f440112 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -907,6 +907,216 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp); } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; 
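+      // for reference, the bn[] terms filled in just below satisfy
+      //   bn[0] = erfc(aewald*r) / r
+      //   bn[m] = ( (2m-1)*bn[m-1]
+      //           + (2*aewald*aewald)^m * exp(-(aewald*r)^2) / (aewald*sqrt(pi)) ) / r^2
+      // which is what the aesq2/aesq2n/aefac bookkeeping implements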
+ numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii { protected: bool _allocated; - int polar_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 9baa7b30d3..6bcd6c50c7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { delete nbor; k_polar.clear(); k_udirect2b.clear(); + k_umutual2b.clear(); k_special15.clear(); if (pair_program) delete pair_program; } @@ -55,7 +56,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *k_name_polar, - const char *k_name_udirect2b) { + const char *k_name_udirect2b, + const char *k_name_umutual2b) { screen=_screen; int gpu_nbor=0; @@ -87,7 +89,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); 
_block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -230,7 +232,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_polar_real(const int f_ago, const int inum_full, const int nall, +void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -473,6 +475,75 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i return firstneigh; //nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd, void** fieldp_ptr) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer extra data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (nall>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(nall)*1.10); + _fieldp.resize(_max_fieldp_size*8); + } + *fieldp_ptr=_fieldp.host.begin(); + + const int red_blocks=umutual2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + _fieldp.update_host(_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // 
--------------------------------------------------------------------------- @@ -551,7 +622,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const return firstneigh; // nbor->host_jlist.begin()-host_start; } - template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ @@ -621,7 +691,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, - const char *kname_udirect2b) { + const char *kname_udirect2b, + const char *kname_umutual2b) { if (_compiled) return; @@ -632,6 +703,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); + k_umutual2b.set_function(*pair_program,kname_umutual2b); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7d4f4c00b5..3fb752c97c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,7 +54,8 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -140,15 +141,31 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + int** compute_udirect2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + int** compute_umutual2b(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const 
bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -162,18 +179,15 @@ class BaseAmoeba { int **ilist, int **numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd, void **tep_ptr); - /// Compute the direct real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + /// Compute polar real-space with host neighboring (not active for now) + void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -224,7 +238,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar, k_udirect2b, k_special15; + UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -241,10 +255,13 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_polar, const char *kname_udirect2b); + const char *kname_polar, const char *kname_udirect2b, + const char *kname_umutual2b); - virtual int polar_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; + virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int polar_real(const int eflag, const int vflag) = 0; + }; } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 0ec601de47..b2318d296e 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -365,9 +365,9 @@ class PairAmoeba : public Pair { void ulspred(); void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); - virtual void dfield0c(double **, double **); + void dfield0c(double **, double **); void umutual1(double **, double **); - void umutual2b(double **, double **); + virtual void umutual2b(double **, double **); void udirect1(double **); virtual void udirect2b(double **, double **); void dampmut(double, double, double, double *); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3280e7b093..a1c21da3dd 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -74,7 +74,18 @@ int ** 
amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); - +/* +int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd, + void **fieldp_ptr); +*/ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -100,6 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -142,16 +154,7 @@ void PairAmoebaGPU::polar_real() domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; -/* - if (comm->me == 0) { - printf("GPU: polar real\n"); - for (int i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -993,6 +996,74 @@ void PairAmoebaGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } +/* + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, uind, uinp, + sublo, subhi, atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + 
field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +*/ +} + /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d0cbad90a2..4dc547e469 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -37,6 +37,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void polar_real(); virtual void udirect2b(double **, double **); + virtual void umutual2b(double **, double **); private: int gpu_mode; @@ -45,8 +46,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; - bool gpu_polar_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; void udirect2b_cpu(); From b654f293ee637c25b8bb0fed92b9872eb7f58e0c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 9 Sep 2021 16:52:27 -0500 Subject: [PATCH 021/181] Working on the umutual2b kernel, the tdipdip values are computed on the fly for now, maybe a seprate neigh list as in the CPU version will be more efficient --- lib/gpu/lal_amoeba.cu | 130 ++++++++---------------------------- src/GPU/pair_amoeba_gpu.cpp | 19 ++++-- 2 files changed, 41 insertions(+), 108 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 192f440112..a4b0063a4f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -743,9 +743,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; - ci = polar1[i].x; // rpole[i][0]; dix = polar1[i].y; // rpole[i][1]; diy = polar1[i].z; // rpole[i][2]; diz = polar1[i].w; // rpole[i][3]; @@ -934,10 +932,9 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); + numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); //numtyp4 xi__; @@ -953,32 +950,13 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; - // debug: - // xi__ = ix; xi__.w = itype; - numtyp pdi = damping[itype].x; - numtyp pti = damping[itype].y; numtyp ddi = damping[itype].z; - numtyp aesq2 = (numtyp)2.0 * aewald*aewald; - numtyp aesq2n = (numtyp)0.0; - if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); - for ( ; nborreverse_comm_pair(this); + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } for (i = 0; i < 
nlocal; i++) {
     itype = amtype[i];
 @@ -525,8 +527,11 @@ void PairAmoebaGPU::induce()
 */

   ufield0c(field,fieldp);
-  crstyle = FIELD;
-  comm->reverse_comm_pair(this);
+  if (!gpu_umutual2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
+
 /*
   if (comm->me == 0) {
     printf("GPU: cutghost = %f\n", comm->cutghost[0]);
 @@ -593,8 +598,10 @@ void PairAmoebaGPU::induce()

   //error->all(FLERR,"STOP");

-  crstyle = FIELD;
-  comm->reverse_comm_pair(this);
+  if (!gpu_umutual2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
 /*
   if (comm->me == 0) {
     printf("GPU: iter = %d\n", iter);

From a22923aee29cb238c3c008363c0cf1eccb477b2d Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Thu, 9 Sep 2021 17:22:09 -0500
Subject: [PATCH 022/181] Added the API for the umutual kernel, needs work for storing the tdipdip array

---
 lib/gpu/lal_amoeba.cu       |  2 +-
 lib/gpu/lal_amoeba_ext.cpp  | 54 ++++++++++++++++++++++++-------------
 src/GPU/pair_amoeba_gpu.cpp |  8 +++---
 3 files changed, 41 insertions(+), 23 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index a4b0063a4f..30db5ba334 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1012,7 +1012,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       numtyp scalek = factor_uscale;
       bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
       bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
-      numtyp tdipdip[6];
+      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
       tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
       tdipdip[1] = bcn[1]*xr*yr;
       tdipdip[2] = bcn[1]*xr*zr;
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 59739f9f2a..5bb4dea25f 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -105,6 +105,42 @@ void amoeba_gpu_clear() {
   AMOEBAMF.clear();
 }

+int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
+                  const int nall, double **host_x, int *host_type,
+                  int *host_amtype, int *host_amgroup, double **host_rpole,
+                  double **host_uind, double **host_uinp,
+                  double *sublo, double *subhi, tagint *tag, int **nspecial,
+                  tagint **special, int *nspecial15, tagint** special15,
+                  const bool eflag, const bool vflag,
+                  const bool eatom, const bool vatom, int &host_start,
+                  int **ilist, int **jnum, const double cpu_time,
+                  bool &success, double *host_q, double *boxlo,
+                  double *prd, void **fieldp_ptr) {
+  return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
+                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
+}
+
+int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
+                  const int nall, double **host_x, int *host_type,
+                  int *host_amtype, int *host_amgroup, double **host_rpole,
+                  double **host_uind, double **host_uinp,
+                  double *sublo, double *subhi, tagint *tag, int **nspecial,
+                  tagint **special, int *nspecial15, tagint** special15,
+                  const bool eflag, const bool vflag,
+                  const bool eatom, const bool vatom, int &host_start,
+                  int **ilist, int **jnum, const double cpu_time,
+                  bool &success, double *host_q, double *boxlo,
+                  double *prd, void **fieldp_ptr) {
+  return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type,
+                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
+}
+
 int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
                   const int nall, double **host_x, int *host_type,
                   int *host_amtype, int *host_amgroup,
@@ -124,24 +160,6 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
                   host_q, boxlo, prd, tep_ptr);
 }

-int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
-                  const int nall, double **host_x, int *host_type,
-                  int *host_amtype, int *host_amgroup, double **host_rpole,
-                  double **host_uind, double **host_uinp,
-                  double *sublo, double *subhi, tagint *tag, int **nspecial,
-                  tagint **special, int *nspecial15, tagint** special15,
-                  const bool eflag, const bool vflag,
-                  const bool eatom, const bool vatom, int &host_start,
-                  int **ilist, int **jnum, const double cpu_time,
-                  bool &success, double *host_q, double *boxlo,
-                  double *prd, void **fieldp_ptr) {
-  return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
-                  host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
-                  sublo, subhi, tag, nspecial, special, nspecial15, special15,
-                  eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                  cpu_time, success, host_q, boxlo, prd, fieldp_ptr);
-}
-
 double amoeba_gpu_bytes() {
   return AMOEBAMF.host_memory_usage();
 }
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 9f1677f26d..b9ee884fa0 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -74,7 +74,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal
                 int **ilist, int **jnum, const double cpu_time, bool &success,
                 double *host_q, double *boxlo, double *prd, void **fieldp_ptr);
-/*
+
 int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
                 double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
                 double **host_rpole, double **host_uind, double **host_uinp,
                 double *sublo, double *subhi, tagint *tag, int **nspecial,
                 tagint **special, int* nspecial15, tagint** special15,
                 const bool eflag, const bool vflag,
                 const bool eatom, const bool vatom, int &host_start,
                 int **ilist, int **jnum, const double cpu_time,
                 bool &success, double *host_q, double *boxlo, double *prd,
                 void **fieldp_ptr);
-*/
+
 int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
                 double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
                 double **host_rpole, double **host_uind, double **host_uinp,
@@ -1015,7 +1015,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
     PairAmoeba::umutual2b(field, fieldp);
     return;
   }
-/*
+
   int eflag=1, vflag=1;
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
@@ -1068,7 +1068,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
     fieldp[i][1] += fieldp_ptr[idx+1];
     fieldp[i][2] += fieldp_ptr[idx+2];
   }
-*/
+
 }

 /* ---------------------------------------------------------------------- */

 double PairAmoebaGPU::memory_usage()

From 4ebe5833d33a62c9c1afbfd248ce00bc03e9d596 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Fri, 10 Sep 2021 16:51:16 -0500
Subject: [PATCH 023/181] Working on short nbor list for the amoeba kernels (based on what has been done with tersoff and ellipsoid, nbor dev_packed needs to be allocated properly)

---
 lib/gpu/lal_amoeba.cpp      |  25 +++++---
 lib/gpu/lal_amoeba.cu       | 115 +++++++++++++++++++++++++++++++++++-
 lib/gpu/lal_base_amoeba.cpp |  26 +++---
 lib/gpu/lal_base_amoeba.h   |   5 +-
 src/GPU/pair_amoeba_gpu.cpp |   4 +-
 5 files changed, 151 insertions(+), 24 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 8bcbd6c4cb..08b3f1c9a5 100644
--- a/lib/gpu/lal_amoeba.cpp
+++
b/lib/gpu/lal_amoeba.cpp @@ -59,7 +59,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, "k_amoeba_polar", "k_amoeba_udirect2b", - "k_amoeba_umutual2b"); + "k_amoeba_umutual2b", "k_amoeba_short_nbor"); if (success!=0) return success; @@ -157,16 +157,23 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + int ainum=this->ans->inum(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - + int GX; + + GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + // NOTE: this->nbor->dev_packed is not allocated!! +/* + this->k_short_nbor.run(&this->atom->x, &_off2, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); +*/ + GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 30db5ba334..9df1dbe485 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -781,8 +781,10 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - if (r2>off2) continue; - + if (r2>off2) { + if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); + continue; + } numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1091,3 +1093,112 @@ __kernel void k_special15(__global int * dev_nbor, } // if ii } + +/* +__kernel void k_amoeba_short_nbor(const __global numtyp4 *restrict x_, + const numtyp off2, __global int * dev_nbor, + const __global int * dev_packed, + const int inum, const int nbor_pitch, + const int t_per_atom) { + int tid, ii, offset, n_stride, i; + atom_info(t_per_atom,ii,tid,offset); + + int new_numj=0; + + if (ii1) { + for (unsigned int s=t_per_atom/2; s>0; s>>=1) + new_numj += shfl_down(new_numj, s, t_per_atom); + } + if (offset==0 && iipair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,k_name_umutual2b); + compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b, + k_name_umutual2b,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else { _nbor_data=&(nbor->dev_nbor); - - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); + } + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -223,6 +227,8 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, 
add_onefive_neighbors(); } + //nbor->copy_unpacked(inum,mn); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -450,7 +456,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (nall>_max_fieldp_size) { @@ -692,7 +698,8 @@ template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b) { + const char *kname_umutual2b, + const char *kname_short_nbor) { if (_compiled) return; @@ -704,6 +711,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_polar.set_function(*pair_program,kname_polar); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); + k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 3fb752c97c..755f11610f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -55,7 +55,7 @@ class BaseAmoeba { const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -239,6 +239,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; + UCL_Kernel k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -256,7 +257,7 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b); + const char *kname_umutual2b, const char *kname_short_nbor); virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b9ee884fa0..c51f741c0a 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -297,7 +297,7 @@ void PairAmoebaGPU::init_style() // set the energy unit conversion factor for polar real-space calculation double felec = 0.5 * electric / am_dielectric; - + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, From 7f5a82dc54e699648d2372a2c52d0cab851d13a0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 00:34:43 -0500 Subject: [PATCH 024/181] Switched to the short neighbor list implementation in the pre-10Feb21 version (the recent version enforces tpa = 
1 for short nbor) --- lib/gpu/lal_amoeba.cpp | 39 ++++++++++++----- lib/gpu/lal_amoeba.cu | 86 ++++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.cpp | 23 +++++++--- lib/gpu/lal_base_amoeba.h | 7 ++- src/GPU/pair_amoeba_gpu.cpp | 2 +- 5 files changed, 103 insertions(+), 54 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 08b3f1c9a5..3a83f57594 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -141,14 +141,31 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_polar.set_size(GX,BX); this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + return GX; } @@ -163,20 +180,22 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX; + int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); + + // Build the short neighbor list if needed + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } - GX=static_cast(ceil(static_cast(ainum)/BX)); - this->k_short_nbor.set_size(GX,BX); - // NOTE: this->nbor->dev_packed is not allocated!! 
-/* - this->k_short_nbor.run(&this->atom->x, &_off2, - &this->nbor->dev_nbor, &this->nbor->dev_packed, - &ainum, &nbor_pitch, &this->_threads_per_atom); -*/ - GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 9df1dbe485..bcb3aef309 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -196,6 +196,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, const __global int *dev_packed, + const __global int *dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, __global numtyp4 *restrict tep, @@ -255,6 +256,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; int numj, nbor, nbor_end; + const __global int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -262,6 +264,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, //numtyp qtmp; fetch(qtmp,i,q_tex); //int itype=ix.w; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + ci = polar1[i].x; // rpole[i][0]; dix = polar1[i].y; // rpole[i][1]; diy = polar1[i].z; // rpole[i][2]; @@ -289,7 +299,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for ( ; nbor global_device; template -BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) { +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -100,9 +100,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, } else { _nbor_data=&(nbor->dev_nbor); } - + + bool allocate_packed = false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,false,_threads_per_atom); + _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); if (success!=0) return success; @@ -126,6 +127,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, if (ef_nall==0) ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + _max_tep_size=static_cast(static_cast(ef_nall)*1.10); _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); @@ -158,6 +161,7 @@ void BaseAmoebaT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); @@ -195,7 +199,7 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, +inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, @@ -206,7 +210,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, 
resize_atom(inum,nall,success); resize_local(inum,host_inum,nbor->max_nbors(),success); if (!success) - return; + return 0; atom->cast_copy_x(host_x,host_type); int mn; @@ -232,6 +236,7 @@ inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; + return mn; } // --------------------------------------------------------------------------- @@ -385,7 +390,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, nspecial15, special15, success); if (!success) @@ -409,6 +414,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + return nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 755f11610f..eb8938d7c4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -123,7 +123,7 @@ class BaseAmoeba { int **firstneigh, bool &success); /// Build neighbor list on device - void build_nbor_list(const int inum, const int host_inum, + int build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -236,6 +236,8 @@ class BaseAmoeba { int add_onefive_neighbors(); + UCL_D_Vec dev_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; @@ -251,8 +253,9 @@ class BaseAmoeba { bool _compiled; int _block_size, _block_bio_size, _threads_per_atom; int _extra_fields; - double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15; + double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; double _gpu_overhead, _driver_overhead; + bool short_nbor_avail; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index c51f741c0a..9fc2ea5114 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -112,7 +112,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_udirect2b_ready = true; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From c765861851c3464c9d1c93e90bba8c4e75b28aa0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 01:00:58 -0500 Subject: [PATCH 025/181] Cleaned up and re-arranged the functions to reflect the order of calling in a time step --- lib/gpu/lal_amoeba.cpp | 105 +++-- lib/gpu/lal_amoeba.cu | 890 +++++++++++++++++------------------- lib/gpu/lal_base_amoeba.cpp | 28 +- 3 files changed, 493 insertions(+), 530 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 3a83f57594..6f1e0cfaa9 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -126,49 +126,6 
@@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } -// --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep -// --------------------------------------------------------------------------- -template -int AmoebaT::polar_real(const int eflag, const int vflag) { - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - this->time_pair.start(); - - // Build the short neighbor list if needed - if (!this->short_nbor_avail) { - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; - } - - this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); - this->time_pair.stop(); - - // Signal that short nbor list is not avail for the next time step - // do it here because polar_real() is the last kernel in a time step at this point - - this->short_nbor_avail = false; - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- @@ -182,13 +139,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); - // Build the short neighbor list if needed + // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -219,9 +176,20 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + this->k_umutual2b.set_size(GX,BX); this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &_aewald, &_off2, &_polar_dscale, &_polar_uscale); @@ -230,5 +198,48 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// 
Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::polar_real(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int _nall=this->atom->nall(); + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &_off2, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, + &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_avail = false; + + return GX; +} + template class Amoeba; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index bcb3aef309..fb515c69f7 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -185,6 +185,421 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global numtyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp _fieldp[6]; + for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = 
polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,damping[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? 
+ } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_uscale; + + // find terms needed later to compute mutual polarization + // if (poltyp != DIRECT) + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + } + } + } else { // damp == 0: ??? + } + + numtyp scalek = factor_uscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip + tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[1] = bcn[1]*xr*yr; + tdipdip[2] = bcn[1]*xr*zr; + tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[4] = bcn[1]*yr*zr; + tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; + + fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; + fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; + fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; + fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; + fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; koff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); @@ -707,474 +1122,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv); } -/* ---------------------------------------------------------------------- - udirect2b = Ewald real direct field via list - udirect2b computes the real space contribution of the permanent - atomic multipole moments to the field via a neighbor list -------------------------------------------------------------------------- */ - -__kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global numtyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) -{ - int tid, ii, offset, i; - atom_info(t_per_atom,ii,tid,offset); - - int n_stride; - local_allocate_store_charge(); - - acctyp _fieldp[6]; - for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - - //numtyp4 xi__; - - if (ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); - - for ( ; nboroff2) { - if (i == 0) printf("i = 0: j = %d: r2 = %f; numj = %d\n", j, r2, numj); - continue; - } - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; - - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - - numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; - factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; - 
factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = (numtyp)1.0; - } - - // intermediates involving moments and separation distance - - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; - numtyp dkr = dkx*xr + dky*yr + dkz*zr; - numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; - numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; - numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; - numtyp qkr = qkx*xr + qky*yr + qkz*zr; - - // calculate the real space Ewald error function terms - - numtyp ralpha = aewald * r; - numtyp exp2a = ucl_exp(-ralpha*ralpha); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; - bn[0] = _erfc * rinv; - - numtyp aefac = aesq2n; - for (int m = 1; m <= 3; m++) { - numtyp bfac = (numtyp) (m+m-1); - aefac = aesq2 * aefac; - bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; - } - - // find the field components for Thole polarization damping - - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp scale7 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp) ; - scale3 = (numtyp)1.0 - expdamp ; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); - } - } else { - pgamma = MIN(pti,damping[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); - } - } - } else { // damp == 0: ??? 
- } - - numtyp scalek = factor_dscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - scalek = factor_pscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; - fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; - fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; - fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // iioff2) continue; - - numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); - numtyp r2inv = rinv*rinv; - numtyp rr1 = rinv; - numtyp rr3 = rr1 * r2inv; - numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; - - numtyp factor_uscale; - - // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) - numtyp scale3 = (numtyp)1.0; - numtyp scale5 = (numtyp)1.0; - numtyp damp = pdi * damping[jtype].x; // pdamp[jtype] - if (damp != (numtyp)0.0) { - numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype] - if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); - if (damp < (numtyp)50.0) { - numtyp expdamp = ucl_exp(-damp); - scale3 = (numtyp)1.0 - expdamp; - scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); - } - } - } else { // damp == 0: ??? - } - - numtyp scalek = factor_uscale; - bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; - bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip - tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; - tdipdip[1] = bcn[1]*xr*yr; - tdipdip[2] = bcn[1]*xr*zr; - tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; - tdipdip[4] = bcn[1]*yr*zr; - tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - - fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; - fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; - fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - - fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; - fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; - fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - - _fieldp[0] += fid[0]; - _fieldp[1] += fid[1]; - _fieldp[2] += fid[2]; - _fieldp[3] += fip[0]; - _fieldp[4] += fip[1]; - _fieldp[5] += fip[2]; - } // nbor - - } // ii> SBBITS & 3; - int j = sj & NEIGHMASK; - tagint jtag = tag[j]; - - if (!which) { - int offset=ii; - for (int k=0; k1) { - for (unsigned int s=t_per_atom/2; s>0; s>>=1) - new_numj += shfl_down(new_numj, s, t_per_atom); - } - if (offset==0 && iidev_nbor); } - bool allocate_packed = false; + bool alloc_packed=false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,allocate_packed,_threads_per_atom); + _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom); if (success!=0) return success; @@ -231,8 +231,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, add_onefive_neighbors(); } - //nbor->copy_unpacked(inum,mn); - double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -336,17 +334,17 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; From 94d6f7219c999e8b1403be4c4f993c6e850079ae Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 11 Sep 2021 11:22:17 -0500 Subject: [PATCH 026/181] Attempted to reduce the memory footprint of the per-atom arrays --- lib/gpu/lal_base_amoeba.cpp | 17 +++++++++-------- src/GPU/pair_amoeba_gpu.cpp | 3 +++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d06d7dfa57..cd86170e5f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -103,7 +103,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool alloc_packed=false; success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial, - _gpu_host,max_nbors,cell_size,alloc_packed,_threads_per_atom); + 
_gpu_host,max_nbors,cell_size,alloc_packed, + _threads_per_atom); if (success!=0) return success; @@ -123,7 +124,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, // allocate per-atom array tep - int ef_nall=nall; + int ef_nall=nlocal; //nall; if (ef_nall==0) ef_nall=2000; @@ -413,8 +414,8 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall boxlo, prd); // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); + if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); dev_short_nbor.resize((2+_max_nbors)*_nmax); } @@ -468,8 +469,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); _fieldp.resize(_max_fieldp_size*8); } *fieldp_ptr=_fieldp.host.begin(); @@ -537,8 +538,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i // ------------------- Resize _fieldp array ------------------------ - if (nall>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_fieldp_size) { + _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); _fieldp.resize(_max_fieldp_size*8); } *fieldp_ptr=_fieldp.host.begin(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 9fc2ea5114..edd51667aa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -991,6 +991,9 @@ void PairAmoebaGPU::udirect2b_cpu() tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + //printf("i = %d: j = %d: poltyp != DIRECT\n", i, j); + } else { + printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } } // jj From edd76733a10929ecb3149a928daf7c4399c42d2d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 12 Sep 2021 00:51:48 -0500 Subject: [PATCH 027/181] Working on umutual2b, tdipdip are correct, but incorrect results for field and fieldp --- lib/gpu/lal_amoeba.cu | 35 ++++++++++++++++++++++++++++++++++- lib/gpu/lal_base_amoeba.cpp | 4 ++-- src/AMOEBA/amoeba_induce.cpp | 15 +++++++++++---- src/GPU/pair_amoeba_gpu.cpp | 17 ++++++++++++----- 4 files changed, 59 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fb515c69f7..add17e2725 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -465,6 +465,10 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp pdi = damping[itype].x; numtyp ddi = damping[itype].z; + numtyp aesq2 = (numtyp)2.0 * aewald*aewald; + numtyp aesq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + for ( ; nbor_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); _tep.resize(_max_tep_size*4); } *tep_ptr=_tep.host.begin(); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 2ffd4d275b..2294f543dd 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -279,6 +279,10 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); + for (int i = 0; i < 10; i++) { + printf("i = %d; fieldp = 
%f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + //error->all(FLERR,"STOP CPU"); /* if (comm->me == 0) { printf("CPU: cutghost = %f\n", comm->cutghost[0]); @@ -369,12 +373,13 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm_pair(this); - ufield0c(field,fieldp); - - //error->all(FLERR,"STOP"); + ufield0c(field,fieldp); crstyle = FIELD; comm->reverse_comm_pair(this); + + + //error->all(FLERR,"STOP"); /* if (comm->me == 0) { printf("CPU: iter = %d\n", iter); @@ -1243,7 +1248,9 @@ void PairAmoeba::umutual2b(double **field, double **fieldp) j = jlist[jj]; uindj = uind[j]; uinpj = uinp[j]; - + //if (i==0 && j == 10) + // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", + // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*uindj[0] + tdipdip[1]*uindj[1] + tdipdip[2]*uindj[2]; fid[1] = tdipdip[1]*uindj[0] + tdipdip[3]*uindj[1] + tdipdip[4]*uindj[2]; fid[2] = tdipdip[2]*uindj[0] + tdipdip[4]*uindj[1] + tdipdip[5]*uindj[2]; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index edd51667aa..bdde1176d9 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -111,7 +111,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tep_pinned = nullptr; gpu_udirect2b_ready = true; - gpu_umutual2b_ready = false; + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -532,6 +532,14 @@ void PairAmoebaGPU::induce() comm->reverse_comm_pair(this); } + if (comm->me == 0) { + for (int i = 0; i < 10; i++) { + printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); + } + } + + //error->all(FLERR,"STOP GPU"); + /* if (comm->me == 0) { printf("GPU: cutghost = %f\n", comm->cutghost[0]); @@ -596,12 +604,12 @@ void PairAmoebaGPU::induce() ufield0c(field,fieldp); - //error->all(FLERR,"STOP"); - if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm_pair(this); } + + //error->all(FLERR,"STOP"); /* if (comm->me == 0) { printf("GPU: iter = %d\n", iter); @@ -1051,7 +1059,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) error->one(FLERR,"Insufficient memory on accelerator"); // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (udirect1) + // field and fieldp may already have some nonzero values from kspace (umutual1) int nlocal = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1071,7 +1079,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - } /* ---------------------------------------------------------------------- */ From bc665999d5659f820741fd614db488be37c4f47d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 13 Sep 2021 01:11:03 -0500 Subject: [PATCH 028/181] Fixed bugs with the umutual2b kernel, now the field and fieldp seems correct --- lib/gpu/lal_amoeba.cu | 29 ++++++++++------------------- src/GPU/pair_amoeba_gpu.cpp | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index add17e2725..f640690109 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -463,7 +463,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, igroup = polar3[i].w; // amgroup[i]; numtyp pdi = damping[itype].x; - numtyp ddi = damping[itype].z; + numtyp pti = damping[itype].y; numtyp aesq2 = 
 numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
 numtyp aesq2n = (numtyp)0.0;
@@ -502,16 +502,8 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
 numtyp ukzp = polar5[j].z; // uinp[j][2];
 numtyp factor_uscale;
- //const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
- //factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
- if (igroup == jgroup) {
- //factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
- //factor_dscale = polar_dscale;
- factor_uscale = polar_uscale;
- } else {
- //factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
- factor_uscale = (numtyp)1.0;
- }
+ if (igroup == jgroup) factor_uscale = polar_uscale;
+ else factor_uscale = (numtyp)1.0;
 // calculate the real space Ewald error function terms
@@ -535,15 +527,14 @@
 numtyp scale5 = (numtyp)1.0;
 numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
 if (damp != (numtyp)0.0) {
- numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
- if (pgamma != (numtyp)0.0) {
- damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
- if (damp < (numtyp)50.0) {
- numtyp expdamp = ucl_exp(-damp);
- scale3 = (numtyp)1.0 - expdamp;
- scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
- }
+ numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+ damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+ if (damp < (numtyp)50.0) {
+ numtyp expdamp = ucl_exp(-damp);
+ scale3 = (numtyp)1.0 - expdamp;
+ scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+ }
 } else { // damp == 0: ???
 }
diff --git a/src/GPU/pair_amoeba_gpu.cpp
index bdde1176d9..b5096b4c1c 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -524,6 +524,14 @@ void PairAmoebaGPU::induce()
 uinp[i][0], uinp[i][1], uinp[i][2]);
 }
 }
+*/
+/*
+ if (comm->me == 0) {
+ printf("GPU before\n");
+ for (int i = 0; i < 10; i++) {
+ printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]);
+ }
+ }
 */
 ufield0c(field,fieldp);
@@ -531,12 +539,14 @@
 crstyle = FIELD;
 comm->reverse_comm_pair(this);
 }
-
+/*
 if (comm->me == 0) {
+ printf("GPU after \n");
 for (int i = 0; i < 10; i++) {
 printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]);
 }
 }
+*/
 //error->all(FLERR,"STOP GPU");
@@ -841,7 +851,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
 // rebuild dipole-dipole pair list and store pairwise dipole matrices
 // done one atom at a time in real-space double loop over atoms & neighs
- udirect2b_cpu();
+ //udirect2b_cpu();
 // accumulate the field and fieldp values from the GPU lib
 // field and fieldp may already have some nonzero values from kspace (udirect1)
From 76794bef588d3df305c5fbd76e035fb9dede16f4 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Mon, 13 Sep 2021 01:16:42 -0500
Subject: [PATCH 029/181] Removed some of the debugging stuff
---
 src/GPU/pair_amoeba_gpu.cpp | 64 +++----------------------------------
 1 file changed, 5 insertions(+), 59 deletions(-)
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index b5096b4c1c..640d94972a 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -515,51 +515,16 @@ void PairAmoebaGPU::induce()
 cfstyle = INDUCE;
 comm->forward_comm_pair(this);
-/*
- if (comm->me == 0) {
- printf("GPU: cutghost = %f\n", comm->cutghost[0]);
- for (i = 0; i < 20; i++) {
- printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n",
- i, uind[i][0], uind[i][1], uind[i][2],
- uinp[i][0],
uinp[i][1], uinp[i][2]); - } - } -*/ -/* - if (comm->me == 0) { - printf("GPU before\n"); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ + ufield0c(field,fieldp); if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm_pair(this); } -/* - if (comm->me == 0) { - printf("GPU after \n"); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ //error->all(FLERR,"STOP GPU"); -/* - if (comm->me == 0) { - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < nlocal; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -620,16 +585,7 @@ void PairAmoebaGPU::induce() } //error->all(FLERR,"STOP"); -/* - if (comm->me == 0) { - printf("GPU: iter = %d\n", iter); - for (i = 0; i < 10; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ + for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -771,16 +727,7 @@ void PairAmoebaGPU::induce() memory->destroy(udir); memory->destroy(usum); memory->destroy(usump); -/* - if (comm->me == 0) { - printf("GPU: iter = %d\n", iter); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; uinp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -851,7 +798,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs - //udirect2b_cpu(); + // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (udirect1) @@ -1009,9 +956,8 @@ void PairAmoebaGPU::udirect2b_cpu() tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; - //printf("i = %d: j = %d: poltyp != DIRECT\n", i, j); } else { - printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } } // jj From a21095fded3bb490b905bcf9d22f5cd85c8fda28 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 13 Sep 2021 13:47:15 -0500 Subject: [PATCH 030/181] More cleaning up --- lib/gpu/lal_amoeba.cpp | 3 ++- src/AMOEBA/amoeba_induce.cpp | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 6f1e0cfaa9..5030025981 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -137,7 +137,8 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/(BX/this->_threads_per_atom))); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { diff --git a/src/AMOEBA/amoeba_induce.cpp 
b/src/AMOEBA/amoeba_induce.cpp index 2294f543dd..5b855abdd0 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -279,9 +279,7 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm_pair(this); - for (int i = 0; i < 10; i++) { - printf("i = %d; fieldp = %f %f %f\n", i, fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } + //error->all(FLERR,"STOP CPU"); /* if (comm->me == 0) { From 98c1a0178c8b491636b4e4328963dcdfb3546911 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:14:36 -0500 Subject: [PATCH 031/181] Refactored the API so that different off2 values are used for different kernels --- lib/gpu/lal_amoeba.cpp | 28 +++++++--------- lib/gpu/lal_amoeba.h | 5 ++- lib/gpu/lal_amoeba_ext.cpp | 36 ++++++++++----------- lib/gpu/lal_base_amoeba.cpp | 21 ++++++++---- lib/gpu/lal_base_amoeba.h | 13 +++++--- src/GPU/pair_amoeba_gpu.cpp | 64 ++++++++++++++++++++++--------------- 6 files changed, 92 insertions(+), 75 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 5030025981..8adabbe6d5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -52,8 +52,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, @@ -97,8 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda ucl_copy(sp_polar,dview,5,false); _aewald = aewald; - _felec = felec; - _off2 = off2; _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; @@ -145,7 +142,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, + &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -155,7 +152,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &_off2, + &this->_threads_per_atom, &_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -181,19 +178,18 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { if (!this->short_nbor_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, - &nbor_pitch, &this->_threads_per_atom); + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); this->short_nbor_avail = true; } this->k_umutual2b.set_size(GX,BX); this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &_off2, - &_polar_dscale, &_polar_uscale); + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &_aewald, + 
&this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; @@ -219,7 +215,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &_off2, &ainum, + &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); this->short_nbor_avail = true; } @@ -230,8 +226,8 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, - &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale); + &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); // Signal that short nbor list is not avail for the next time step diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index ea4f8b9d1d..ce30b6ab19 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -45,8 +45,7 @@ class Amoeba : public BaseAmoeba { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale); /// Clear all host and device data @@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba { /// Number of atom types int _lj_types; - numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale; + numtyp _aewald, _polar_dscale, _polar_uscale; numtyp _qqrd2e; protected: diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 5bb4dea25f..bbebaa09da 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -36,8 +36,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale, int& tep_size) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); @@ -67,7 +66,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, felec, off2, polar_dscale, polar_uscale); + aewald, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -87,7 +86,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, felec, off2, polar_dscale, polar_uscale); + aewald, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -111,16 +110,16 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + const bool eflag, const bool vflag, const bool eatom, + const 
bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **fieldp_ptr) { + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, @@ -132,13 +131,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **fieldp_ptr) { + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, @@ -147,17 +146,16 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + bool &success, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, - vatom, host_start, ilist, jnum, cpu_time, success, - host_q, boxlo, prd, tep_ptr); + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f4036ec110..2fe0e1e4b8 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -250,7 +250,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, + bool &success, const double off2_polar, const double felec, + double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -316,6 +317,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); + _off2_polar = off2_polar; + 
_felec = felec; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); @@ -437,8 +440,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void** fieldp_ptr) { + bool &success, const double off2_polar, double *host_q, + double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -475,6 +478,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i } *fieldp_ptr=_fieldp.host.begin(); + _off2_polar = off2_polar; const int red_blocks=udirect2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -506,8 +510,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void** fieldp_ptr) { + bool &success, const double off2_polar, double *host_q, + double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -544,6 +548,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i } *fieldp_ptr=_fieldp.host.begin(); + _off2_polar = off2_polar; const int red_blocks=umutual2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -574,8 +579,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { + bool &success, const double felec, const double off2_polar, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -620,6 +625,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const } *tep_ptr=_tep.host.begin(); + _off2_polar = off2_polar; + _felec = felec; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index eb8938d7c4..b14a234e7b 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring int** compute_umutual2b(const int ago, const int inum_full, const int nall, @@ -165,7 +165,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double off2_polar, double *charge, double *boxlo, 
double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -177,7 +177,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd, void **tep_ptr); + const double felec, const double off2_polar, double *charge, + double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, @@ -186,8 +187,8 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge, - const int nlocal, double *boxlo, double *prd, void **tep_ptr); + const double cpu_time, bool &success, const double felec, const double off2_polar, + double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -258,6 +259,8 @@ class BaseAmoeba { bool short_nbor_avail; UCL_D_Vec *_nbor_data; + numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_polar, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_short_nbor); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 640d94972a..f4ead3c5fa 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -59,8 +59,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double felec, - const double off2, const double polar_dscale, + const double aewald, const double polar_dscale, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); @@ -69,33 +68,30 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **fieldp_ptr); + const bool eflag, const bool vflag, const bool eatom, const 
bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd, - void **tep_ptr); + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double off2, const double felec, double *host_q, + double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -155,6 +151,15 @@ void PairAmoebaGPU::polar_real() } inum = atom->nlocal; + // select the correct cutoff for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, @@ -162,7 +167,7 @@ void PairAmoebaGPU::polar_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); @@ -278,11 +283,11 @@ void PairAmoebaGPU::init_style() // select the squared cutoff (off2) for neighbor list builds (the polar term for now) // NOTE: induce and polar terms are using the same flags here - +/* if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - - double cell_size = sqrt(off2) + neighbor->skin; +*/ + double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; int maxspecial15=0; @@ -303,8 +308,7 @@ void PairAmoebaGPU::init_style() special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - aewald, felec, off2, polar_dscale, polar_uscale, - tep_size); + aewald, polar_dscale, polar_uscale, tep_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -784,13 +788,18 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) } inum = atom->nlocal; + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, uind, uinp, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1003,13 +1012,18 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } inum = atom->nlocal; + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, 
uind, uinp, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, + success, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From c0b967054e144e74e2365aa50790410006aa540e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:27:44 -0500 Subject: [PATCH 032/181] Fixed bugs with zero local atoms (similar to what has been done to PPPM interp) --- lib/gpu/lal_amoeba.cpp | 28 +++++++++++++++++++--------- lib/gpu/lal_pppm.cpp | 6 ++++-- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8adabbe6d5..6bf93a3eb5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -128,14 +128,18 @@ double AmoebaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + int _nall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); - int ainum=this->ans->inum(); // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); + this->time_pair.start(); // Build the short neighbor list if not done yet if (!this->short_nbor_avail) { @@ -164,14 +168,17 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::umutual2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); // Build the short neighbor list if not done yet @@ -200,14 +207,17 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); - - int _nall=this->atom->nall(); - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp index 6e8fe237a6..87ab6fe775 100644 --- a/lib/gpu/lal_pppm.cpp +++ b/lib/gpu/lal_pppm.cpp @@ -342,13 +342,15 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) { vd_brick.update_device(true); time_in.stop(); + int ainum=this->ans->inum(); + if (ainum==0) + return; + time_interp.start(); // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - int ainum=this->ans->inum(); - 
k_interp.set_size(GX,BX); k_interp.run(&atom->x, &atom->q, &ainum, &vd_brick, &d_rho_coeff, &_npts_x, &_npts_yx, &_brick_x, &_brick_y, &_brick_z, &_delxinv, From 6293da766142ce0da03be8b0ebc39027c17354b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 16 Sep 2021 17:30:56 -0500 Subject: [PATCH 033/181] Cleaned up a bit --- src/GPU/pair_amoeba_gpu.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f4ead3c5fa..e636e824d3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -169,7 +169,6 @@ void PairAmoebaGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); - if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -281,12 +280,6 @@ void PairAmoebaGPU::init_style() } } - // select the squared cutoff (off2) for neighbor list builds (the polar term for now) - // NOTE: induce and polar terms are using the same flags here -/* - if (use_ewald) choose(POLAR_LONG); - else choose(POLAR); -*/ double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; @@ -298,11 +291,6 @@ void PairAmoebaGPU::init_style() int tep_size; int mnf = 5e-2 * neighbor->oneatom; - - // set the energy unit conversion factor for polar real-space calculation - - double felec = 0.5 * electric / am_dielectric; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, From 003bebd31e60118295b38687627b6108cf4f4b4d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 01:19:33 -0500 Subject: [PATCH 034/181] Working on the multipole real-space term, not ready yet --- lib/gpu/lal_amoeba.cpp | 51 +++++- lib/gpu/lal_amoeba.cu | 343 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_amoeba.h | 2 + lib/gpu/lal_amoeba_ext.cpp | 34 +++- lib/gpu/lal_base_amoeba.cpp | 95 +++++++++- lib/gpu/lal_base_amoeba.h | 25 ++- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 332 ++++++++++++++++++++-------------- src/GPU/pair_amoeba_gpu.h | 6 +- 9 files changed, 729 insertions(+), 161 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 6bf93a3eb5..60bc365d12 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { template int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, + const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -57,8 +58,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_polar", "k_amoeba_udirect2b", - "k_amoeba_umutual2b", "k_amoeba_short_nbor"); + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_short_nbor"); if (success!=0) return success; @@ -91,7 +93,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda dview[i].x=host_special_polar_wscale[i]; dview[i].y=host_special_polar_piscale[i]; dview[i].z=host_special_polar_pscale[i]; - dview[i].w=(numtyp)0; + dview[i].w=host_special_mpole[i]; } 
ucl_copy(sp_polar,dview,5,false); @@ -123,6 +125,47 @@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } +// --------------------------------------------------------------------------- +// Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int AmoebaT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_avail = true; + } + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f640690109..375592e338 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -44,6 +44,27 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() \ __local acctyp red_acc[6][BLOCK_PAIR]; +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ @@ -130,6 +151,19 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ @@ -185,6 +219,315 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? 
(A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of multipole + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp felec, + const numtyp off2, const numtyp polar_dscale, + const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_ufld(); + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0; + + numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + //numtyp4 xi__; + + if (iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp 
qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m < 6; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - 4.0*qik; + term5 = qir*qkr; + + numtyp scalek = 1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + 
term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += tix; + tq.y += tiy; + tq.z += tiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii { * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -79,6 +80,7 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; + int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index bbebaa09da..3e1fbe47b6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -30,6 +30,7 @@ static Amoeba AMOEBAMF; int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -63,10 +64,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_polar_wscale, host_special_polar_piscale, - host_special_polar_pscale, nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, polar_dscale, polar_uscale); + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + nlocal, nall, max_nbors, maxspecial, maxspecial15, + cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -83,10 +84,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, } if (gpu_rank==i && world_me!=0) init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_polar_wscale, host_special_polar_piscale, - host_special_polar_pscale, nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, screen, - aewald, polar_dscale, polar_uscale); + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + nlocal, nall, 
max_nbors, maxspecial, maxspecial15, + cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -104,6 +105,23 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } +int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); +} + int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 2fe0e1e4b8..585061e095 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -36,9 +36,10 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; - k_polar.clear(); + k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); + k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); if (pair_program) delete pair_program; @@ -56,9 +57,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name_polar, + const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, + const char *k_name_polar, const char *k_name_short_nbor) { screen=_screen; @@ -91,8 +93,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b, - k_name_umutual2b,k_name_short_nbor); + compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b, + k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -425,6 +427,85 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2_mpole, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + 
acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>_max_tep_size) { + _max_tep_size=static_cast(static_cast(inum_full)*1.10); + _tep.resize(_max_tep_size*4); + } + *tep_ptr=_tep.host.begin(); + + _off2_mpole = off2_mpole; + _felec = felec; + const int red_blocks=multipole_real(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + // copy tep from device to host + + _tep.update_host(_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute the direct real space part // of the permanent field @@ -713,9 +794,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname_polar, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor) { if (_compiled) return; @@ -725,9 +807,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_polar.set_function(*pair_program,kname_polar); + k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); + k_polar.set_function(*pair_program,kname_polar); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index b14a234e7b..1762f156d3 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,8 +54,9 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const 
double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -141,6 +142,18 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring + int** compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -241,7 +254,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15; UCL_Kernel k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -262,9 +275,11 @@ class BaseAmoeba { numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_polar, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor); + virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index b2318d296e..72c142888e 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -352,7 +352,7 @@ class PairAmoeba : public Pair { void dispersion_kspace(); void multipole(); - void multipole_real(); + virtual void multipole_real(); void multipole_kspace(); void polar(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index e636e824d3..30b35919c1 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -53,6 +53,7 @@ enum{GORDON1,GORDON2}; int amoeba_gpu_init(const int ntypes, const int max_amtype, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -63,6 +64,15 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const double polar_uscale, int& 
tep_size); void amoeba_gpu_clear(); +int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double felec, const double off2, double *host_q, + double *boxlo, double *prd, void **tep_ptr); + int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -90,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, const double felec, double *host_q, + bool &success, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -106,6 +116,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tep_pinned = nullptr; + gpu_multipole_real_ready = false; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; @@ -122,139 +133,6 @@ PairAmoebaGPU::~PairAmoebaGPU() amoeba_gpu_clear(); } -/* ---------------------------------------------------------------------- */ - -void PairAmoebaGPU::polar_real() -{ - if (!gpu_polar_real_ready) { - PairAmoeba::polar_real(); - return; - } - - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - // select the correct cutoff for the term - - if (use_ewald) choose(POLAR_LONG); - else choose(POLAR); - - // set the energy unit conversion factor for polar real-space calculation - - double felec = 0.5 * electric / am_dielectric; - - firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); - - // reference to the tep array from GPU lib - - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr); - } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr); - } -} - -/* ---------------------------------------------------------------------- - init specific to this pair style -------------------------------------------------------------------------- */ - -template -void 
PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr) -{ - int i,ix,iy,iz; - double ci,dix,diy,diz; - double qixx,qixy,qixz; - double qiyy,qiyz,qizz; - double xix,yix,zix; - double xiy,yiy,ziy; - double xiz,yiz,ziz; - double vxx,vyy,vzz; - double vxy,vxz,vyz; - double fix[3],fiy[3],fiz[3],tep[4]; - - double** x = atom->x; - int nlocal = atom->nlocal; - - for (i = 0; i < nlocal; i++) { - dix = rpole[i][1]; - diy = rpole[i][2]; - diz = rpole[i][3]; - qixx = rpole[i][4]; - qixy = rpole[i][5]; - qixz = rpole[i][6]; - qiyy = rpole[i][8]; - qiyz = rpole[i][9]; - qizz = rpole[i][12]; - - tep[0] = tep_ptr[4*i]; - tep[1] = tep_ptr[4*i+1]; - tep[2] = tep_ptr[4*i+2]; - - torque2force(i,tep,fix,fiy,fiz,fpolar); - - iz = zaxis2local[i]; - ix = xaxis2local[i]; - iy = yaxis2local[i]; - - xiz = x[iz][0] - x[i][0]; - yiz = x[iz][1] - x[i][1]; - ziz = x[iz][2] - x[i][2]; - xix = x[ix][0] - x[i][0]; - yix = x[ix][1] - x[i][1]; - zix = x[ix][2] - x[i][2]; - xiy = x[iy][0] - x[i][0]; - yiy = x[iy][1] - x[i][1]; - ziy = x[iy][2] - x[i][2]; - - vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; - vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; - vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + - xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + - xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + - yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - - virpolar[0] += vxx; - virpolar[1] += vyy; - virpolar[2] += vzz; - virpolar[3] += vxy; - virpolar[4] += vxz; - virpolar[5] += vyz; - } -} - /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ @@ -292,7 +170,7 @@ void PairAmoebaGPU::init_style() int tep_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, - special_polar_wscale, special_polar_piscale, + special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, @@ -308,6 +186,68 @@ void PairAmoebaGPU::init_style() tep_single = true; } +/* ---------------------------------------------------------------------- */ + +void PairAmoebaGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + firstneigh = amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, felec, 
off2, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tep_single) { + float *tep_ptr = (float *)tep_pinned; + compute_force_from_tep(tep_ptr, fmpole, virmpole); + } else { + double *tep_ptr = (double *)tep_pinned; + compute_force_from_tep(tep_ptr, fmpole, virmpole); + } +} + /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine @@ -1041,6 +981,128 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) /* ---------------------------------------------------------------------- */ +void PairAmoebaGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, felec, off2, atom->q, domain->boxlo, + domain->prd, &tep_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tep_single) { + float *tep_ptr = (float *)tep_pinned; + compute_force_from_tep(tep_ptr, fpolar, virpolar); + } else { + double *tep_ptr = (double *)tep_pinned; + compute_force_from_tep(tep_ptr, fpolar, virpolar); + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +template +void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],tep[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + tep[0] = tep_ptr[4*i]; + tep[1] = tep_ptr[4*i+1]; + tep[2] = tep_ptr[4*i+2]; + + torque2force(i,tep,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + 
yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] += vxx; + virial_comp[1] += vyy; + virial_comp[2] += vzz; + virial_comp[3] += vxy; + virial_comp[4] += vxz; + virial_comp[5] += vyz; + } +} + +/* ---------------------------------------------------------------------- */ + double PairAmoebaGPU::memory_usage() { double bytes = Pair::memory_usage(); diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 4dc547e469..a913449a62 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,9 +35,10 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); - virtual void polar_real(); + virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void polar_real(); private: int gpu_mode; @@ -46,6 +47,7 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tep_single; + bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; @@ -53,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { void udirect2b_cpu(); template - void compute_force_from_tep(const numtyp*); + void compute_force_from_tep(const numtyp*, double**, double*); }; } // namespace LAMMPS_NS From d9267059505be34d3a7c9469b6b7e071eb2b6219 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 01:32:00 -0500 Subject: [PATCH 035/181] Short neighbor list for multipole real-space should be built with off2_mpole --- lib/gpu/lal_amoeba.cpp | 17 ++++++++--------- lib/gpu/lal_amoeba.cu | 2 +- src/GPU/pair_amoeba_gpu.cpp | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 60bc365d12..d109c98c42 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -143,15 +143,14 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &this->_off2_polar, &ainum, - &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; - } + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 375592e338..e14cb99328 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -826,7 +826,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - if (r2>off2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 30b35919c1..67c9d6109f 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -116,7 
+116,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tep_pinned = nullptr; - gpu_multipole_real_ready = false; + gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; From 2e6df83b9b7979ed2f2a79591a18c520ff6e94fc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 15:24:36 -0500 Subject: [PATCH 036/181] Fixed bugs in the multipole real-space part on the GPU; separately multipole real and polar real work correctly (along with udirect2b and umutual2b), but together they are conflicting due to the use of ans to copy forces back from device to host. The other 2 kernels (induce part) do not touch forces and energies. --- lib/gpu/lal_amoeba.cpp | 12 +++---- lib/gpu/lal_amoeba.cu | 62 +++++++++++++++------------------ lib/gpu/lal_amoeba.h | 5 ++- lib/gpu/lal_amoeba_ext.cpp | 24 ++++++------- lib/gpu/lal_base_amoeba.cpp | 23 +++++++----- lib/gpu/lal_base_amoeba.h | 15 ++++---- src/AMOEBA/amoeba_multipole.cpp | 30 +++++++++++----- src/AMOEBA/amoeba_polar.cpp | 8 +++-- src/AMOEBA/pair_amoeba.cpp | 3 ++ src/GPU/pair_amoeba_gpu.cpp | 45 ++++++++++++------------ 10 files changed, 123 insertions(+), 104 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index d109c98c42..af71decb86 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -53,8 +53,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double polar_dscale, - const double polar_uscale) { + const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, @@ -97,7 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda } ucl_copy(sp_polar,dview,5,false); - _aewald = aewald; _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; @@ -158,7 +156,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_threads_per_atom, &this->_aewald, &this->_felec, &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -198,7 +196,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_off2_polar, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -237,7 +235,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, - &nbor_pitch, &this->_threads_per_atom, &_aewald, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); @@ -278,7 +276,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, 
&_nall, &nbor_pitch, - &this->_threads_per_atom, &_aewald, &this->_felec, + &this->_threads_per_atom, &this->_aewald, &this->_felec, &this->_off2_polar, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index e14cb99328..910316d289 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -225,20 +225,20 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global numtyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp felec, + const numtyp off2, const numtyp polar_dscale, + const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -257,7 +257,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } acctyp4 tq; - tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; numtyp4* polar1 = (numtyp4*)(&extra[0]); @@ -272,7 +272,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp term1,term2,term3; numtyp term4,term5; numtyp term6,term7; - numtyp rc3[3],rc5[3],rc7[3]; numtyp bn[6]; numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp; @@ -309,9 +308,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // debug: // xi__ = ix; xi__.w = itype; - numtyp pdi = damping[itype].x; - numtyp pti = damping[itype].y; - for ( ; nboroff2) continue; + if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; numtyp dky = polar1[j].z; // rpole[j][2]; @@ -363,7 +358,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -452,8 +447,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); term4 = dir*qkr - dkr*qir - 4.0*qik; term5 = qir*qkr; - - numtyp scalek = 1.0 - factor_mpole; + numtyp scalek = (numtyp)1.0 - factor_mpole; rr1 = bn[0] - scalek*rr1; rr3 = bn[1] - scalek*rr3; rr5 = bn[2] - scalek*rr5; @@ -485,11 +479,11 @@ __kernel void 
k_amoeba_multipole(const __global numtyp4 *restrict x_, // compute the torque components for this interaction - numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -497,16 +491,16 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, f.x += frcx; f.y += frcy; f.z += frcz; - tq.x += tix; - tq.y += tiy; - tq.z += tiz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; if (EVFLAG && vflag) { numtyp vxx = -xr * frcx; - numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy); - numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz); + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); numtyp vyy = -yr * frcy; - numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz); + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; virial[0] += vxx; @@ -520,7 +514,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } // ii { const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, - const double aewald, const double polar_dscale, - const double polar_uscale); + const double polar_dscale, const double polar_uscale); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba { /// Number of atom types int _lj_types; - numtyp _aewald, _polar_dscale, _polar_uscale; + numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; protected: diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 3e1fbe47b6..8493e9331d 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -37,8 +37,8 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double polar_dscale, - const double polar_uscale, int& tep_size) { + const double polar_dscale, const double polar_uscale, + int& tep_size) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); double gpu_split=AMOEBAMF.device->particle_split(); @@ -67,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); + cell_size, gpu_split, screen, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -87,7 +87,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale); + cell_size, gpu_split, screen, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) @@ -113,13 +113,13 @@ int** 
amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, + bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, @@ -131,13 +131,13 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, @@ -149,13 +149,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); } int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, @@ -167,13 +167,13 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, + bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 585061e095..3480ce55db 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ 
b/lib/gpu/lal_base_amoeba.cpp @@ -252,8 +252,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, const double off2_polar, const double felec, - double *host_q, const int nlocal, + bool &success, const double aewald, const double felec, + const double off2_polar, double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -440,7 +440,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2_mpole, + bool &success, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; @@ -488,6 +488,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _off2_mpole = off2_mpole; _felec = felec; + _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -521,8 +522,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2_polar, double *host_q, - double *boxlo, double *prd, void** fieldp_ptr) { + bool &success, const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -560,6 +561,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; + _aewald = aewald; const int red_blocks=udirect2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -591,8 +593,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2_polar, double *host_q, - double *boxlo, double *prd, void** fieldp_ptr) { + bool &success, const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -630,6 +632,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; + _aewald = aewald; const int red_blocks=umutual2b(eflag,vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -660,8 +663,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2_polar, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + bool &success, const double aewald, const double felec, + const double off2_polar, double *host_q, double *boxlo, + double *prd, void **tep_ptr) { 
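  // compute_polar_real() mirrors the drivers above (e.g. compute_multipole_real()):
  // encode the energy/virial request below, set the per-call constants
  // (_off2_polar, _felec, _aewald), launch the polar real-space kernel, then
  // copy the accumulated answers and the per-atom tep (torque) buffer back to
  // the host.  The eflag/vflag encoding used by these drivers is:
  //   0 = no accumulation, 1 = global energy/virial only,
  //   2 = per-atom terms as well (used when eatom/vatom is set, or
  //       unconditionally when LAL_NO_BLOCK_REDUCE is defined).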
acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -708,6 +712,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _off2_polar = off2_polar; _felec = felec; + _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 1762f156d3..0b6c09742e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,7 +151,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double felec, const double off2_mpole, double *charge, + const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring @@ -165,7 +165,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, double *charge, + double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring int** compute_umutual2b(const int ago, const int inum_full, const int nall, @@ -178,7 +179,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, double *charge, + double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring int** compute_polar_real(const int ago, const int inum_full, const int nall, @@ -190,7 +192,7 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double felec, const double off2_polar, double *charge, + const double aewald, const double felec, const double off2_polar, double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) @@ -200,7 +202,7 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double felec, const double off2_polar, + const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -272,7 +274,8 @@ class BaseAmoeba { bool short_nbor_avail; UCL_D_Vec *_nbor_data; - numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + numtyp _aewald,_felec; + numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 
c06f07d70c..62255db6f2 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -369,6 +369,9 @@ void PairAmoeba::multipole_real() bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2; } for (k = 0; k < 6; k++) bn[k] *= felec; + //if (i == 0 && j < 10) { + // printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]); + //} // find damped multipole intermediates and energy value @@ -447,6 +450,10 @@ void PairAmoeba::multipole_real() rr9 = bn[4] - scalek*rr9; rr11 = bn[5] - scalek*rr11; e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + if (i == 0 && j < 10) { + //printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5); + //printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]); + } // find standard multipole intermediates for force and torque @@ -457,6 +464,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; + } empole += e; @@ -515,16 +523,20 @@ void PairAmoeba::multipole_real() tq[i][0] += ttmi[0]; tq[i][1] += ttmi[1]; tq[i][2] += ttmi[2]; - + //if (i == 0 && j < 10) { + // printf("j = %d: erfc = %f; f: %f %f %f; tq = %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]); + //printf("j = %d: terms: %f %f %f; tq = %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz); + //} // increment force-based gradient and torque on second site - + // commenting out j parts for DEBUGGING + fmpole[j][0] -= frcx; fmpole[j][1] -= frcy; fmpole[j][2] -= frcz; tq[j][0] += ttmk[0]; tq[j][1] += ttmk[1]; tq[j][2] += ttmk[2]; - + // increment the virial due to pairwise Cartesian forces vxx = -xr * frcx; @@ -556,10 +568,11 @@ void PairAmoeba::multipole_real() comm->reverse_comm_pair(this); // resolve site torques then increment forces and virial - + printf("compute multipole real\n"); for (i = 0; i < nlocal; i++) { - torque2force(i,tq[i],fix,fiy,fiz,fmpole); - + if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); + torque2force(i,tq[i],fix,fiy,fiz,fmpole); + if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -575,15 +588,16 @@ void PairAmoeba::multipole_real() ziy = x[iy][2] - x[i][2]; vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virmpole[0] += vxx; virmpole[1] += vyy; virmpole[2] += vzz; diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 1503243220..659194ac0b 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real() comm->reverse_comm_pair(this); // torque is induced field and gradient cross permanent moments - + printf("compute polar real\n"); for (i = 0; i < nlocal; i++) { dix = rpole[i][1]; diy = rpole[i][2]; @@ -1197,8 
+1197,10 @@ void PairAmoeba::polar_real() qiyz*dufld[i][3] - qixz*dufld[i][4] + 2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1]; + if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); torque2force(i,tep,fix,fiy,fiz,fpolar); - + if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -1222,7 +1224,7 @@ void PairAmoeba::polar_real() xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virpolar[0] += vxx; virpolar[1] += vyy; virpolar[2] += vzz; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index f9e098e884..5157739f0e 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -972,6 +972,9 @@ void PairAmoeba::init_style() // request neighbor lists int irequest = neighbor->request(this,instance_me); + // for DEBUGGING with GPU + //neighbor->requests[irequest]->half = 0; + //neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 67c9d6109f..d33b8d1431 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -60,8 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double aewald, const double polar_dscale, - const double polar_uscale, int& tep_size); + const double polar_dscale, const double polar_uscale, int& tep_size); void amoeba_gpu_clear(); int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, @@ -70,8 +69,8 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, double *host_q, - double *boxlo, double *prd, void **tep_ptr); + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, @@ -80,7 +79,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double *host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, @@ -90,7 +89,7 @@ int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nal tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double off2, double 
*host_q, + bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr); int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, @@ -100,8 +99,8 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double felec, const double off2, double *host_q, - double *boxlo, double *prd, void **tep_ptr); + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); double amoeba_gpu_bytes(); @@ -119,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -174,7 +173,7 @@ void PairAmoebaGPU::init_style() special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - aewald, polar_dscale, polar_uscale, tep_size); + polar_dscale, polar_uscale, tep_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -231,14 +230,14 @@ void PairAmoebaGPU::multipole_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tep_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - + printf("compute multipole real\n"); if (tep_single) { float *tep_ptr = (float *)tep_pinned; compute_force_from_tep(tep_ptr, fmpole, virmpole); @@ -727,7 +726,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, off2, atom->q, domain->boxlo, + success, aewald, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -951,7 +950,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, off2, atom->q, domain->boxlo, + success,aewald, off2, atom->q, domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1008,7 +1007,7 @@ void PairAmoebaGPU::polar_real() } inum = atom->nlocal; - // select the correct cutoff for the term + // select the correct cutoff and aewald for the term if (use_ewald) choose(POLAR_LONG); else choose(POLAR); @@ -1024,14 +1023,14 @@ void PairAmoebaGPU::polar_real() atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, - success, felec, off2, atom->q, domain->boxlo, + success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tep_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - + printf("compute polar real\n"); if (tep_single) { float *tep_ptr = (float *)tep_pinned; 
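    // tep_pinned is the opaque (void *) handle to the per-atom torque buffer
    // returned by the GPU library; its element type depends on the precision
    // the library was built with, so tep_single (set in init_style() from the
    // size reported back by amoeba_gpu_init()) selects the float or double
    // cast before the torques are converted into forces and virial terms.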
compute_force_from_tep(tep_ptr, fpolar, virpolar); @@ -1066,7 +1065,9 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, tep[1] = tep_ptr[4*i+1]; tep[2] = tep_ptr[4*i+2]; + if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); torque2force(i,tep,fix,fiy,fiz,force_comp); + if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; @@ -1086,12 +1087,12 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + - xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + - xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + - yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virial_comp[0] += vxx; virial_comp[1] += vyy; virial_comp[2] += vzz; From f5713a52b34e168d725b9ca4a471b484f02596a2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 16:39:57 -0500 Subject: [PATCH 037/181] Added another kernel to accumulate forces, energies and virial on the device (similar to the tersoff kernels) as multiple kernels all added to those quantities; also only copy answers back to the host in the last kernel in a time step; cleaned up debugging messages --- lib/gpu/lal_amoeba.cu | 197 ++++++++++++++++++++++++++++++-- lib/gpu/lal_base_amoeba.cpp | 11 +- lib/gpu/lal_tersoff.cu | 6 +- src/AMOEBA/amoeba_multipole.cpp | 11 +- src/AMOEBA/amoeba_polar.cpp | 4 +- src/GPU/pair_amoeba_gpu.cpp | 2 +- 6 files changed, 204 insertions(+), 27 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 910316d289..49c0d78d7f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -147,6 +147,70 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } +#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 
6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && iicopy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -490,8 +491,11 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _felec = felec; _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - device->add_ans_object(ans); + + // leave the answers (forces, energies and virial) on the device, only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + hd_balancer.stop_timer(); // copy tep from device to host @@ -714,8 +718,11 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _felec = felec; _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); + + // only copy answers (forces, energies and virial) back from the device in the last kernel (which is polar_real here) ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); + hd_balancer.stop_timer(); // copy tep from device to host diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index 8baa5ce12a..feab8bb5c0 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,6 +106,7 @@ _texture_2d( pos_tex,int4); } \ } +// (SHUFFLE_AVAIL == 1) #else #define local_allocate_acc_zeta() @@ -202,6 +203,7 @@ _texture_2d( pos_tex,int4); } \ } +// EVFLAG == 0 #else #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ @@ -216,8 +218,8 @@ _texture_2d( pos_tex,int4); ans[ii]=old; \ } -#endif -#endif +#endif // EVFLAG +#endif // SHUFFLE_AVAIL #ifdef LAL_SIMD_IP_SYNC #define t_per_atom t_per_atom_in diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 62255db6f2..3f5c9082e7 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -523,10 +523,7 @@ void PairAmoeba::multipole_real() tq[i][0] += ttmi[0]; tq[i][1] += ttmi[1]; tq[i][2] += ttmi[2]; - //if (i == 0 && j < 10) { - // printf("j = %d: erfc = %f; f: %f %f %f; tq = %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]); - //printf("j = %d: terms: %f %f %f; tq = %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz); - //} + // increment force-based gradient and torque on second site // commenting out j parts for DEBUGGING @@ -568,11 +565,10 @@ void PairAmoeba::multipole_real() comm->reverse_comm_pair(this); // resolve site torques then increment forces and virial - printf("compute multipole real\n"); + for (i = 0; i < nlocal; i++) { - if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]); torque2force(i,tq[i],fix,fiy,fiz,fmpole); - if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], 
fmpole[i][2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; @@ -597,7 +593,6 @@ void PairAmoeba::multipole_real() vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); virmpole[0] += vxx; virmpole[1] += vyy; virmpole[2] += vzz; diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 659194ac0b..f4acf3e7a8 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real() comm->reverse_comm_pair(this); // torque is induced field and gradient cross permanent moments - printf("compute polar real\n"); + for (i = 0; i < nlocal; i++) { dix = rpole[i][1]; diy = rpole[i][2]; @@ -1197,9 +1197,7 @@ void PairAmoeba::polar_real() qiyz*dufld[i][3] - qixz*dufld[i][4] + 2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1]; - if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); torque2force(i,tep,fix,fiy,fiz,fpolar); - if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]); iz = zaxis2local[i]; ix = xaxis2local[i]; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d33b8d1431..dcf7d95047 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -118,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From 78045d8f7621cc12cb60beb4fd9d9008bb1c65e3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 23:13:51 -0500 Subject: [PATCH 038/181] Cleaned up debugging stuffs and unused variables --- lib/gpu/lal_amoeba.cu | 68 +++++++++------------ src/GPU/pair_amoeba_gpu.cpp | 116 ++++++++++++++++++------------------ src/GPU/pair_amoeba_gpu.h | 6 +- 3 files changed, 89 insertions(+), 101 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 49c0d78d7f..41185f30e3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -62,7 +62,7 @@ _texture( q_tex,int2); tq.z=red_acc[2][tid]; \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ if (EVFLAG && (vflag==2 || eflag==2)) { \ if (eflag) { \ simdsync(); \ - simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ if (vflag) { \ simdsync(); \ @@ -174,7 +174,7 @@ _texture( q_tex,int2); if (eflag!=2 && vflag!=2) { \ if (eflag) { \ simdsync(); \ - block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ + block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \ if (vflag) __syncthreads(); \ if (tid==0) { \ engv[ei]+=energy*(acctyp)0.5; \ @@ -225,7 +225,7 @@ _texture( q_tex,int2); } \ } \ if (offset==0 && ii1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ if (vflag==2 || eflag==2) { \ if (eflag) \ - simd_reduce_add2(t_per_atom,energy,e_coul); \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ if (vflag) \ simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ if (offset==0 && ii1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -402,20 +400,20 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void 
k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict damping, - const __global numtyp4 *restrict sp_polar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global numtyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp felec, - const numtyp off2, const numtyp polar_dscale, - const numtyp polar_uscale) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict damping, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global numtyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iioff2) continue; + //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); numtyp ck = polar1[j].x; // rpole[j][0]; @@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, for (m = 1; m < 6; m++) { bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; - bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } for (m = 0; m < 6; m++) bn[m] *= felec; term1 = ci*ck; term2 = ck*dir - ci*dkr + dik; - term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk); - term4 = dir*qkr - dkr*qir - 4.0*qik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; term5 = qir*qkr; numtyp scalek = (numtyp)1.0 - factor_mpole; rr1 = bn[0] - scalek*rr1; @@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - //numtyp4 xi__; - if (iimaxspecial15; } - int tep_size; + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tep_size); + polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tep_size == sizeof(double)) - tep_single = false; + if (tq_size == sizeof(double)) + tq_single = false; else - tep_single = true; + tq_single = true; } /* ---------------------------------------------------------------------- */ @@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real() eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tep_pinned); + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient 
memory on accelerator"); // reference to the tep array from GPU lib - printf("compute multipole real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fmpole, virmpole); + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); } } @@ -681,7 +681,6 @@ void PairAmoebaGPU::induce() } } - /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent @@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs - + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, uind, uinp, - sublo, subhi, atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, domain->boxlo, - domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real() double felec = 0.5 * electric / am_dielectric; firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, domain->boxlo, - domain->prd, &tep_pinned); + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, 
cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // reference to the tep array from GPU lib - printf("compute polar real\n"); - if (tep_single) { - float *tep_ptr = (float *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + + if (tq_single) { + float *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } else { - double *tep_ptr = (double *)tep_pinned; - compute_force_from_tep(tep_ptr, fpolar, virpolar); + double *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); } } /* ---------------------------------------------------------------------- - init specific to this pair style + compute atom forces from torques ------------------------------------------------------------------------- */ template -void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, - double** force_comp, - double* virial_comp) +void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) { int i,ix,iy,iz; double xix,yix,zix; @@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, double xiz,yiz,ziz; double vxx,vyy,vzz; double vxy,vxz,vyz; - double fix[3],fiy[3],fiz[3],tep[4]; + double fix[3],fiy[3],fiz[3],_tq[4]; double** x = atom->x; int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { - tep[0] = tep_ptr[4*i]; - tep[1] = tep_ptr[4*i+1]; - tep[2] = tep_ptr[4*i+2]; - - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); - torque2force(i,tep,fix,fiy,fiz,force_comp); - if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i]; ix = xaxis2local[i]; @@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr, xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz); + virial_comp[0] += vxx; virial_comp[1] += vyy; virial_comp[2] += vzz; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index a913449a62..d9a3fc5904 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba { private: int gpu_mode; double cpu_time; - void *tep_pinned; + void *tq_pinned; void *fieldp_pinned; - bool tep_single; + bool tq_single; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; @@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { void udirect2b_cpu(); template - void compute_force_from_tep(const numtyp*, double**, double*); + void compute_force_from_torque(const numtyp*, double**, double*); }; } // namespace LAMMPS_NS From 5d801e985fd78e631c928cb15aaa85be1529ab98 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 17 Sep 2021 23:24:23 -0500 Subject: [PATCH 039/181] More cleanup --- lib/gpu/lal_amoeba.cpp | 14 ++-- lib/gpu/lal_base_amoeba.cpp | 125 ++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 11 ++-- 3 files changed, 82 insertions(+), 68 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index af71decb86..d2f2b1bf79 100644 --- a/lib/gpu/lal_amoeba.cpp +++ 
b/lib/gpu/lal_amoeba.cpp @@ -182,13 +182,13 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_udirect2b.set_size(GX,BX); @@ -222,13 +222,13 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_umutual2b.set_size(GX,BX); @@ -261,13 +261,13 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list if not done yet - if (!this->short_nbor_avail) { + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_off2_polar, &ainum, &nbor_pitch, &this->_threads_per_atom); - this->short_nbor_avail = true; + this->short_nbor_polar_avail = true; } this->k_polar.set_size(GX,BX); @@ -283,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { // Signal that short nbor list is not avail for the next time step // do it here because polar_real() is the last kernel in a time step at this point - this->short_nbor_avail = false; + this->short_nbor_polar_avail = false; return GX; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f70903c889..e777981912 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -21,7 +21,7 @@ namespace LAMMPS_AL { extern Device global_device; template -BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_avail(false) { +BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_avail(false) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -241,11 +241,12 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, } // --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. 
+// Copy nbor list from host if necessary and then calculate forces, virials +// for the polar real-space term // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, +void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, int **firstneigh, @@ -432,17 +433,20 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2_mpole, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { +int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -492,7 +496,8 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co _aewald = aewald; const int red_blocks=multipole_real(eflag,vflag); - // leave the answers (forces, energies and virial) on the device, only copy them back in the last kernel (polar_real) + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //device->add_ans_object(ans); @@ -516,18 +521,21 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co // of the permanent field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +int** BaseAmoebaT::compute_udirect2b(const 
int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -587,18 +595,21 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i // of the induced field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -657,19 +668,21 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, - const double off2_polar, double *host_q, double *boxlo, - double *prd, void **tep_ptr) { +int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const 
double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_polar, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -719,7 +732,8 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const _aewald = aewald; const int red_blocks=polar_real(eflag,vflag); - // only copy answers (forces, energies and virial) back from the device in the last kernel (which is polar_real here) + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); @@ -746,8 +760,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp) { - + double** uind, double** uinp) { // signal that we need to transfer extra data from the host atom->extra_data_unavail(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 0b6c09742e..a45316b6f3 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -192,8 +192,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double felec, const double off2_polar, double *charge, - double *boxlo, double *prd, void **tep_ptr); + const double aewald, const double felec, const double off2_polar, + double *charge, double *boxlo, double *prd, void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, @@ -202,8 +202,9 @@ class BaseAmoeba { double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar, - double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); + const double cpu_time, bool &success, const double aewald, const double felec, + const double off2_polar, double *charge, const int nlocal, double *boxlo, + double *prd, void **tep_ptr); // -------------------------- DEVICE DATA ------------------------- @@ -271,7 +272,7 @@ class BaseAmoeba { int _extra_fields; double _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15, _max_nbors; double _gpu_overhead, _driver_overhead; - bool short_nbor_avail; + bool short_nbor_polar_avail; UCL_D_Vec *_nbor_data; numtyp _aewald,_felec; From 1166845fcf025292ac37646ed37e4b62d3bcc85b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 18 Sep 2021 10:22:22 -0500 Subject: [PATCH 040/181] Prepared data structure for the dispersion real-space term --- lib/gpu/lal_amoeba.cpp | 38 ++++++--- lib/gpu/lal_amoeba.cu | 166 ++++++++++++++++++++++++++++++------ lib/gpu/lal_amoeba.h | 14 +-- lib/gpu/lal_amoeba_ext.cpp | 24 +++--- src/GPU/pair_amoeba_gpu.cpp | 12 +-- 5 files changed, 197 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index d2f2b1bf79..28ed02b480 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -44,12 +44,14 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const { } template -int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double 
*host_thole, const double *host_dirdamp, +int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -80,11 +82,22 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda host_write[i].x = host_pdamp[i]; host_write[i].y = host_thole[i]; host_write[i].z = host_dirdamp[i]; - host_write[i].w = (numtyp)0; + host_write[i].w = host_amtype2class[i]; } - damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(damping,host_write,false); + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); @@ -100,9 +113,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=damping.row_bytes() - + sp_polar.row_bytes() - + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -112,7 +124,7 @@ void AmoebaT::clear() { return; _allocated=false; - damping.clear(); + coeff_amtype.clear(); sp_polar.clear(); this->clear_atomic(); @@ -151,7 +163,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, @@ -192,7 +204,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -232,7 +244,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, @@ -271,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - 
this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 41185f30e3..5a1151f610 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -147,7 +147,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ @@ -210,8 +210,7 @@ _texture( q_tex,int2); } \ } -// SHUFFLE_AVAIL == 1 -#else +#else // SHUFFLE_AVAIL == 1 #define local_allocate_store_ufld() @@ -280,7 +279,7 @@ _texture( q_tex,int2); #if (EVFLAG == 1) -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -376,7 +375,7 @@ _texture( q_tex,int2); // EVFLAG == 0 #else -#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) \ simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ @@ -394,6 +393,125 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + dispersion = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_polar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp felec, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp 
qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + } // nbor + + } // ii { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int max_amtype, const double *host_pdamp, - const double *host_thole, const double *host_dirdamp, - const double *host_special_mpole, + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, const double *host_dirdamp, + const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -60,8 +61,11 @@ class Amoeba : public BaseAmoeba { // --------------------------- TYPE DATA -------------------------- - /// pdamp = damping.x; thole = damping.y - UCL_D_Vec damping; + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 8493e9331d..804bf10f32 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -27,13 +27,14 @@ static Amoeba AMOEBAMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -63,11 +64,13 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, int init_ok=0; if (world_me==0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->world_barrier(); if (message) @@ -83,11 +86,12 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, fflush(screen); } if 
(gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp, - host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - nlocal, nall, max_nbors, maxspecial, maxspecial15, - cell_size, gpu_split, screen, polar_dscale, polar_uscale); + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); AMOEBAMF.device->gpu_barrier(); if (message) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index f932f05e25..25f4718163 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -50,13 +50,14 @@ enum{GORDON1,GORDON2}; // External functions from cuda library for atom decomposition -int amoeba_gpu_init(const int ntypes, const int max_amtype, +int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, + const double *host_dirdamp, const int* host_amtype2class, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -168,9 +169,10 @@ void PairAmoebaGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; - int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp, - special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, atom->nlocal, + int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale, tq_size); From 0228867d8e547feb35a28190285327df8081ccec Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 19 Sep 2021 23:40:43 -0500 Subject: [PATCH 041/181] Added the dispersion real space kernel and transfer special coeffs to the device --- lib/gpu/lal_amoeba.cpp | 16 +++- lib/gpu/lal_amoeba.cu | 156 +++++++++++++++++++++++++----------- lib/gpu/lal_amoeba.h | 8 ++ lib/gpu/lal_amoeba_ext.cpp | 14 +++- src/GPU/pair_amoeba_gpu.cpp | 9 ++- src/GPU/pair_amoeba_gpu.h | 3 + 6 files changed, 153 insertions(+), 53 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 28ed02b480..1d62e483d8 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -47,6 +47,9 @@ template int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -109,12 +112,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, } ucl_copy(sp_polar,dview,5,false); + 
sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_repel[i]; + dview[i].z=host_special_disp[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_polar.row_bytes() + this->_tep.row_bytes(); + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -125,7 +137,9 @@ void AmoebaT::clear() { _allocated=false; coeff_amtype.clear(); + coeff_amclass.clear(); sp_polar.clear(); + sp_nonpolar.clear(); this->clear_atomic(); } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 5a1151f610..8915ef0146 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -400,8 +400,9 @@ _texture( q_tex,int2); __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict coeff_amtype, + const __global numtyp4 *restrict coeff_amclass, + const __global numtyp4 *restrict sp_disp, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -428,20 +429,11 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) virial[l]=(acctyp)0; } - acctyp4 tq; - tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (iioff2) continue; - numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; + numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; + numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; + numtyp r6 = r2*r2*r2; + numtyp ralpha2 = r2 * aewald*aewald; + numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; + numtyp expterm = ucl_exp(-ralpha2); + numtyp expa = expterm * term; + + // find the damping factor for the dispersion interaction + + numtyp r = ucl_sqrt(r2); + numtyp r7 = r6 * r; + numtyp di = ai * r; + numtyp di2 = di * di; + numtyp di3 = di * di2; + numtyp dk = ak * r; + numtyp expi = ucl_exp(-di); + numtyp expk = ucl_exp(-dk); + + numtyp ai2,ak2; + numtyp di4,di5; + numtyp dk2,dk3; + numtyp ti,ti2; + numtyp tk,tk2; + numtyp damp3,damp5; + numtyp ddamp; + numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + + if (ai != ak) { + ai2 = ai * ai; + ak2 = ak * ak; + dk2 = dk * dk; + dk3 = dk * dk2; + ti = ak2 / (ak2-ai2); + ti2 = ti * ti; + tk = ai2 / (ai2-ak2); + tk2 = tk * tk; + damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk + - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi + - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; + damp5 = (numtyp)1.0 - 
ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk + - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi + - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); + + } else { + di4 = di2 * di2; + di5 = di2 * di3; + damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; + damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; + ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; + } + + numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; + + // apply damping and scaling factors for this interaction + + numtyp scale = factor_disp * damp*damp; + scale = scale - (numtyp )1.0; + numtyp e = -ci * ck * (expa+scale) / r6; + numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; + numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; + + energy+= e; + + // increment the damped dispersion derivative components + + numtyp dedx = de * xr; + numtyp dedy = de * yr; + numtyp dedz = de * zr; + f.x += dedx; + f.y += dedy; + f.z += dedz; + + // increment the internal virial tensor components + + numtyp vxx = xr * dedx; + numtyp vyx = yr * dedx; + numtyp vzx = zr * dedx; + numtyp vyy = yr * dedy; + numtyp vzy = zr * dedy; + numtyp vzz = zr * dedz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vyx; + virial[4] += vzx; + virial[5] += vzy; } // nbor } // ii { int init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -70,7 +72,13 @@ class Amoeba : public BaseAmoeba { /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 804bf10f32..86cf6f4c54 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -30,6 +30,9 @@ static Amoeba AMOEBAMF; int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -66,7 +69,9 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas if (world_me==0) init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_mpole, 
host_special_polar_wscale, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_csix, host_adisp, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -86,8 +91,11 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_mpole, host_special_polar_wscale, + init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_csix, host_adisp, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 25f4718163..35bba58a14 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -53,7 +53,8 @@ enum{GORDON1,GORDON2}; int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int* host_amtype2class, - const double *host_special_mpole, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -116,6 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; + gpu_hal_ready = false; + gpu_repulsion_ready = false; + gpu_dispersion_real_ready = false; gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -170,7 +174,8 @@ void PairAmoebaGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, special_mpole, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index d9a3fc5904..710f997e4c 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -47,6 +47,9 @@ class PairAmoebaGPU : public PairAmoeba { void *fieldp_pinned; bool tq_single; + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; bool gpu_umutual2b_ready; From 4e88cd158ee21fc4fcdfc85d66073ea5b220f6bc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 20 Sep 2021 11:38:50 -0500 Subject: [PATCH 042/181] Fixed bugs with _tep and _fieldp to allow mixed-precision builds, being defensive with acctyp for these variables --- lib/gpu/lal_amoeba.cu | 140 ++++++++++++++++++------------------- lib/gpu/lal_amoeba_ext.cpp | 2 +- lib/gpu/lal_base_amoeba.h | 2 +- 3 files changed, 72 insertions(+), 72 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 8915ef0146..3c5b949c72 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -102,7 +102,7 @@ 
_texture( q_tex,int2); dufld[5]=red_acc[5][tid]; \ } \ if (offset==0 && ii> SBBITS & 3; - int j = sj & NEIGHMASK; - tagint jtag = tag[j]; - - if (!which) { - int offset=ii; - for (int k=0; k> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; kgpu_rank(); int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); - tep_size=sizeof(PRECISION); + tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a45316b6f3..fea1728e8c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -235,7 +235,7 @@ class BaseAmoeba { double** uind, double** uinp); /// Per-atom arrays - UCL_Vector _tep, _fieldp; + UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; // ------------------------ FORCE/ENERGY DATA ----------------------- From 42034bd1c9809a50b04dad411744039e02c67842 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 20 Sep 2021 12:48:29 -0500 Subject: [PATCH 043/181] Fixed bugs for undefined tagint and ucl_powr ambiguity in kernels for OpenCL builds --- lib/gpu/lal_amoeba.cu | 13 ++++++++++++- lib/gpu/lal_base_amoeba.cpp | 3 --- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3c5b949c72..e44f302563 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -37,7 +37,18 @@ _texture( q_tex,int2); #else #define pos_tex x_ #define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int #endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + #if (SHUFFLE_AVAIL == 0) @@ -1042,7 +1053,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,3.0); + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); scale3 = (numtyp)1.0 - expdamp; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e777981912..a5552f6f3b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -27,9 +27,6 @@ BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0), short_nbor_polar_av nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; - #if defined(LAL_OCL_EV_JIT) - pair_program_noev=nullptr; - #endif } template From a2fd784034f1cf05ff0662b811f53c4f4dfc283f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 10:55:38 -0500 Subject: [PATCH 044/181] Added the dispersion real space term, which is for HIPPO. 
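
The host-side flow mirrors the other real-space terms: PairAmoeba::dispersion_real()
becomes virtual so PairAmoebaGPU can override it and forward to the new
amoeba_gpu_compute_dispersion_real() wrapper. No torque array comes back from this
term; forces, energy and virial are accumulated on the device and copied back by the
last kernel (polar_real). The sketch below only illustrates the intended call path;
the bookkeeping names (inum, nall, sublo/subhi, host_start, ilist, numneigh,
firstneigh, success) are assumed to be set up as in multipole_real() and are not the
final implementation.

    void PairAmoebaGPU::dispersion_real()
    {
      // aewald and off2 are assumed to already be selected for the
      // dispersion term (the other real-space terms do this via choose())
      int *ilist, *numneigh, **firstneigh;
      int host_start;
      bool success;
      int inum = atom->nlocal;
      int nall = atom->nlocal + atom->nghost;
      double sublo[3], subhi[3];
      for (int i = 0; i < 3; i++) {        // orthogonal box assumed here
        sublo[i] = domain->sublo[i];
        subhi[i] = domain->subhi[i];
      }

      firstneigh = amoeba_gpu_compute_dispersion_real(
        neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup,
        rpole, sublo, subhi, atom->tag, atom->nspecial, atom->special,
        atom->nspecial15, atom->special15, eflag, vflag, eflag_atom,
        vflag_atom, host_start, &ilist, &numneigh, cpu_time, success,
        aewald, off2, atom->q, domain->boxlo, domain->prd);
      if (!success)
        error->one(FLERR,"Insufficient memory on accelerator");
    }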
---
 lib/gpu/lal_amoeba.cpp      | 49 ++++++++++++++++++++---
 lib/gpu/lal_amoeba.cu       | 11 +++---
 lib/gpu/lal_amoeba.h        |  9 +++--
 lib/gpu/lal_amoeba_ext.cpp  | 17 ++++++++
 lib/gpu/lal_base_amoeba.cpp | 79 +++++++++++++++++++++++++++++++++++--
 lib/gpu/lal_base_amoeba.h   | 31 ++++++++++-----
 src/AMOEBA/pair_amoeba.h    |  2 +-
 src/GPU/pair_amoeba_gpu.cpp | 63 ++++++++++++++++++++++++++++-
 src/GPU/pair_amoeba_gpu.h   |  1 +
 9 files changed, 234 insertions(+), 28 deletions(-)

diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 1d62e483d8..a9e02ee7b4 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                             cell_size,gpu_split,_screen,amoeba,
-                            "k_amoeba_multipole", "k_amoeba_udirect2b",
-                            "k_amoeba_umutual2b", "k_amoeba_polar",
-                            "k_amoeba_short_nbor");
+                            "k_amoeba_dispersion", "k_amoeba_multipole",
+                            "k_amoeba_udirect2b", "k_amoeba_umutual2b",
+                            "k_amoeba_polar", "k_amoeba_short_nbor");
   if (success!=0)
     return success;
 
@@ -150,7 +150,48 @@ double AmoebaT::host_memory_usage() const {
 }
 
 // ---------------------------------------------------------------------------
-// Calculate the polar real-space term, returning tep
+// Calculate the dispersion real-space term (forces, energy and virial only)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::dispersion_real(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the short neighbor list for the cutoff off2_disp;
+  // at this point dispersion is the first kernel in a time step
+
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                         &this->_nbor_data->begin(),
+                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
+                         &nbor_pitch, &this->_threads_per_atom);
+  printf("launching dispersion\n");
+  this->k_dispersion.set_size(GX,BX);
+  this->k_dispersion.run(&this->atom->x, &this->atom->extra,
+                         &coeff_amtype, &coeff_amclass, &sp_nonpolar,
+                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                         &this->dev_short_nbor,
+                         &this->ans->force, &this->ans->engv,
+                         &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                         &this->_threads_per_atom, &this->_aewald,
+                         &this->_off2_disp);
+  this->time_pair.stop();
+
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the multipole real-space term, returning tep
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int AmoebaT::multipole_real(const int eflag, const int vflag) {
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index e44f302563..60205b16ff 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -413,7 +413,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
                                   const __global numtyp *restrict extra,
                                   const __global numtyp4 *restrict coeff_amtype,
                                   const __global numtyp4 *restrict coeff_amclass,
-                                  const __global numtyp4 *restrict sp_disp,
+                                  const __global numtyp4 *restrict sp_nonpolar,
                                   const __global int *dev_nbor,
                                   const __global int *dev_packed,
                                   const __global int
*dev_short_nbor, @@ -422,8 +422,7 @@ __kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, const int t_per_atom, const numtyp aewald, - const numtyp felec, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const numtyp off2) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -876,9 +875,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // accumulate tq store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial + // accumate force, energy and virial: use _acc if not the first kernel store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv); + //store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, + // offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } /* ---------------------------------------------------------------------- @@ -1785,7 +1786,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumate force, energy and virial //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, -// offset,eflag,vflag,ans,engv); + // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 39d65375cb..df556a1018 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -38,9 +38,11 @@ class Amoeba : public BaseAmoeba { * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ int init(const int ntypes, const int max_amtype, const int max_amclass, - const double *host_pdamp, const double *host_thole, const double *host_dirdamp, - const int *host_amtype2class, const double *host_special_mpole, - const double *host_special_hal, const double *host_special_repel, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, const double *host_special_disp, const double *host_special_polar_wscale, const double *host_special_polar_piscale, @@ -91,6 +93,7 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; + int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 55c08adf82..309830e1ce 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -117,6 +117,23 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, 
special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd); +} + int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a5552f6f3b..f252131ea7 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -33,6 +33,7 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; + k_dispersion.clear(); k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); @@ -54,6 +55,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, + const char *k_name_dispersion, const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, @@ -90,8 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b, - k_name_umutual2b,k_name_polar,k_name_short_nbor); + compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole, + k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -427,7 +429,74 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute polar real-space +// Reneighbor on GPU if necessary, and then compute dispersion real-space +// --------------------------------------------------------------------------- +template +int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
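+  // uind and uinp are passed as nullptr below: the real-space dispersion
+  // kernel only needs positions, types and the amtype/amclass coefficients,
+  // not the induced dipoles.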
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + _off2_disp = off2_disp; + _aewald = aewald; + const int red_blocks=dispersion_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -816,6 +885,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname_dispersion, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, @@ -828,7 +898,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, pair_program=new UCL_Program(dev); std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - + + k_dispersion.set_function(*pair_program,kname_dispersion); k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fea1728e8c..fcff3186c7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,9 +54,9 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -142,6 +142,18 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring + int** compute_dispersion_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double off2_disp, double *charge, + double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring int** 
compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -257,8 +269,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15; - UCL_Kernel k_short_nbor; + UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -276,13 +288,14 @@ class BaseAmoeba { UCL_D_Vec *_nbor_data; numtyp _aewald,_felec; - numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar; + numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_multipole, const char *kname_udirect2b, - const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor); + const char *kname_dispersion, const char *kname_multipole, + const char *kname_udirect2b, const char *kname_umutual2b, + const char *kname_polar, const char *kname_short_nbor); + virtual int dispersion_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 72c142888e..8a2f09d443 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -348,7 +348,7 @@ class PairAmoeba : public Pair { int, double, double, double *); void dispersion(); - void dispersion_real(); + virtual void dispersion_real(); void dispersion_kspace(); void multipole(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 35bba58a14..4894ac6203 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -65,6 +65,17 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); +int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd); + int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, @@ -118,8 +129,8 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) tq_pinned = nullptr; gpu_hal_ready = false; - gpu_repulsion_ready = false; - gpu_dispersion_real_ready = false; + gpu_repulsion_ready = false; // true for HIPPO + gpu_dispersion_real_ready = false; // true for HIPPO gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -194,6 +205,54 @@ void PairAmoebaGPU::init_style() /* ---------------------------------------------------------------------- */ +void 
PairAmoebaGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); +} + +/* ---------------------------------------------------------------------- */ + void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 710f997e4c..de17703dc7 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,6 +35,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); + virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); From d77d5b7f0a1db4b4cc2eec14c1b2ecd9ba49936b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 15:40:06 -0500 Subject: [PATCH 045/181] Added classes for hippo/gpu, refactored BaseAmoeba and made room for the dispersion real-space term in hippo --- lib/gpu/lal_amoeba.cpp | 43 +- lib/gpu/lal_amoeba.h | 1 - lib/gpu/lal_amoeba_ext.cpp | 4 +- lib/gpu/lal_base_amoeba.cpp | 73 +- lib/gpu/lal_base_amoeba.h | 19 +- lib/gpu/lal_hippo.cpp | 430 ++++++++ lib/gpu/lal_hippo.cu | 1892 +++++++++++++++++++++++++++++++++++ lib/gpu/lal_hippo.h | 120 +++ lib/gpu/lal_hippo_ext.cpp | 210 ++++ src/GPU/pair_amoeba_gpu.cpp | 65 +- src/GPU/pair_amoeba_gpu.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 1175 ++++++++++++++++++++++ src/GPU/pair_hippo_gpu.h | 80 ++ 13 files changed, 3918 insertions(+), 196 deletions(-) create mode 100644 lib/gpu/lal_hippo.cpp create mode 100644 lib/gpu/lal_hippo.cu create mode 100644 lib/gpu/lal_hippo.h create mode 100644 lib/gpu/lal_hippo_ext.cpp create mode 100644 src/GPU/pair_hippo_gpu.cpp create mode 100644 src/GPU/pair_hippo_gpu.h diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index a9e02ee7b4..8d9af4706e 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -62,7 +62,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_dispersion", "k_amoeba_multipole", + "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", "k_amoeba_short_nbor"); if (success!=0) @@ -149,47 +149,6 @@ double AmoebaT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Amoeba); } -// 
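// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): each per-term override in
// PairAmoebaGPU follows the dispatch pattern shown above -- fall back to the
// CPU implementation in the base class unless the matching GPU kernel has
// been flagged as ready (gpu_dispersion_real_ready stays false for AMOEBA
// because only HIPPO has a real-space dispersion term).  A compilable sketch
// with stand-in class names, not the LAMMPS classes themselves:
// ---------------------------------------------------------------------------
#include <cstdio>

struct PairBaseSketch {                      // stand-in for the CPU pair style
  virtual ~PairBaseSketch() = default;
  virtual void dispersion_real() { std::puts("CPU (base class) dispersion"); }
};

struct PairGpuSketch : PairBaseSketch {      // stand-in for the GPU pair style
  bool gpu_dispersion_real_ready = false;    // flipped once the kernel is available
  void dispersion_real() override {
    if (!gpu_dispersion_real_ready) {        // term not ported: reuse the CPU path
      PairBaseSketch::dispersion_real();
      return;
    }
    // here the real code gathers sublo/subhi (via domain->bbox for triclinic
    // boxes), selects the DISP or DISP_LONG cutoff, launches the kernel and
    // errors out if the accelerator ran out of memory
    std::puts("GPU dispersion kernel");
  }
};

int main() {
  PairGpuSketch p;
  p.dispersion_real();                       // CPU fallback
  p.gpu_dispersion_real_ready = true;
  p.dispersion_real();                       // GPU path
  return 0;
}
// ---------------------------------------------------------------------------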
--------------------------------------------------------------------------- -// Calculate the dispersion real-space term, returning tep -// --------------------------------------------------------------------------- -template -int AmoebaT::dispersion_real(const int eflag, const int vflag) { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - this->time_pair.start(); - - // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &this->_off2_disp, &ainum, - &nbor_pitch, &this->_threads_per_atom); - printf("launching dispersion\n"); - this->k_dispersion.set_size(GX,BX); - this->k_dispersion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, - &this->_off2_disp); - this->time_pair.stop(); - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index df556a1018..04eb6e4aa9 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -93,7 +93,6 @@ class Amoeba : public BaseAmoeba { protected: bool _allocated; - int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 309830e1ce..565f16b627 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -116,7 +116,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas void amoeba_gpu_clear() { AMOEBAMF.clear(); } - +/* int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -133,7 +133,7 @@ int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd); } - +*/ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index f252131ea7..b8e927d6ce 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -33,7 +33,6 @@ template BaseAmoebaT::~BaseAmoeba() { delete ans; delete nbor; - k_dispersion.clear(); k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); @@ -55,7 +54,6 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, 
- const char *k_name_dispersion, const char *k_name_multipole, const char *k_name_udirect2b, const char *k_name_umutual2b, @@ -92,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole, + compile_kernels(*ucl_device,pair_program,k_name_multipole, k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); if (_threads_per_atom>1 && gpu_nbor==0) { @@ -428,73 +426,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute dispersion real-space -// --------------------------------------------------------------------------- -template -int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
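// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): this hunk moves the dispersion
// plumbing out of the shared BaseAmoeba class, since AMOEBA has no real-space
// dispersion term; the Hippo class registers "k_hippo_dispersion" itself after
// the common init_atomic() step.  A skeleton of that ownership split, using
// simplified stand-in names (KernelHandle instead of UCL_Kernel, *Sketch
// instead of the real class names):
// ---------------------------------------------------------------------------
#include <string>

struct KernelHandle { std::string name; };           // stand-in for UCL_Kernel

struct BaseAmoebaSketch {
  // kernels every AMOEBA-family pair style shares
  KernelHandle k_multipole{"k_*_multipole"}, k_polar{"k_*_polar"};
  virtual ~BaseAmoebaSketch() = default;
  virtual int multipole_real(int eflag, int vflag) = 0;
  virtual int polar_real(int eflag, int vflag) = 0;
  // note: no dispersion hook here any more
};

struct HippoSketch : BaseAmoebaSketch {
  KernelHandle k_dispersion{"k_hippo_dispersion"};   // HIPPO-only kernel
  int dispersion_real(int, int) { return 1; }        // would launch k_dispersion
  int multipole_real(int, int) override { return 1; }
  int polar_real(int, int) override { return 1; }
};
// ---------------------------------------------------------------------------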
- - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); - - _off2_disp = off2_disp; - _aewald = aewald; - const int red_blocks=dispersion_real(eflag,vflag); - - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); - - hd_balancer.stop_timer(); - - return firstneigh; // nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -885,7 +816,6 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname_dispersion, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, @@ -899,7 +829,6 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string oclstring = device->compile_string()+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); - k_dispersion.set_function(*pair_program,kname_dispersion); k_multipole.set_function(*pair_program,kname_multipole); k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fcff3186c7..40da00f176 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -54,7 +54,7 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_dispersion, const char *kname_multipole, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, const char *kname_short_nbor); @@ -142,18 +142,6 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_disp, double *charge, - double *boxlo, double *prd); - /// Compute multipole real-space with device neighboring int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -269,7 +257,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_multipole, k_udirect2b, 
k_umutual2b, k_polar; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -291,11 +279,10 @@ class BaseAmoeba { numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_dispersion, const char *kname_multipole, + const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, const char *kname_short_nbor); - virtual int dispersion_real(const int eflag, const int vflag) = 0; virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp new file mode 100644 index 0000000000..7fa358e35a --- /dev/null +++ b/lib/gpu/lal_hippo.cpp @@ -0,0 +1,430 @@ +/*************************************************************************** + hippo.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Class for acceleration of the hippo pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "hippo_cl.h" +#elif defined(USE_CUDART) +const char *hippo=0; +#else +#include "hippo_cubin.h" +#endif + +#include "lal_hippo.h" +#include +namespace LAMMPS_AL { +#define HippoT Hippo + +extern Device device; + +template +HippoT::Hippo() : BaseAmoeba(), + _allocated(false) { +} + +template +HippoT::~Hippo() { + clear(); + k_dispersion.clear(); +} + +template +int HippoT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, + cell_size,gpu_split,_screen,hippo, + "k_hippo_multipole", + "k_hippo_udirect2b", "k_hippo_umutual2b", + "k_hippo_polar", "k_hippo_short_nbor"); + if (success!=0) + return success; + + k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + + UCL_H_Vec host_write(max_amtype, 
*(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_pdamp[i]; + host_write[i].y = host_thole[i]; + host_write[i].z = host_dirdamp[i]; + host_write[i].w = host_amtype2class[i]; + } + + coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amtype,host_write,false); + + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < max_amclass; i++) { + host_write2[i].x = host_csix[i]; + host_write2[i].y = host_adisp[i]; + host_write2[i].z = (numtyp)0; + host_write2[i].w = (numtyp)0; + } + + coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_amclass,host_write2,false); + + UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); + sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_polar_wscale[i]; + dview[i].y=host_special_polar_piscale[i]; + dview[i].z=host_special_polar_pscale[i]; + dview[i].w=host_special_mpole[i]; + } + ucl_copy(sp_polar,dview,5,false); + + sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<5; i++) { + dview[i].x=host_special_hal[i]; + dview[i].y=host_special_repel[i]; + dview[i].z=host_special_disp[i]; + dview[i].w=(numtyp)0; + } + ucl_copy(sp_nonpolar,dview,5,false); + + _polar_dscale = polar_dscale; + _polar_uscale = polar_uscale; + + _allocated=true; + this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + return 0; +} + +template +void HippoT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff_amtype.clear(); + coeff_amclass.clear(); + sp_polar.clear(); + sp_nonpolar.clear(); + + this->clear_atomic(); +} + +template +double HippoT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(Hippo); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute dispersion real-space +// --------------------------------------------------------------------------- + +template +int** HippoT::compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. 
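// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): the Hippo::init() code a few
// lines above interleaves the per-type parameters (pdamp, thole, dirdamp and
// the type-to-class map) into one 4-component element per atom type, and the
// per-class dispersion coefficients (csix, adisp) into another, so a kernel
// thread fetches a full parameter set with a single read.  A host-side sketch
// of that packing; float4_t stands in for the templated numtyp4 and
// pack_amtype is a hypothetical helper name.
// ---------------------------------------------------------------------------
#include <vector>

struct float4_t { float x, y, z, w; };   // stand-in for numtyp4

std::vector<float4_t> pack_amtype(int ntypes, const double *pdamp,
                                  const double *thole, const double *dirdamp,
                                  const int *amtype2class) {
  std::vector<float4_t> out(ntypes);
  for (int i = 0; i < ntypes; ++i) {
    out[i].x = static_cast<float>(pdamp[i]);
    out[i].y = static_cast<float>(thole[i]);
    out[i].z = static_cast<float>(dirdamp[i]);
    out[i].w = static_cast<float>(amtype2class[i]);  // stored as float, read back as int in the kernel
  }
  return out;  // staging buffer; the patch then ucl_copy()s it into coeff_amtype
}
// ---------------------------------------------------------------------------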
+ // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. + + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + this->_off2_disp = off2_disp; + this->_aewald = aewald; + const int red_blocks=dispersion_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + this->hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate the dispersion real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::dispersion_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_disp, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_dispersion.set_size(GX,BX); + k_dispersion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the multipole real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::multipole_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_mpole, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_mpole, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + this->k_multipole.set_size(GX,BX); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, 
&this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the real-space permanent field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int HippoT::udirect2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_udirect2b.set_size(GX,BX); + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->_fieldp, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_off2_polar, + &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the real-space induced field, returning field and fieldp +// --------------------------------------------------------------------------- +template +int HippoT::umutual2b(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->dev_short_nbor, + &this->_off2_polar, &ainum, &nbor_pitch, + &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_umutual2b.set_size(GX,BX); + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, + &nbor_pitch, &this->_threads_per_atom, &this->_aewald, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Calculate the polar real-space term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::polar_real(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int 
nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list if not done yet + if (!this->short_nbor_polar_avail) { + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_polar, &ainum, + &nbor_pitch, &this->_threads_per_atom); + this->short_nbor_polar_avail = true; + } + + this->k_polar.set_size(GX,BX); + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_polar, &_polar_dscale, &_polar_uscale); + this->time_pair.stop(); + + // Signal that short nbor list is not avail for the next time step + // do it here because polar_real() is the last kernel in a time step at this point + + this->short_nbor_polar_avail = false; + + return GX; +} + +template class Hippo; +} diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu new file mode 100644 index 0000000000..a21afe6cd8 --- /dev/null +++ b/lib/gpu/lal_hippo.cu @@ -0,0 +1,1892 @@ +// ************************************************************************** +// hippo.cu +// ------------------- +// Trung Dac Nguyen (Northwestern) +// +// Device code for acceleration of the hippo pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : trung.nguyen@northwestern.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include +#include "lal_aux_fun1.h" +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#include "inttypes.h" +#define tagint int64_t +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#ifdef LAMMPS_SMALLBIG +#define tagint int +#endif +#ifdef LAMMPS_BIGBIG +#define tagint long +#endif +#ifdef LAMMPS_SMALLSMALL +#define tagint int +#endif + +#endif // defined(NV_KERNEL) || defined(USE_HIP) + + +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_ufld() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + +#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ + tep) \ + if (t_per_atom>1) { \ + red_acc[0][tid]=tq.x; \ + red_acc[1][tid]=tq.y; \ + red_acc[2][tid]=tq.z; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + tq.x=red_acc[0][tid]; \ + tq.y=red_acc[1][tid]; \ + tq.z=red_acc[2][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=ufld[0]; \ + red_acc[1][tid]=ufld[1]; \ + red_acc[2][tid]=ufld[2]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<3; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } 
\ + ufld[0]=red_acc[0][tid]; \ + ufld[1]=red_acc[1][tid]; \ + ufld[2]=red_acc[2][tid]; \ + red_acc[0][tid]=dufld[0]; \ + red_acc[1][tid]=dufld[1]; \ + red_acc[2][tid]=dufld[2]; \ + red_acc[3][tid]=dufld[3]; \ + red_acc[4][tid]=dufld[4]; \ + red_acc[5][tid]=dufld[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + dufld[0]=red_acc[0][tid]; \ + dufld[1]=red_acc[1][tid]; \ + dufld[2]=red_acc[2][tid]; \ + dufld[3]=red_acc[3][tid]; \ + dufld[4]=red_acc[4][tid]; \ + dufld[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + red_acc[0][tid]=_fieldp[0]; \ + red_acc[1][tid]=_fieldp[1]; \ + red_acc[2][tid]=_fieldp[2]; \ + red_acc[3][tid]=_fieldp[3]; \ + red_acc[4][tid]=_fieldp[4]; \ + red_acc[5][tid]=_fieldp[5]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + _fieldp[0]=red_acc[0][tid]; \ + _fieldp[1]=red_acc[1][tid]; \ + _fieldp[2]=red_acc[2][tid]; \ + _fieldp[3]=red_acc[3][tid]; \ + _fieldp[4]=red_acc[4][tid]; \ + _fieldp[5]=red_acc[5][tid]; \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + tq.x += shfl_down(tq.x, s, t_per_atom); \ + tq.y += shfl_down(tq.y, s, t_per_atom); \ + tq.z += shfl_down(tq.z, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + ufld[0] += shfl_down(ufld[0], s, t_per_atom); \ + ufld[1] += shfl_down(ufld[1], s, t_per_atom); \ + ufld[2] += shfl_down(ufld[2], s, t_per_atom); \ + dufld[0] += shfl_down(dufld[0], s, t_per_atom); \ + dufld[1] += shfl_down(dufld[1], s, t_per_atom); \ + dufld[2] += shfl_down(dufld[2], s, t_per_atom); \ + dufld[3] += shfl_down(dufld[3], s, t_per_atom); \ + dufld[4] += shfl_down(dufld[4], s, t_per_atom); \ + dufld[5] += shfl_down(dufld[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom); \ + _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom); \ + _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom); \ + _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom); \ + _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom); \ + _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if 
(eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]+=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && iioff2) continue; + + int jtype = polar3[j].z; // amtype[j]; + int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; + numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; + numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; + + numtyp r6 = r2*r2*r2; + numtyp ralpha2 = r2 * aewald*aewald; + numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; + numtyp expterm = ucl_exp(-ralpha2); + numtyp expa = expterm * term; + + // find the damping factor for the dispersion interaction + + numtyp r = ucl_sqrt(r2); + numtyp r7 = r6 * r; + numtyp di = ai * r; + numtyp di2 = di * di; + numtyp di3 = di * di2; + numtyp dk = ak * r; + numtyp expi = ucl_exp(-di); + numtyp expk = ucl_exp(-dk); + + numtyp ai2,ak2; + numtyp di4,di5; + numtyp dk2,dk3; + numtyp ti,ti2; + numtyp tk,tk2; + numtyp damp3,damp5; + numtyp ddamp; + numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + + if (ai != ak) { + ai2 = ai * ai; + ak2 = ak * ak; + dk2 = dk * dk; + dk3 = dk * dk2; + ti = ak2 / (ak2-ai2); + ti2 = ti * ti; + tk = ai2 / (ai2-ak2); + tk2 = tk * tk; + damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk + - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi + - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; + damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi + - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk + - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi + - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); + + } else { + di4 = di2 * di2; + di5 = di2 * di3; + damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; + damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; + ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; + } + + numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; + + // apply damping and scaling factors for this interaction + + numtyp scale = factor_disp * damp*damp; + scale = scale - (numtyp )1.0; + numtyp e = -ci * ck * (expa+scale) / r6; + numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; + numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; + + energy+= e; + + // increment the damped dispersion derivative components + + numtyp dedx = de * xr; + numtyp dedy = de * yr; + numtyp dedz = de * zr; + f.x += dedx; + f.y += dedy; + f.z += dedz; + + // 
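// ---------------------------------------------------------------------------
// Reference sketch only (not part of the patch): the pair energy accumulated
// above combines the real-space Ewald factor expa with the damping factor
// built from the adisp parameters,
//   e = -csix_i * csix_k * (expa + factor_disp*damp^2 - 1) / r^6.
// The double-precision helper below (a hypothetical checking aid, not device
// code) restates the ai == ak branch of that expression; factor_disp, which
// the kernel hard-wires to 1.0, is kept as a parameter here.
// ---------------------------------------------------------------------------
#include <cmath>

double hippo_disp_pair_energy(double r2, double ci, double ck, double ai,
                              double aewald, double factor_disp) {
  const double r6 = r2 * r2 * r2;
  const double ralpha2 = r2 * aewald * aewald;
  const double expa = std::exp(-ralpha2) * (1.0 + ralpha2 + 0.5 * ralpha2 * ralpha2);

  const double r = std::sqrt(r2);
  const double di = ai * r, di2 = di * di, di3 = di * di2;
  const double di4 = di2 * di2, di5 = di2 * di3;
  const double expi = std::exp(-di);

  // damping when both sites share the same damping parameter (ai == ak)
  const double damp3 = 1.0 - (1.0 + di + 0.5 * di2 + 7.0 * di3 / 48.0 + di4 / 48.0) * expi;
  const double damp5 = 1.0 - (1.0 + di + 0.5 * di2 + di3 / 6.0 + di4 / 24.0 + di5 / 144.0) * expi;
  const double damp = 1.5 * damp5 - 0.5 * damp3;

  const double scale = factor_disp * damp * damp - 1.0;
  return -ci * ck * (expa + scale) / r6;
}
// ---------------------------------------------------------------------------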
increment the internal virial tensor components + + numtyp vxx = xr * dedx; + numtyp vyx = yr * dedx; + numtyp vzx = zr * dedx; + numtyp vyy = yr * dedy; + numtyp vzy = zr * dedy; + numtyp vzz = zr * dedz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vyx; + virial[4] += vzx; + virial[5] += vzy; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - 
diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m < 6; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 6; m++) bn[m] *= felec; + + term1 = ci*ck; + term2 = ck*dir - ci*dkr + dik; + term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; + term5 = qir*qkr; + numtyp scalek = (numtyp)1.0 - factor_mpole; + rr1 = bn[0] - scalek*rr1; + rr3 = bn[1] - scalek*rr3; + rr5 = bn[2] - scalek*rr5; + rr7 = bn[3] - scalek*rr7; + rr9 = bn[4] - scalek*rr9; + rr11 = bn[5] - scalek*rr11; + numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; + + // find standard multipole intermediates for force and torque + + numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11; + term1 = -ck*rr3 + dkr*rr5 - qkr*rr7; + term2 = ci*rr3 + dir*rr5 + qir*rr7; + term3 = (numtyp)2.0 * rr5; + term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); + term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9); + term6 = (numtyp)4.0 * rr7; + + energy += e; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + + // compute the torque components for this interaction + + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + + // increment force-based gradient and torque on first site + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii 
(numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + + numtyp factor_dscale, factor_pscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find the field components for Thole polarization damping + + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp scale7 = (numtyp)1.0; + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] + if (pgamma != (numtyp)0.0) { + damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp) ; + scale3 = (numtyp)1.0 - expdamp ; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp); + } + } else { + pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp); + } + } + } else { // damp == 0: ??? 
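// ---------------------------------------------------------------------------
// Reference sketch only (not part of the patch): the bn[] values built in the
// kernels above are the real-space Ewald screening functions B_m(r), obtained
// from B_0(r) = erfc(alpha*r)/r by the recursion
//   B_m(r) = ( (2m-1)*B_{m-1}(r)
//              + (2*alpha^2)^m / (alpha*sqrt(pi)) * exp(-alpha^2*r^2) ) / r^2.
// The device code approximates erfc() with the A1..A5 polynomial; std::erfc
// is used below instead, so this hypothetical helper is a host-side checking
// aid only.
// ---------------------------------------------------------------------------
#include <cmath>
#include <cstddef>

void ewald_bn(double r, double alpha, double *bn, std::size_t nmax) {
  const double pi = 3.14159265358979323846;
  const double r2inv = 1.0 / (r * r);
  const double exp2a = std::exp(-alpha * alpha * r * r);
  bn[0] = std::erfc(alpha * r) / r;
  const double aesq2 = 2.0 * alpha * alpha;
  double aefac = (alpha > 0.0) ? 1.0 / (std::sqrt(pi) * alpha) : 0.0;
  for (std::size_t m = 1; m < nmax; ++m) {
    aefac *= aesq2;   // now (2*alpha^2)^m / (alpha*sqrt(pi))
    bn[m] = ((2.0 * m - 1.0) * bn[m - 1] + aefac * exp2a) * r2inv;
  }
}
// ---------------------------------------------------------------------------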
+ } + + numtyp scalek = factor_dscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + scalek = factor_pscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; + fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; + fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // ii (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald); + + for ( ; nboroff2) continue; + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_uscale; + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = (numtyp)1.0; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + + numtyp aefac = aesq2n; + for (int m = 1; m <= 3; m++) { + numtyp bfac = (numtyp) (m+m-1); + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv; + } + + // find terms needed later to compute mutual polarization + // if (poltyp != DIRECT) + numtyp scale3 = (numtyp)1.0; + numtyp scale5 = (numtyp)1.0; + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + scale3 = (numtyp)1.0 - expdamp; + scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); + } + + } else { // damp == 0: ??? + } + + numtyp scalek = factor_uscale; + bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; + + numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
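// ---------------------------------------------------------------------------
// Illustration only (not part of the patch): the dipole-dipole coupling built
// just below is a symmetric 3x3 tensor stored as its six unique components in
// the order {xx, xy, xz, yy, yz, zz}; the mutual field fid/fip is obtained by
// contracting it with the neighbor's induced dipoles (uind/uinp).  A
// standalone restatement of that contraction (apply_symmetric is a
// hypothetical helper name):
// ---------------------------------------------------------------------------
#include <array>

using Vec3 = std::array<double, 3>;

// t = {xx, xy, xz, yy, yz, zz}
Vec3 apply_symmetric(const std::array<double, 6> &t, const Vec3 &u) {
  Vec3 f;
  f[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
  f[1] = t[1] * u[0] + t[3] * u[1] + t[4] * u[2];
  f[2] = t[2] * u[0] + t[4] * u[1] + t[5] * u[2];
  return f;
}
// ---------------------------------------------------------------------------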
needs work to store tdipdip + tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[1] = bcn[1]*xr*yr; + tdipdip[2] = bcn[1]*xr*zr; + tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[4] = bcn[1]*yr*zr; + tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; + //if (i==0 && j == 10) + // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", + // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; + fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; + fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; + fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; + fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; + + _fieldp[0] += fid[0]; + _fieldp[1] += fid[1]; + _fieldp[2] += fid[2]; + _fieldp[3] += fip[0]; + _fieldp[4] += fip[1]; + _fieldp[5] += fip[2]; + } // nbor + + } // iioff2) continue; + + numtyp r = ucl_sqrt(r2); + + numtyp ck = polar1[j].x; // rpole[j][0]; + numtyp dkx = polar1[j].y; // rpole[j][1]; + numtyp dky = polar1[j].z; // rpole[j][2]; + numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp qkxx = polar2[j].x; // rpole[j][4]; + numtyp qkxy = polar2[j].y; // rpole[j][5]; + numtyp qkxz = polar2[j].z; // rpole[j][6]; + numtyp qkyy = polar2[j].w; // rpole[j][8]; + numtyp qkyz = polar3[j].x; // rpole[j][9]; + numtyp qkzz = polar3[j].y; // rpole[j][12]; + int jtype = polar3[j].z; // amtype[j]; + int jgroup = polar3[j].w; // amgroup[j]; + numtyp ukx = polar4[j].x; // uind[j][0]; + numtyp uky = polar4[j].y; // uind[j][1]; + numtyp ukz = polar4[j].z; // uind[j][2]; + numtyp ukxp = polar5[j].x; // uinp[j][0]; + numtyp ukyp = polar5[j].y; // uinp[j][1]; + numtyp ukzp = polar5[j].z; // uinp[j][2]; + + numtyp factor_dscale, factor_pscale, factor_uscale; + const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = (numtyp)1.0; + } + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + numtyp uir = uix*xr + uiy*yr + uiz*zr; + numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; + numtyp ukr = ukx*xr + uky*yr + ukz*zr; + numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; + + // get reciprocal distance terms for this interaction + + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = felec * rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + + // calculate the real space Ewald error function terms + + numtyp ralpha = aewald * r; + numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); + numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; + //bn[0] = erfc(ralpha) / r; + bn[0] = _erfc * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; + numtyp alsq2n = (numtyp)0.0; + if (aewald > (numtyp)0.0) alsq2n = 
(numtyp)1.0 / (MY_PIS*aewald); + + for (m = 1; m <= 4; m++) { + bfac = (numtyp) (m+m-1); + alsq2n = alsq2 * alsq2n; + bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; + } + for (m = 0; m < 5; m++) bn[m] *= felec; + + // apply Thole polarization damping to scale factors + + numtyp sc3 = (numtyp)1.0; + numtyp sc5 = (numtyp)1.0; + numtyp sc7 = (numtyp)1.0; + for (k = 0; k < 3; k++) { + rc3[k] = (numtyp)0.0; + rc5[k] = (numtyp)0.0; + rc7[k] = (numtyp)0.0; + } + + // apply Thole polarization damping to scale factors + + numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] + if (damp != (numtyp)0.0) { + numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] + damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + if (damp < (numtyp)50.0) { + numtyp expdamp = ucl_exp(-damp); + sc3 = (numtyp)1.0 - expdamp; + sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp; + sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp; + numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv; + numtyp temp5 = damp; + numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp; + rc3[0] = xr * temp3; + rc3[1] = yr * temp3; + rc3[2] = zr * temp3; + rc5[0] = rc3[0] * temp5; + rc5[1] = rc3[1] * temp5; + rc5[2] = rc3[2] * temp5; + rc7[0] = rc5[0] * temp7; + rc7[1] = rc5[1] * temp7; + rc7[2] = rc5[2] * temp7; + } + + psc3 = (numtyp)1.0 - sc3*factor_pscale; + psc5 = (numtyp)1.0 - sc5*factor_pscale; + psc7 = (numtyp)1.0 - sc7*factor_pscale; + dsc3 = (numtyp)1.0 - sc3*factor_dscale; + dsc5 = (numtyp)1.0 - sc5*factor_dscale; + dsc7 = (numtyp)1.0 - sc7*factor_dscale; + usc3 = (numtyp)1.0 - sc3*factor_uscale; + usc5 = (numtyp)1.0 - sc5*factor_uscale; + psr3 = bn[1] - psc3*rr3; + psr5 = bn[2] - psc5*rr5; + psr7 = bn[3] - psc7*rr7; + dsr3 = bn[1] - dsc3*rr3; + dsr5 = bn[2] - dsc5*rr5; + dsr7 = bn[3] - dsc7*rr7; + usr5 = bn[2] - usc5*rr5; + for (k = 0; k < 3; k++) { + prc3[k] = rc3[k] * factor_pscale; + prc5[k] = rc5[k] * factor_pscale; + prc7[k] = rc7[k] * factor_pscale; + drc3[k] = rc3[k] * factor_dscale; + drc5[k] = rc5[k] * factor_dscale; + drc7[k] = rc7[k] * factor_dscale; + urc3[k] = rc3[k] * factor_uscale; + urc5[k] = rc5[k] * factor_uscale; + } + } else { // damp == 0: ??? 
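+        // damp == 0 here means pdi*pdamp[jtype] == 0.  In the CPU
+        // PairAmoeba::polar_real() the psc/dsc/usc and psr/dsr/usr factors are
+        // computed unconditionally (with sc3 = sc5 = sc7 = 1 and rc3, rc5, rc7
+        // already zeroed above), so the same setup presumably still has to run
+        // in this branch.  A minimal sketch under that assumption:
+        //   psc3 = psc5 = psc7 = (numtyp)1.0 - factor_pscale;
+        //   dsc3 = dsc5 = dsc7 = (numtyp)1.0 - factor_dscale;
+        //   usc3 = usc5 = (numtyp)1.0 - factor_uscale;
+        //   psr3 = bn[1] - psc3*rr3; psr5 = bn[2] - psc5*rr5; psr7 = bn[3] - psc7*rr7;
+        //   dsr3 = bn[1] - dsc3*rr3; dsr5 = bn[2] - dsc5*rr5; dsr7 = bn[3] - dsc7*rr7;
+        //   usr5 = bn[2] - usc5*rr5;
+        //   for (k = 0; k < 3; k++)
+        //     prc3[k] = prc5[k] = prc7[k] = drc3[k] = drc5[k] =
+        //     drc7[k] = urc3[k] = urc5[k] = (numtyp)0.0;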
+ } + + // get the induced dipole field used for dipole torques + + numtyp tix3 = psr3*ukx + dsr3*ukxp; + numtyp tiy3 = psr3*uky + dsr3*ukyp; + numtyp tiz3 = psr3*ukz + dsr3*ukzp; + numtyp tuir = -psr5*ukr - dsr5*ukrp; + + ufld[0] += tix3 + xr*tuir; + ufld[1] += tiy3 + yr*tuir; + ufld[2] += tiz3 + zr*tuir; + + // get induced dipole field gradient used for quadrupole torques + + numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp); + numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); + numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); + tuir = -psr7*ukr - dsr7*ukrp; + + dufld[0] += xr*tix5 + xr*xr*tuir; + dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; + dufld[2] += yr*tiy5 + yr*yr*tuir; + dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; + dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; + dufld[5] += zr*tiz5 + zr*zr*tuir; + + // get the dEd/dR terms used for direct polarization force + + term1 = bn[2] - dsc3*rr5; + term2 = bn[3] - dsc5*rr7; + term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr - dsr5*xr; + term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0]; + term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr; + numtyp tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr - dsr5*yr; + term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1]; + term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr; + numtyp tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2]; + term4 = rr3*drc3[2] - term1*zr - dsr5*zr; + term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2]; + term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2]; + term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr; + numtyp tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*xr*yr - rr3*yr*drc3[0]; + term4 = rr3*drc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0]; + term7 = rr5*drc5[0] - term2*xr; + numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6; + + term3 = term1*xr*zr - rr3*zr*drc3[0]; + term5 = term2*xr*zr - rr5*zr*drc5[0]; + term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0]; + numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + 
(numtyp)2.0*qkz*term7 + qkr*term6; + + term3 = term1*yr*zr - rr3*zr*drc3[1]; + term4 = rr3*drc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*drc5[1]; + term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1]; + term7 = rr5*drc5[1] - term2*yr; + numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp; + numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp; + numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp; + + numtyp frcx = depx; + numtyp frcy = depy; + numtyp frcz = depz; + + // get the dEp/dR terms used for direct polarization force + + // tixx and tkxx + term1 = bn[2] - psc3*rr5; + term2 = bn[3] - psc5*rr7; + term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr - psr5*xr; + term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0]; + term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr; + tixx = ci*term3 + dix*term4 + dir*term5 + + (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6; + tkxx = ck*term3 - dkx*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6; + + // tiyy and tkyy + term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr - psr5*yr; + term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1]; + term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr; + tiyy = ci*term3 + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6; + tkyy = ck*term3 - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6; + + // tizz and tkzz + term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2]; + term4 = rr3*prc3[2] - term1*zr - psr5*zr; + term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2]; + term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2]; + term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr; + tizz = ci*term3 + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6; + tkzz = ck*term3 - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tixy and tkxy + term3 = term1*xr*yr - rr3*yr*prc3[0]; + term4 = rr3*prc3[0] - term1*xr; + term5 = term2*xr*yr - rr5*yr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0]; + term7 = rr5*prc5[0] - term2*xr; + tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 + + (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6; + tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6; + + // tixz and tkxz + term3 = term1*xr*zr - rr3*zr*prc3[0]; + term5 = term2*xr*zr - rr5*zr*prc5[0]; + term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0]; + tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6; + tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 + + 
(numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6; + + // tiyz and tkyz + term3 = term1*yr*zr - rr3*zr*prc3[1]; + term4 = rr3*prc3[1] - term1*yr; + term5 = term2*yr*zr - rr5*zr*prc5[1]; + term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1]; + term7 = rr5*prc5[1] - term2*yr; + tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 + + (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6; + tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 + + (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6; + + depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz; + depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz; + depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + // get the dtau/dr terms used for mutual polarization force + // poltyp == MUTUAL && amoeba + + term1 = bn[2] - usc3*rr5; + term2 = bn[3] - usc5*rr7; + term3 = usr5 + term1; + term4 = rr3 * factor_uscale; + term5 = -xr*term3 + rc3[0]*term4; + term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0]; + tixx = uix*term5 + uir*term6; + tkxx = ukx*term5 + ukr*term6; + + term5 = -yr*term3 + rc3[1]*term4; + term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1]; + tiyy = uiy*term5 + uir*term6; + tkyy = uky*term5 + ukr*term6; + + term5 = -zr*term3 + rc3[2]*term4; + term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2]; + tizz = uiz*term5 + uir*term6; + tkzz = ukz*term5 + ukr*term6; + + term4 = -usr5 * yr; + term5 = -xr*term1 + rr3*urc3[0]; + term6 = xr*yr*term2 - rr5*yr*urc5[0]; + tixy = uix*term4 + uiy*term5 + uir*term6; + tkxy = ukx*term4 + uky*term5 + ukr*term6; + + term4 = -usr5 * zr; + term6 = xr*zr*term2 - rr5*zr*urc5[0]; + tixz = uix*term4 + uiz*term5 + uir*term6; + tkxz = ukx*term4 + ukz*term5 + ukr*term6; + + term5 = -yr*term1 + rr3*urc3[1]; + term6 = yr*zr*term2 - rr5*zr*urc5[1]; + tiyz = uiy*term4 + uiz*term5 + uir*term6; + tkyz = uky*term4 + ukz*term5 + ukr*term6; + + depx = tixx*ukxp + tixy*ukyp + tixz*ukzp + + tkxx*uixp + tkxy*uiyp + tkxz*uizp; + depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp + + tkxy*uixp + tkyy*uiyp + tkyz*uizp; + depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp + + tkxz*uixp + tkyz*uiyp + tkzz*uizp; + + frcx = frcx + depx; + frcy = frcy + depy; + frcz = frcz + depz; + + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; + + if (EVFLAG && vflag) { + numtyp vxx = xr * frcx; + numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz); + numtyp vyy = yr * frcy; + numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); + numtyp vzz = zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii> SBBITS & 3; + int j = sj & NEIGHMASK; + tagint jtag = tag[j]; + + if (!which) { + int offset=ii; + for (int k=0; k +class Hippo : public BaseAmoeba { + public: + Hippo(); + ~Hippo(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const 
double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_mpole, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, const double cell_size, + const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale); + + /// Compute dispersion real-space with device neighboring + int** compute_dispersion_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double off2_disp, double *charge, + double *boxlo, double *prd); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// pdamp = coeff_amtype.x; thole = coeff_amtype.y; + /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w + UCL_D_Vec coeff_amtype; + /// csix = coeff_amclass.x; adisp = coeff_amclass.y; + UCL_D_Vec coeff_amclass; + /// Special polar values [0-4]: + /// sp_polar.x = special_polar_wscale + /// sp_polar.y special_polar_pscale, + /// sp_polar.z = special_polar_piscale + /// sp_polar.w = special_mpole + UCL_D_Vec sp_polar; + /// Special nonpolar values [0-4]: + /// sp_nonpolar.x = special_hal + /// sp_nonpolar.y special_repel + /// sp_nonpolar.z = special_disp + UCL_D_Vec sp_nonpolar; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _polar_dscale, _polar_uscale; + numtyp _qqrd2e; + + UCL_Kernel k_dispersion; + + protected: + bool _allocated; + int dispersion_real(const int eflag, const int vflag); + int multipole_real(const int eflag, const int vflag); + int udirect2b(const int eflag, const int vflag); + int umutual2b(const int eflag, const int vflag); + int polar_real(const int eflag, const int vflag); + +}; + +} + +#endif diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp new file mode 100644 index 0000000000..b9e31e7b20 --- /dev/null +++ b/lib/gpu/lal_hippo_ext.cpp @@ -0,0 +1,210 @@ +/*************************************************************************** + hippo_ext.cpp + ------------------- + Trung Dac Nguyen (Northwestern) + + Functions for LAMMPS access to hippo acceleration routines. 
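+ These are the C-style entry points (hippo_gpu_init(), hippo_gpu_clear(), the
+ hippo_gpu_compute_*() drivers and hippo_gpu_bytes()) that the pair style in
+ src/GPU/pair_hippo_gpu.cpp calls into.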
+ + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : trung.nguyen@northwestern.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_hippo.h" + +using namespace std; +using namespace LAMMPS_AL; + +static Hippo HIPPOMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_hal, + const double *host_special_repel, + const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale, + int& tep_size) { + HIPPOMF.clear(); + gpu_mode=HIPPOMF.device->gpu_mode(); + double gpu_split=HIPPOMF.device->particle_split(); + int first_gpu=HIPPOMF.device->first_device(); + int last_gpu=HIPPOMF.device->last_device(); + int world_me=HIPPOMF.device->world_me(); + int gpu_rank=HIPPOMF.device->gpu_rank(); + int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); + + tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); + + HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); + + bool message=false; + if (HIPPOMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing GPU and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_hal, + host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_csix, host_adisp, nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); + + HIPPOMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + HIPPOMF.estimate_gpu_overhead(); + return init_ok; +} + +void hippo_gpu_clear() { + HIPPOMF.clear(); +} + +int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd) { + return HIPPOMF.compute_dispersion_real(ago, inum_full, 
nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd); +} + +int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { + return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +} + +int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr) { + return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +} + +int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, 
const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +} + +double hippo_gpu_bytes() { + return HIPPOMF.host_memory_usage(); +} diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4894ac6203..91bc679447 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -65,17 +65,6 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); -int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd); - int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, @@ -128,9 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; - gpu_hal_ready = false; - gpu_repulsion_ready = false; // true for HIPPO - gpu_dispersion_real_ready = false; // true for HIPPO + gpu_hal_ready = false; // true for AMOEBA when ready + gpu_repulsion_ready = false; // always false for AMOEBA + gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -205,54 +194,6 @@ void PairAmoebaGPU::init_style() /* ---------------------------------------------------------------------- */ -void PairAmoebaGPU::dispersion_real() -{ - if (!gpu_dispersion_real_ready) { - PairAmoeba::dispersion_real(); - return; - } - - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; - - double sublo[3],subhi[3]; - if (domain->triclinic == 0) { - sublo[0] = domain->sublo[0]; - sublo[1] = domain->sublo[1]; - sublo[2] = domain->sublo[2]; - subhi[0] = domain->subhi[0]; - subhi[1] = domain->subhi[1]; - subhi[2] = domain->subhi[2]; - } else { - domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); - } - inum = atom->nlocal; - - // select the correct cutoff for the term - - if (use_dewald) choose(DISP_LONG); - else choose(DISP); - - firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); -} - -/* 
---------------------------------------------------------------------- */ - void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index de17703dc7..e0210faa68 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -35,7 +35,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void induce(); - virtual void dispersion_real(); + //virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp new file mode 100644 index 0000000000..ce0051962b --- /dev/null +++ b/src/GPU/pair_hippo_gpu.cpp @@ -0,0 +1,1175 @@ +// clang-format off +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (Northwestern) +------------------------------------------------------------------------- */ + +#include "pair_hippo_gpu.h" + +#include "amoeba_convolution.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_store.h" +#include "force.h" +#include "gpu_extra.h" +#include "math_const.h" +#include "memory.h" +#include "my_page.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "neighbor.h" +#include "suffix.h" +#include + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm +enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm +enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; +enum{MUTUAL,OPT,TCG,DIRECT}; +enum{GEAR,ASPC,LSQR}; +enum{BUILD,APPLY}; +enum{GORDON1,GORDON2}; + +#define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye + +// External functions from cuda library for atom decomposition + +int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int* host_amtype2class, + const double *host_special_hal, const double *host_special_repel, + const double *host_special_disp, const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_csix, const double *host_adisp, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, int &gpu_mode, FILE *screen, + const double polar_dscale, const double polar_uscale, int& tq_size); +void hippo_gpu_clear(); + +int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint 
**special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd); + +int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); + +int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, double *host_q, + double *boxlo, double *prd, void **fieldp_ptr); + +int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, double **host_uinp, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int* nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double felec, const double off2, + double *host_q, double *boxlo, double *prd, void **tq_ptr); + +double hippo_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + fieldp_pinned = nullptr; + tq_pinned = nullptr; + + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = false; // true for HIPPO when ready + gpu_dispersion_real_ready = false; // true for HIPPO when ready + gpu_multipole_real_ready = true; + gpu_udirect2b_ready = true; + gpu_umutual2b_ready = true; + gpu_polar_real_ready = true; + + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays 
+------------------------------------------------------------------------- */ + +PairHippoGPU::~PairHippoGPU() +{ + hippo_gpu_clear(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairHippoGPU::init_style() +{ + PairAmoeba::init_style(); + + // Repeat cutsq calculation because done after call to init_style + + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + int maxspecial15=0; + if (atom->molecular != Atom::ATOMIC) { + maxspecial=atom->maxspecial; + maxspecial15=atom->maxspecial15; + } + + int tq_size; + int mnf = 5e-2 * neighbor->oneatom; + int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, + pdamp, thole, dirdamp, amtype2class, special_hal, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, csix, adisp, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + maxspecial15, cell_size, gpu_mode, screen, + polar_dscale, polar_uscale, tq_size); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + + if (tq_size == sizeof(double)) + tq_single = false; + else + tq_single = true; +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::dispersion_real() +{ + if (!gpu_dispersion_real_ready) { + PairAmoeba::dispersion_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_dewald) choose(DISP_LONG); + else choose(DISP); + + firstneigh = hippo_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::multipole_real() +{ + if (!gpu_multipole_real_ready) { + PairAmoeba::multipole_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + 
subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + // set the energy unit conversion factor for multipolar real-space calculation + + double felec = electric / am_dielectric; + + firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); + } else { + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, fmpole, virmpole); + } +} + +/* ---------------------------------------------------------------------- + induce = induced dipole moments via pre-conditioned CG solver + adapted from Tinker induce0a() routine +------------------------------------------------------------------------- */ + +void PairHippoGPU::induce() +{ + bool done; + int i,j,m,ii,itype; + int iter,maxiter; + double polmin; + double eps,epsold; + double epsd,epsp; + double udsum,upsum; + double a,ap,b,bp; + double sum,sump,term; + double reduce[4],allreduce[4]; + + double *poli; + double **conj,**conjp; + double **vec,**vecp; + double **udir,**usum,**usump; + + int debug = 1; + + // set cutoffs, taper coeffs, and PME params + // create qfac here, free at end of polar() + + if (use_ewald) { + choose(POLAR_LONG); + int nmine = p_kspace->nfft_owned; + memory->create(qfac,nmine,"ameoba/induce:qfac"); + } else choose(POLAR); + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + // zero out the induced dipoles at each site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + } + } + + // allocation of arrays + // NOTE: not all are used by all methods + // NOTE: could be re-allocated dynamically + + memory->create(poli,nlocal,"ameoba/induce:poli"); + memory->create(conj,nlocal,3,"ameoba/induce:conj"); + memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); + memory->create(vec,nlocal,3,"ameoba/induce:vec"); + memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); + memory->create(udir,nlocal,3,"ameoba/induce:udir"); + memory->create(usum,nlocal,3,"ameoba/induce:usum"); + memory->create(usump,nlocal,3,"ameoba/induce:usump"); + + // get the electrostatic field due to permanent multipoles + + dfield0c(field,fieldp); + + // need reverse_comm_pair if dfield0c (i.e. 
udirect2b) is CPU-only + + if (!gpu_udirect2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + // set induced dipoles to polarizability times direct field + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + udir[i][j] = polarity[itype] * field[i][j]; + udirp[i][j] = polarity[itype] * fieldp[i][j]; + if (pcgguess) { + uind[i][j] = udir[i][j]; + uinp[i][j] = udirp[i][j]; + } + } + } +/* + printf("GPU: cutghost = %f\n", comm->cutghost[0]); + for (i = 0; i < 10; i++) { + printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", + i, udir[i][0], udir[i][1], udir[i][2], + udirp[i][0], udirp[i][1], udirp[i][2]); + } +*/ + // get induced dipoles via the OPT extrapolation method + // NOTE: any way to rewrite these loops to avoid allocating + // uopt,uoptp with a optorder+1 dimension, just optorder ?? + // since no need to store optorder+1 values after these loops + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uopt[i][0][j] = udir[i][j]; + uoptp[i][0][j] = udirp[i][j]; + } + } + + for (m = 1; m <= optorder; m++) { + optlevel = m - 1; // used in umutual1() for fopt,foptp + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + for (j = 0; j < 3; j++) { + uopt[i][m][j] = polarity[itype] * field[i][j]; + uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; + uind[i][j] = uopt[i][m][j]; + uinp[i][j] = uoptp[i][m][j]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = 0.0; + uinp[i][j] = 0.0; + usum[i][j] = 0.0; + usump[i][j] = 0.0; + for (m = 0; m <= optorder; m++) { + usum[i][j] += uopt[i][m][j]; + usump[i][j] += uoptp[i][m][j]; + uind[i][j] += copt[m]*usum[i][j]; + uinp[i][j] += copt[m]*usump[i][j]; + } + } + } + } + + // set tolerances for computation of mutual induced dipoles + + if (poltyp == MUTUAL) { + done = false; + maxiter = 100; + iter = 0; + polmin = 0.00000001; + eps = 100.0; + + // estimate induced dipoles using a polynomial predictor + + if (use_pred && nualt == maxualt) { + ulspred(); + + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + udsum = 0.0; + upsum = 0.0; + for (m = 0; m < nualt; m++) { + udsum += bpred[m]*udalt[i][m][j]; + upsum += bpredp[m]*upalt[i][m][j]; + } + uind[i][j] = udsum; + uinp[i][j] = upsum; + } + } + } + + // estimate induced dipoles via inertial extended Lagrangian + // not supported for now + // requires uaux,upaux to persist with each atom + // also requires a velocity vector(s) to persist + // also requires updating uaux,upaux in the Verlet integration + + /* + if (use_ielscf) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uaux[i][j]; + uinp[i][j] = upaux[i][j]; + } + } + } + */ + + // get the electrostatic field due to induced dipoles + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + //error->all(FLERR,"STOP GPU"); + + // set initial conjugate gradient residual and conjugate vector + + for (i = 0; i < nlocal; i++) { + itype = amtype[i]; + + poli[i] = MAX(polmin,polarity[itype]); + for (j = 0; j < 3; j++) { + if (pcgguess) { + rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; + rsdp[i][j] = 
(udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; + } else { + rsd[i][j] = udir[i][j] / poli[i]; + rsdp[i][j] = udirp[i][j] / poli[i]; + } + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j]; + conjp[i][j] = zrsdp[i][j]; + } + } + + // conjugate gradient iteration of the mutual induced dipoles + + while (!done) { + iter++; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + vec[i][j] = uind[i][j]; + vecp[i][j] = uinp[i][j]; + uind[i][j] = conj[i][j]; + uinp[i][j] = conjp[i][j]; + } + } + + cfstyle = INDUCE; + comm->forward_comm_pair(this); + + ufield0c(field,fieldp); + + if (!gpu_umutual2b_ready) { + crstyle = FIELD; + comm->reverse_comm_pair(this); + } + + //error->all(FLERR,"STOP"); + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = vec[i][j]; + uinp[i][j] = vecp[i][j]; + vec[i][j] = conj[i][j]/poli[i] - field[i][j]; + vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; + } + } + + a = 0.0; + ap = 0.0; + sum = 0.0; + sump = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + a += conj[i][j]*vec[i][j]; + ap += conjp[i][j]*vecp[i][j]; + sum += rsd[i][j]*zrsd[i][j]; + sump += rsdp[i][j]*zrsdp[i][j]; + } + } + + reduce[0] = a; + reduce[1] = ap; + reduce[2] = sum; + reduce[3] = sump; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + a = allreduce[0]; + ap = allreduce[1]; + sum = allreduce[2]; + sump = allreduce[3]; + + if (a != 0.0) a = sum / a; + if (ap != 0.0) ap = sump / ap; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + uind[i][j] = uind[i][j] + a*conj[i][j]; + uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; + rsd[i][j] = rsd[i][j] - a*vec[i][j]; + rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; + zrsd[i][j] = rsd[i][j]; + zrsdp[i][j] = rsdp[i][j]; + } + } + + if (pcgprec) { + cfstyle = RSD; + comm->forward_comm_pair(this); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + crstyle = ZRSD; + comm->reverse_comm_pair(this); + } + + b = 0.0; + bp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + b += rsd[i][j]*zrsd[i][j]; + bp += rsdp[i][j]*zrsdp[i][j]; + } + } + + // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
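+      // NOTE (tentative answer): b and bp are the usual PCG "beta"
+      // coefficients used below in conj = zrsd + b*conj, so this reduction
+      // appears to be needed with or without pcgprec; without preconditioning
+      // zrsd == rsd, and the dot products reduce to |rsd|^2 and |rsdp|^2.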
+ + reduce[0] = b; + reduce[1] = bp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + b = allreduce[0]; + bp = allreduce[1]; + + if (sum != 0.0) b /= sum; + if (sump != 0.0) bp /= sump; + + epsd = 0.0; + epsp = 0.0; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + conj[i][j] = zrsd[i][j] + b*conj[i][j]; + conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; + epsd += rsd[i][j]*rsd[i][j]; + epsp += rsdp[i][j]*rsdp[i][j]; + } + } + + reduce[0] = epsd; + reduce[1] = epsp; + MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); + epsd = allreduce[0]; + epsp = allreduce[1]; + + // check the convergence of the mutual induced dipoles + + epsold = eps; + eps = MAX(epsd,epsp); + eps = DEBYE * sqrt(eps/atom->natoms); + + if (eps < poleps) done = true; + if (eps > epsold) done = true; + if (iter >= politer) done = true; + + // apply a "peek" iteration to the mutual induced dipoles + + if (done) { + for (i = 0; i < nlocal; i++) { + term = pcgpeek * poli[i]; + for (j = 0; j < 3; j++) { + uind[i][j] += term*rsd[i][j]; + uinp[i][j] += term*rsdp[i][j]; + } + } + } + + } + + // terminate the calculation if dipoles failed to converge + // NOTE: could make this an error + + if (iter >= maxiter || eps > epsold) + if (me == 0) + error->warning(FLERR,"hippo induced dipoles did not converge"); + } + + // DEBUG output to dump file + + if (uind_flag) + dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); + + // deallocation of arrays + + memory->destroy(poli); + memory->destroy(conj); + memory->destroy(conjp); + memory->destroy(vec); + memory->destroy(vecp); + memory->destroy(udir); + memory->destroy(usum); + memory->destroy(usump); + + // update the lists of previous induced dipole values + // shift previous m values up to m+1, add new values at m = 0 + // only when preconditioner is used + + if (use_pred) { + double ***udalt = fixudalt->tstore; + double ***upalt = fixupalt->tstore; + + nualt = MIN(nualt+1,maxualt); + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + for (m = nualt-1; m > 0; m--) { + udalt[i][m][j] = udalt[i][m-1][j]; + upalt[i][m][j] = upalt[i][m-1][j]; + } + udalt[i][0][j] = uind[i][j]; + upalt[i][0][j] = uinp[i][j]; + } + } + } +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b(double **field, double **fieldp) +{ + if (!gpu_udirect2b_ready) { + PairAmoeba::udirect2b(field, fieldp); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, 
atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // udirect2b_cpu(); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (udirect1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + +} + +/* ---------------------------------------------------------------------- + udirect2b = Ewald real direct field via list + udirect2b computes the real space contribution of the permanent + atomic multipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::udirect2b_cpu() +{ + int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + double xr,yr,zr,r,r2; + double rr1,rr2,rr3,rr5; + double bfac,exp2a; + double ralpha,aefac; + double aesq2,aesq2n; + double pdi,pti,ddi; + double pgamma; + double damp,expdamp; + double scale3,scale5; + double scale7,scalek; + double bn[4],bcn[3]; + double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + double **x = atom->x; + + // neigh list + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + // NOTE: doesn't this have a problem if aewald is tiny ?? 
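+  // NOTE (tentative answer): aesq2n = 1/(MY_PIS*aewald) does become large for
+  // small aewald, but it only enters bn[] through the product
+  // aesq2*aesq2n = 2*aewald/MY_PIS, which vanishes as aewald -> 0, so a tiny
+  // nonzero aewald should be harmless here; aewald == 0 is guarded just below.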
+ + aesq2 = 2.0 * aewald * aewald; + aesq2n = 0.0; + if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); + + // rebuild dipole-dipole pair list and store pairwise dipole matrices + // done one atom at a time in real-space double loop over atoms & neighs + + int *neighptr; + double *tdipdip; + + // compute the real space portion of the Ewald summation + + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + itype = amtype[i]; + igroup = amgroup[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + n = ndip = 0; + neighptr = ipage_dipole->vget(); + tdipdip = dpage_dipdip->vget(); + + pdi = pdamp[itype]; + pti = thole[itype]; + ddi = dirdamp[itype]; + + // evaluate all sites within the cutoff distance + + for (jj = 0; jj < jnum; jj++) { + jextra = jlist[jj]; + j = jextra & NEIGHMASK15; + + xr = x[j][0] - x[i][0]; + yr = x[j][1] - x[i][1]; + zr = x[j][2] - x[i][2]; + r2 = xr*xr + yr* yr + zr*zr; + if (r2 > off2) continue; + + jtype = amtype[j]; + jgroup = amgroup[j]; + + factor_wscale = special_polar_wscale[sbmask15(jextra)]; + if (igroup == jgroup) { + factor_pscale = special_polar_piscale[sbmask15(jextra)]; + factor_dscale = polar_dscale; + factor_uscale = polar_uscale; + } else { + factor_pscale = special_polar_pscale[sbmask15(jextra)]; + factor_dscale = factor_uscale = 1.0; + } + + r = sqrt(r2); + rr1 = 1.0 / r; + rr2 = rr1 * rr1; + rr3 = rr2 * rr1; + rr5 = 3.0 * rr2 * rr3; + + // calculate the real space Ewald error function terms + + ralpha = aewald * r; + bn[0] = erfc(ralpha) * rr1; + exp2a = exp(-ralpha*ralpha); + aefac = aesq2n; + for (m = 1; m <= 3; m++) { + bfac = m+m-1; + aefac = aesq2 * aefac; + bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; + } + + // find terms needed later to compute mutual polarization + + if (poltyp != DIRECT) { + scale3 = 1.0; + scale5 = 1.0; + damp = pdi * pdamp[jtype]; + if (damp != 0.0) { + pgamma = MIN(pti,thole[jtype]); + damp = pgamma * pow(r/damp,3.0); + if (damp < 50.0) { + expdamp = exp(-damp); + scale3 = 1.0 - expdamp; + scale5 = 1.0 - expdamp*(1.0+damp); + } + } + scalek = factor_uscale; + bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; + bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; + + neighptr[n++] = j; + tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; + tdipdip[ndip++] = bcn[1]*xr*yr; + tdipdip[ndip++] = bcn[1]*xr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; + tdipdip[ndip++] = bcn[1]*yr*zr; + tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; + } else { + if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } + + } // jj + + firstneigh_dipole[i] = neighptr; + firstneigh_dipdip[i] = tdipdip; + numneigh_dipole[i] = n; + ipage_dipole->vgot(n); + dpage_dipdip->vgot(ndip); + } +} + +/* ---------------------------------------------------------------------- + umutual2b = Ewald real mutual field via list + umutual2b computes the real space contribution of the induced + atomic dipole moments to the field via a neighbor list +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual2b(double **field, double **fieldp) +{ + if (!gpu_umutual2b_ready) { + PairAmoeba::umutual2b(field, fieldp); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + 
domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff (off2) for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + uind, uinp, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // accumulate the field and fieldp values from the GPU lib + // field and fieldp may already have some nonzero values from kspace (umutual1) + + int nlocal = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + +/* ---------------------------------------------------------------------- */ + +void PairHippoGPU::polar_real() +{ + if (!gpu_polar_real_ready) { + PairAmoeba::polar_real(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff and aewald for the term + + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); + + // set the energy unit conversion factor for polar real-space calculation + + double felec = 0.5 * electric / am_dielectric; + + firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, + rpole, uind, uinp, sublo, subhi, + atom->tag, atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tep_ptr = (float *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); + } else { + double *tep_ptr = (double *)tq_pinned; + compute_force_from_torque(tep_ptr, fpolar, virpolar); + } +} + +/* ---------------------------------------------------------------------- + compute atom forces from torques +------------------------------------------------------------------------- */ + +template +void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, + double** force_comp, + double* virial_comp) +{ + int i,ix,iy,iz; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double fix[3],fiy[3],fiz[3],_tq[4]; + + double** x = atom->x; + int nlocal = atom->nlocal; + + for (i = 
0; i < nlocal; i++) { + _tq[0] = tq_ptr[4*i]; + _tq[1] = tq_ptr[4*i+1]; + _tq[2] = tq_ptr[4*i+2]; + torque2force(i,_tq,fix,fiy,fiz,force_comp); + + iz = zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + + virial_comp[0] += vxx; + virial_comp[1] += vyy; + virial_comp[2] += vzz; + virial_comp[3] += vxy; + virial_comp[4] += vxz; + virial_comp[5] += vyz; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairHippoGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + hippo_gpu_bytes(); +} diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h new file mode 100644 index 0000000000..9e961045eb --- /dev/null +++ b/src/GPU/pair_hippo_gpu.h @@ -0,0 +1,80 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(hippo/gpu,PairHippoGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_HIPPO_GPU_H +#define LMP_PAIR_HIPPO_GPU_H + +#include "pair_amoeba.h" + +namespace LAMMPS_NS { + +class PairHippoGPU : public PairAmoeba { + public: + PairHippoGPU(LAMMPS *lmp); + ~PairHippoGPU(); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + virtual void induce(); + + virtual void dispersion_real(); + virtual void multipole_real(); + virtual void udirect2b(double **, double **); + virtual void umutual2b(double **, double **); + virtual void polar_real(); + + private: + int gpu_mode; + double cpu_time; + void *tq_pinned; + void *fieldp_pinned; + bool tq_single; + + bool gpu_hal_ready; + bool gpu_repulsion_ready; + bool gpu_dispersion_real_ready; + bool gpu_multipole_real_ready; + bool gpu_udirect2b_ready; + bool gpu_umutual2b_ready; + bool gpu_polar_real_ready; + + void udirect2b_cpu(); + + template + void compute_force_from_torque(const numtyp*, double**, double*); +}; + +} // namespace LAMMPS_NS +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style hippo/gpu requires atom attribute q + +The atom style defined does not have this attribute. 
+ +*/ From bebef1849596eae20ec825cd56375e572124b3d3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 21 Sep 2021 23:46:21 -0500 Subject: [PATCH 046/181] Cleaned up and minor changes --- lib/gpu/lal_amoeba.cu | 383 ++++++++++++------------------------------ 1 file changed, 111 insertions(+), 272 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 60205b16ff..e4d129214a 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -404,189 +404,6 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 -/* ---------------------------------------------------------------------- - dispersion = real-space portion of Ewald dispersion - adapted from Tinker edreal1d() routine -------------------------------------------------------------------------- */ - -__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff_amtype, - const __global numtyp4 *restrict coeff_amclass, - const __global numtyp4 *restrict sp_nonpolar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - const int t_per_atom, const numtyp aewald, - const numtyp off2) -{ - int tid, ii, offset, i; - atom_info(t_per_atom,ii,tid,offset); - - int n_stride; - local_allocate_store_charge(); - - acctyp4 f; - f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp energy, e_coul, virial[6]; - if (EVFLAG) { - energy=(acctyp)0; - e_coul=(acctyp)0; - for (int l=0; l<6; l++) virial[l]=(acctyp)0; - } - - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - - if (iioff2) continue; - - int jtype = polar3[j].z; // amtype[j]; - int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; - numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; - numtyp ak = coeff_amclass[jclass].y; // adisp[jclass]; - - numtyp r6 = r2*r2*r2; - numtyp ralpha2 = r2 * aewald*aewald; - numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2; - numtyp expterm = ucl_exp(-ralpha2); - numtyp expa = expterm * term; - - // find the damping factor for the dispersion interaction - - numtyp r = ucl_sqrt(r2); - numtyp r7 = r6 * r; - numtyp di = ai * r; - numtyp di2 = di * di; - numtyp di3 = di * di2; - numtyp dk = ak * r; - numtyp expi = ucl_exp(-di); - numtyp expk = ucl_exp(-dk); - - numtyp ai2,ak2; - numtyp di4,di5; - numtyp dk2,dk3; - numtyp ti,ti2; - numtyp tk,tk2; - numtyp damp3,damp5; - numtyp ddamp; - numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; - - if (ai != ak) { - ai2 = ai * ai; - ak2 = ak * ak; - dk2 = dk * dk; - dk3 = dk * dk2; - ti = ak2 / (ak2-ai2); - ti2 = ti * ti; - tk = ai2 / (ai2-ak2); - tk2 = tk * tk; - damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi - - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk - - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi - - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk; - damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi - - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk - - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi - - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; - ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + - (numtyp)0.25 * dk2 * tk2 * ak * expk * 
(r*ak+(numtyp)4.0*ti-(numtyp)1.0); - - } else { - di4 = di2 * di2; - di5 = di2 * di3; - damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi; - damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi; - ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0; - } - - numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; - - // apply damping and scaling factors for this interaction - - numtyp scale = factor_disp * damp*damp; - scale = scale - (numtyp )1.0; - numtyp e = -ci * ck * (expa+scale) / r6; - numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; - numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; - - energy+= e; - - // increment the damped dispersion derivative components - - numtyp dedx = de * xr; - numtyp dedy = de * yr; - numtyp dedz = de * zr; - f.x += dedx; - f.y += dedy; - f.z += dedz; - - // increment the internal virial tensor components - - numtyp vxx = xr * dedx; - numtyp vyx = yr * dedx; - numtyp vzx = zr * dedx; - numtyp vyy = yr * dedy; - numtyp vzy = zr * dedy; - numtyp vzz = zr * dedz; - - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vyx; - virial[4] += vzx; - virial[5] += vzy; - } // nbor - - } // iioff2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -910,7 +732,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; numtyp4* polar1 = (numtyp4*)(&extra[0]); numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); @@ -933,21 +754,23 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - int itype,igroup; numtyp bn[4],bcn[3]; numtyp fid[3],fip[3]; - - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; - itype = polar3[i].z; // amtype[i]; - igroup = polar3[i].w; // 
amgroup[i]; + + const numtyp4 pol1i = polar1[i]; + numtyp dix = pol1i.y; // rpole[i][1]; + numtyp diy = pol1i.z; // rpole[i][2]; + numtyp diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + numtyp qixx = pol2i.x; // rpole[i][4]; + numtyp qixy = pol2i.y; // rpole[i][5]; + numtyp qixz = pol2i.z; // rpole[i][6]; + numtyp qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + numtyp qiyz = pol3i.x; // rpole[i][9]; + numtyp qizz = pol3i.y; // rpole[i][12]; + int itype = pol3i.z; // amtype[i]; + int igroup = pol3i.w; // amgroup[i]; // debug: // xi__ = ix; xi__.w = itype; @@ -984,18 +807,21 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; numtyp factor_dscale, factor_pscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; @@ -1185,14 +1011,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp rr3 = rr1 * r2inv; numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; + const numtyp4 pol3j = polar3[j]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; + const numtyp4 pol4j = polar4[j]; + numtyp ukx = pol4j.x; // uind[j][0]; + numtyp uky = pol4j.y; // uind[j][1]; + numtyp ukz = pol4j.z; // uind[j][2]; + const numtyp4 pol5j = polar5[j]; + numtyp ukxp = pol5j.x; // uinp[j][0]; + numtyp ukyp = pol5j.y; // uinp[j][1]; + numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_uscale; if (igroup == jgroup) factor_uscale = polar_uscale; @@ -1355,24 +1184,29 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; - itype = polar3[i].z; // amtype[i]; - igroup = polar3[i].w; // amgroup[i]; - uix = 
polar4[i].x; // uind[i][0]; - uiy = polar4[i].y; // uind[i][1]; - uiz = polar4[i].z; // uind[i][2]; - uixp = polar5[i].x; // uinp[i][0]; - uiyp = polar5[i].y; // uinp[i][1]; - uizp = polar5[i].z; // uinp[i][2]; + const numtyp4 pol1i = polar1[i]; + ci = pol1i.x; // rpole[i][0]; + dix = pol1i.y; // rpole[i][1]; + diy = pol1i.z; // rpole[i][2]; + diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + qixx = pol2i.x; // rpole[i][4]; + qixy = pol2i.y; // rpole[i][5]; + qixz = pol2i.z; // rpole[i][6]; + qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + qiyz = pol3i.x; // rpole[i][9]; + qizz = pol3i.y; // rpole[i][12]; + itype = pol3i.z; // amtype[i]; + igroup = pol3i.w; // amgroup[i]; + const numtyp4 pol4i = polar4[i]; + uix = pol4i.x; // uind[i][0]; + uiy = pol4i.y; // uind[i][1]; + uiz = pol4i.z; // uind[i][2]; + const numtyp4 pol5i = polar5[i]; + uixp = pol5i.x; // uinp[i][0]; + uiyp = pol5i.y; // uinp[i][1]; + uizp = pol5i.z; // uinp[i][2]; // debug: // xi__ = ix; xi__.w = itype; @@ -1398,24 +1232,29 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); + const numtyp4 pol1j = polar1[j]; numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; numtyp dky = polar1[j].z; // rpole[j][2]; numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; - numtyp ukx = polar4[j].x; // uind[j][0]; - numtyp uky = polar4[j].y; // uind[j][1]; - numtyp ukz = polar4[j].z; // uind[j][2]; - numtyp ukxp = polar5[j].x; // uinp[j][0]; - numtyp ukyp = polar5[j].y; // uinp[j][1]; - numtyp ukzp = polar5[j].z; // uinp[j][2]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; + const numtyp4 pol4j = polar4[j]; + numtyp ukx = pol4j.x; // uind[j][0]; + numtyp uky = pol4j.y; // uind[j][1]; + numtyp ukz = pol4j.z; // uind[j][2]; + const numtyp4 pol5j = polar5[j]; + numtyp ukxp = pol5j.x; // uinp[j][0]; + numtyp ukyp = pol5j.y; // uinp[j][1]; + numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_dscale, factor_pscale, factor_uscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; From 2428f1f4d527cd053d6d587632219eb829fc844d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 22 Sep 2021 11:44:41 -0500 Subject: [PATCH 047/181] Updated hippo kernels --- lib/gpu/lal_hippo.cu | 208 ++++++++++++++++++++++++------------------- 1 file changed, 115 insertions(+), 93 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index a21afe6cd8..07df4c6ad0 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -55,7 +55,7 @@ _texture( q_tex,int2); #define local_allocate_store_ufld() \ __local acctyp red_acc[6][BLOCK_PAIR]; -#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ tep) \ if (t_per_atom>1) { \ red_acc[0][tid]=tq.x; \ @@ -225,7 +225,7 @@ 
_texture( q_tex,int2); #define local_allocate_store_ufld() -#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ +#define store_answers_hippo_tq(tq, ii, inum,tid, t_per_atom, offset, i, \ tep) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ @@ -636,7 +636,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp term1,term2,term3; numtyp term4,term5,term6; numtyp bn[6]; - numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; int numj, nbor, nbor_end; const __global int* nbor_mem=dev_packed; @@ -655,16 +654,19 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - ci = polar1[i].x; // rpole[i][0]; - dix = polar1[i].y; // rpole[i][1]; - diy = polar1[i].z; // rpole[i][2]; - diz = polar1[i].w; // rpole[i][3]; - qixx = polar2[i].x; // rpole[i][4]; - qixy = polar2[i].y; // rpole[i][5]; - qixz = polar2[i].z; // rpole[i][6]; - qiyy = polar2[i].w; // rpole[i][8]; - qiyz = polar3[i].x; // rpole[i][9]; - qizz = polar3[i].y; // rpole[i][12]; + const numtyp4 pol1i = polar1[i]; + numtyp ci = pol1i.x; // rpole[i][0]; + numtyp dix = pol1i.y; // rpole[i][1]; + numtyp diy = pol1i.z; // rpole[i][2]; + numtyp diz = pol1i.w; // rpole[i][3]; + const numtyp4 pol2i = polar2[i]; + numtyp qixx = pol2i.x; // rpole[i][4]; + numtyp qixy = pol2i.y; // rpole[i][5]; + numtyp qixz = pol2i.z; // rpole[i][6]; + numtyp qiyy = pol2i.w; // rpole[i][8]; + const numtyp4 pol3i = polar3[i]; + numtyp qiyz = pol3i.x; // rpole[i][9]; + numtyp qizz = pol3i.y; // rpole[i][12]; for ( ; nboroff2) continue; numtyp r = ucl_sqrt(r2); - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; - numtyp qkxx = polar2[j].x; // rpole[j][4]; - numtyp qkxy = polar2[j].y; // rpole[j][5]; - numtyp qkxz = polar2[j].z; // rpole[j][6]; - numtyp qkyy = polar2[j].w; // rpole[j][8]; - numtyp qkyz = polar3[j].x; // rpole[j][9]; - numtyp qkzz = polar3[j].y; // rpole[j][12]; - int jtype = polar3[j].z; // amtype[j]; - int jgroup = polar3[j].w; // amgroup[j]; + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -873,7 +878,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, } // ii Date: Thu, 23 Sep 2021 09:21:55 -0500 Subject: [PATCH 048/181] Started working on hippo/gpu --- src/GPU/pair_hippo_gpu.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index ce0051962b..be5d4afc2b 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -121,6 +121,9 @@ double hippo_gpu_bytes(); PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) { + amoeba = 0; + hippo = 1; + respa_enable = 0; reinitflag = 0; cpu_time = 0.0; @@ -131,9 +134,9 @@ 
PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = false; // true for HIPPO when ready - gpu_multipole_real_ready = true; - gpu_udirect2b_ready = true; - gpu_umutual2b_ready = true; + gpu_multipole_real_ready = false; + gpu_udirect2b_ready = false; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); From ad8164dfc0ed38f20a304a140b080a94c6a110f9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 24 Sep 2021 00:21:25 -0500 Subject: [PATCH 049/181] Fixed bugs in the dispersion real-space term for hippo. NOTE: CPU version filter out neighbors with zero special_disp --- lib/gpu/lal_hippo.cpp | 11 ++++++----- lib/gpu/lal_hippo.cu | 7 ++++--- src/GPU/pair_hippo_gpu.cpp | 4 ++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 7fa358e35a..07f8732bcb 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -210,10 +210,11 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, this->_aewald = aewald; const int red_blocks=dispersion_real(eflag,vflag); - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -238,7 +239,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list for the cutoff off2_mpole, + // Build the short neighbor list for the cutoff off2_disp, // at this point mpole is the first kernel in a time step this->k_short_nbor.set_size(GX,BX); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 07df4c6ad0..f9020cf9a6 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -512,7 +512,8 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp tk,tk2; numtyp damp3,damp5; numtyp ddamp; - numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)]; + const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; + numtyp factor_disp = sp_nonpol.z; // factor_disp = special_disp[sbmask15(j)]; if (ai != ak) { ai2 = ai * ai; @@ -547,7 +548,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, // apply damping and scaling factors for this interaction numtyp scale = factor_disp * damp*damp; - scale = scale - (numtyp )1.0; + scale = scale - (numtyp)1.0; numtyp e = -ci * ck * (expa+scale) / r6; numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r; numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7; @@ -562,7 +563,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, f.x += dedx; f.y += dedy; f.z += dedz; - + // increment the internal virial tensor components numtyp vxx = xr * dedx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index be5d4afc2b..a6e7b9edc6 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp 
@@ -133,11 +133,11 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready - gpu_dispersion_real_ready = false; // true for HIPPO when ready + gpu_dispersion_real_ready = true; // true for HIPPO when ready gpu_multipole_real_ready = false; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From e77df80ce22d0eff3fb8e1f84921f3c1b959609e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 24 Sep 2021 16:44:43 -0500 Subject: [PATCH 050/181] Working hippo multipole real-space term, added helper functions in a separate file --- lib/gpu/lal_base_amoeba.h | 8 +- lib/gpu/lal_hippo.cpp | 109 ++++++++++- lib/gpu/lal_hippo.cu | 357 +++++++++++++++++++++++++++++++++++-- lib/gpu/lal_hippo.h | 15 ++ lib/gpu/lal_hippo_ext.cpp | 7 +- lib/gpu/lal_hippo_extra.h | 326 +++++++++++++++++++++++++++++++++ src/GPU/pair_hippo_gpu.cpp | 9 +- 7 files changed, 797 insertions(+), 34 deletions(-) create mode 100644 lib/gpu/lal_hippo_extra.h diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 40da00f176..997e7b21ed 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -143,7 +143,7 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); /// Compute multipole real-space with device neighboring - int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, @@ -155,7 +155,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - int** compute_udirect2b(const int ago, const int inum_full, const int nall, + virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -169,7 +169,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - int** compute_umutual2b(const int ago, const int inum_full, const int nall, + virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -183,7 +183,7 @@ class BaseAmoeba { double *boxlo, double *prd, void **fieldp_ptr); /// Compute polar real-space with device neighboring - int** compute_polar_real(const int ago, const int inum_full, const int nall, + virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 07f8732bcb..fad749a185 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -56,6 +56,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_special_polar_piscale, const double 
*host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -69,7 +70,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, if (success!=0) return success; + // specific to HIPPO k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); + _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -98,8 +101,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, for (int i = 0; i < max_amclass; i++) { host_write2[i].x = host_csix[i]; host_write2[i].y = host_adisp[i]; - host_write2[i].z = (numtyp)0; - host_write2[i].w = (numtyp)0; + host_write2[i].z = host_pcore[i]; + host_write2[i].w = host_palpha[i]; } coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY); @@ -262,6 +265,93 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute multipole real-space +// --------------------------------------------------------------------------- +template +int** HippoT::compute_multipole_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
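  // Layout of atom->extra as viewed by the kernels (numtyp4 pointers), from
  // the casts visible elsewhere in this series; the polar4/polar5 offsets are
  // not shown in this hunk but presumably continue the same 4*nall stride:
  //   polar1 = &extra[0]       -> rpole[0..3]           (charge + dipole)
  //   polar2 = &extra[4*nall]  -> rpole[4],[5],[6],[8]   (quadrupole xx,xy,xz,yy)
  //   polar3 = &extra[8*nall]  -> rpole[9],[12], amtype, amgroup
  //   polar4, polar5           -> uind, uinp
  //   pval (HIPPO only)        -> appended after the uinp block by cast_extra_data()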
+ + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_mpole = off2_mpole; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=multipole_real(eflag,vflag); + + // leave the answers (forces, energies and virial) on the device, + // only copy them back in the last kernel (polar_real) + //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //device->add_ans_object(ans); + + this->hd_balancer.stop_timer(); + + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the multipole real-space term, returning tep // --------------------------------------------------------------------------- @@ -290,13 +380,14 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, &this->_felec, - &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + &coeff_amtype, &coeff_amclass, &sp_polar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f9020cf9a6..56da15f8aa 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -15,7 +15,8 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include -#include "lal_aux_fun1.h" +#include "lal_hippo_extra.h" +//#include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif @@ -404,6 +405,318 @@ _texture( q_tex,int2); #define MIN(A,B) ((A) < (B) ? 
(A) : (B)) #define MY_PIS (acctyp)1.77245385090551602729 +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ + +__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_nonpolar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global acctyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp off2, const numtyp cut2, + const numtyp c0, const numtyp c1, const numtyp c2, + const numtyp c3, const numtyp c4, const numtyp c5) +{ + int tid, ii, offset, i; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_charge(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int l=0; l<6; l++) virial[l]=(acctyp)0; + } + + acctyp4 tq; + tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; + + numtyp4* polar1 = (numtyp4*)(&extra[0]); + numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); + numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + if (iioff2) continue; + + const numtyp4 pol1j = polar1[j]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; + const numtyp4 pol2j = polar2[j]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; + const numtyp4 pol3j = polar3[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; + + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; + numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; + numtyp valk = coeff[jtype].z; // elepr[jtype]; + + const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; + numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + + // intermediates involving moments and separation distance + + numtyp dir = dix*xr + diy*yr + diz*zr; + numtyp qix = qixx*xr + qixy*yr + qixz*zr; + numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; + numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; + numtyp qir = qix*xr + qiy*yr + qiz*zr; + numtyp dkr = dkx*xr + dky*yr + dkz*zr; + numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; + numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; + numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; + numtyp qkr = qkx*xr + qky*yr + qkz*zr; + + numtyp dik = dix*dkx + diy*dky + diz*dkz; + numtyp qik = qix*qkx + qiy*qky + qiz*qkz; + numtyp diqk = dix*qkx + diy*qky + diz*qkz; + numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + qixx*qkxx + qiyy*qkyy + qizz*qkzz; + + // additional intermediates involving moments and distance + + numtyp dirx = diy*zr - diz*yr; + numtyp diry = diz*xr - dix*zr; + numtyp dirz = dix*yr - diy*xr; + numtyp dkrx = dky*zr - dkz*yr; + numtyp dkry = dkz*xr - dkx*zr; + numtyp dkrz = dkx*yr - dky*xr; + numtyp dikx = diy*dkz - diz*dky; + numtyp diky = diz*dkx - dix*dkz; + numtyp dikz = dix*dky - diy*dkx; + numtyp qirx = qiz*yr - 
qiy*zr; + numtyp qiry = qix*zr - qiz*xr; + numtyp qirz = qiy*xr - qix*yr; + numtyp qkrx = qkz*yr - qky*zr; + numtyp qkry = qkx*zr - qkz*xr; + numtyp qkrz = qky*xr - qkx*yr; + numtyp qikx = qky*qiz - qkz*qiy; + numtyp qiky = qkz*qix - qkx*qiz; + numtyp qikz = qkx*qiy - qky*qix; + numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz; + numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz; + numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz; + numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz; + numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz; + numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz; + numtyp qikrx = qizk*yr - qiyk*zr; + numtyp qikry = qixk*zr - qizk*xr; + numtyp qikrz = qiyk*xr - qixk*yr; + numtyp qkirx = qkzi*yr - qkyi*zr; + numtyp qkiry = qkxi*zr - qkzi*xr; + numtyp qkirz = qkyi*xr - qkxi*yr; + numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; + numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; + numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; + numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; + numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; + numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; + numtyp diqkrx = diqkz*yr - diqky*zr; + numtyp diqkry = diqkx*zr - diqkz*xr; + numtyp diqkrz = diqky*xr - diqkx*yr; + numtyp dkqirx = dkqiz*yr - dkqiy*zr; + numtyp dkqiry = dkqix*zr - dkqiz*xr; + numtyp dkqirz = dkqiy*xr - dkqix*yr; + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); + + // get reciprocal distance terms for this interaction + + numtyp r = ucl_sqrt(r2); + numtyp rinv = ucl_recip(r); + numtyp r2inv = rinv*rinv; + numtyp rr1 = rinv; + numtyp rr3 = rr1 * r2inv; + numtyp rr5 = (numtyp)3.0 * rr3 * r2inv; + numtyp rr7 = (numtyp)5.0 * rr5 * r2inv; + numtyp rr9 = (numtyp)7.0 * rr7 * r2inv; + numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; + + // get damping coefficients for the Pauli repulsion energy + numtyp dmpik[11]; + damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik); + + // calculate intermediate terms needed for the energy + + numtyp term1 = vali*valk; + numtyp term2 = valk*dir - vali*dkr + dik; + numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); + numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; + numtyp term5 = qir*qkr; + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; + + // compute the Pauli repulsion energy for this interaction + + numtyp sizik = sizi * sizk * factor_repel; + numtyp e = sizik * eterm * rr1; + + // calculate intermediate terms for force and torque + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + term4*dmpik[8] + term5*dmpik[10]; + term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; + term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; + term3 = (numtyp)2.0 * dmpik[4]; + term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]); + term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]); + numtyp term6 = (numtyp)4.0 * dmpik[6]; + + // compute the force components for this interaction + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + term4*qix + term5*qkx + term6*(qixk+qkxi); + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + term4*qiy + term5*qky + term6*(qiyk+qkyi); + numtyp frcz = 
de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + term4*qiz + term5*qkz + term6*(qizk+qkzi); + frcx = frcx*rr1 + eterm*rr3*xr; + frcy = frcy*rr1 + eterm*rr3*yr; + frcz = frcz*rr1 + eterm*rr3*zr; + + // compute the torque components for this interaction + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + term4*qirx - term6*(qikrx+qikx); + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + term4*qiry - term6*(qikry+qiky); + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + term4*qirz - term6*(qikrz+qikz); + ttmix = sizik * ttmix * rr1; + ttmiy = sizik * ttmiy * rr1; + ttmiz = sizik * ttmiz * rr1; + + // use energy switching if near the cutoff distance + + if (r2 > cut2) { + numtyp r3 = r2 * r; + numtyp r4 = r2 * r2; + numtyp r5 = r2 * r3; + numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 + + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; + dtaper *= e * rr1; + e *= taper; + frcx = frcx*taper - dtaper*xr; + frcy = frcy*taper - dtaper*yr; + frcz = frcz*taper - dtaper*zr; + ttmix *= taper; + ttmiy *= taper; + ttmiz *= taper; + } + + energy += e; + + // increment force-based gradient and torque on atom I + + f.x += frcx; + f.y += frcy; + f.z += frcz; + tq.x += ttmix; + tq.y += ttmiy; + tq.z += ttmiz; + + // increment the internal virial tensor components + if (EVFLAG && vflag) { + numtyp vxx = -xr * frcx; + numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy); + numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz); + numtyp vyy = -yr * frcy; + numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); + numtyp vzz = -zr * frcz; + + virial[0] += vxx; + virial[1] += vyy; + virial[2] += vzz; + virial[3] += vxy; + virial[4] += vxz; + virial[5] += vyz; + } + } // nbor + + } // ii { const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *_screen, @@ -65,6 +66,18 @@ class Hippo : public BaseAmoeba { const double aewald, const double off2_disp, double *charge, double *boxlo, double *prd); + /// Compute multipole real-space with device neighboring + virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_mpole, double *charge, + double *boxlo, double *prd, void **tep_ptr); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -105,6 +118,8 @@ class Hippo : public BaseAmoeba { UCL_Kernel k_dispersion; + UCL_Vector _pval; + protected: bool _allocated; int dispersion_real(const int eflag, const int vflag); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index b9e31e7b20..fa09e7bce4 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -38,6 +38,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double 
*host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -73,7 +74,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); @@ -97,7 +99,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, - host_csix, host_adisp, nlocal, nall, max_nbors, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, screen, polar_dscale, polar_uscale); diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h new file mode 100644 index 0000000000..890ce51121 --- /dev/null +++ b/lib/gpu/lal_hippo_extra.h @@ -0,0 +1,326 @@ +/// ************************************************************************** +// hippo_extra.h +// ------------------- +// Trung Dac Nguyen +// +// Device code for hippo math routines +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// ***************************************************************************/* + +#ifndef LAL_HIPPO_EXTRA_H +#define LAL_HIPPO_EXTRA_H + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#else +#endif + +#define MY_PI2 (numtyp)1.57079632679489661923 +#define MY_PI4 (numtyp)0.78539816339744830962 + +/* ---------------------------------------------------------------------- + damprep generates coefficients for the Pauli repulsion + damping function for powers of the interatomic distance + + literature reference: + + J. A. Rackers and J. W. 
Ponder, "Classical Pauli Repulsion: An + Anisotropic, Atomic Multipole Model", Journal of Chemical Physics, + 150, 084104 (2019) +------------------------------------------------------------------------- */ + +ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, + const numtyp rr3, const numtyp rr5, const numtyp rr7, + const numtyp rr9, const numtyp rr11, const int rorder, + const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11]) +{ + numtyp r3,r4; + numtyp r5,r6,r7,r8; + numtyp s,ds,d2s; + numtyp d3s,d4s,d5s; + numtyp dmpi2,dmpk2; + numtyp dmpi22,dmpi23; + numtyp dmpi24,dmpi25; + numtyp dmpi26,dmpi27; + numtyp dmpk22,dmpk23; + numtyp dmpk24,dmpk25; + numtyp dmpk26; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp pre,term,tmp; + + // compute tolerance value for damping exponents + + eps = (numtyp)0.001; + diff = dmpi-dmpk; + if (diff < (numtyp)0) diff = -diff; + + // treat the case where alpha damping exponents are equal + + if (diff < eps) { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + r6 = r5 * r; + r7 = r6 * r; + dmpi2 = (numtyp)0.5 * dmpi; + dampi = dmpi2 * r; + expi = ucl_exp(-dampi); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpi26 = dmpi25 * dmpi2; + pre = (numtyp)128.0; + s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi; + + ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0; + d2s = dmpi24 * expi * r5 / 9.0; + d3s = dmpi25 * expi * r6 / 45.0; + d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0; + if (rorder >= 11) { + r8 = r7 * r; + dmpi27 = dmpi2 * dmpi26; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + } + + // treat the case where alpha damping exponents are unequal + + } else { + r3 = r2 * r; + r4 = r3 * r; + r5 = r4 * r; + dmpi2 = 0.5 * dmpi; + dmpk2 = 0.5 * dmpk; + dampi = dmpi2 * r; + dampk = dmpk2 * r; + expi = exp(-dampi); + expk = exp(-dampk); + dmpi22 = dmpi2 * dmpi2; + dmpi23 = dmpi22 * dmpi2; + dmpi24 = dmpi23 * dmpi2; + dmpi25 = dmpi24 * dmpi2; + dmpk22 = dmpk2 * dmpk2; + dmpk23 = dmpk22 * dmpk2; + dmpk24 = dmpk23 * dmpk2; + dmpk25 = dmpk24 * dmpk2; + term = dmpi22 - dmpk22; + pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); + tmp = 4.0 * dmpi2 * dmpk2 / term; + s = (dampi-tmp)*expk + (dampk+tmp)*expi; + + ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - + (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + + (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - + 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + + (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - + (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - + (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + 
(8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + 4.0*dmpi2*dmpk2/term) * expi; + + if (rorder >= 11) { + r6 = r5 * r; + dmpi26 = dmpi25 * dmpi2; + dmpk26 = dmpk25 * dmpk2; + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; + } + } + + // convert partial derivatives into full derivatives + + s = s * rr1; + ds = ds * rr3; + d2s = d2s * rr5; + d3s = d3s * rr7; + d4s = d4s * rr9; + d5s = d5s * rr11; + dmpik[0] = 0.5 * pre * s * s; + dmpik[2] = pre * s * ds; + dmpik[4] = pre * (s*d2s + ds*ds); + dmpik[6] = pre * (s*d3s + 3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s); +} + +/* ---------------------------------------------------------------------- + damppole generates coefficients for the charge penetration + damping function for powers of the interatomic distance + + literature references: + + L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the + Effective Fragment Potential Method: Theory and Application to + the Benzene Dimer", Journal of Computational Chemistry, 28, + 276-291 (2007) [Gordon f1 and f2 models] + + J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and + J. W. 
Ponder, "An Optimized Charge Penetration Model for Use with + the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19, + 276-291 (2017) +------------------------------------------------------------------------- */ + +ucl_inline void damppole(const numtyp r, const int rorder, + const numtyp alphai, const numtyp alphak, + numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampi6,dampi7; + numtyp dampi8; + numtyp dampk2,dampk3; + numtyp dampk4,dampk5; + numtyp dampk6; + + // compute tolerance and exponential damping factors + + eps = 0.001; + diff = fabs(alphai-alphak); + dampi = alphai * r; + dampk = alphak * r; + expi = exp(-dampi); + expk = exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi; + dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi; + dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi; + dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi; + dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi; + if (diff < eps) { + dmpk[0] = dmpi[0]; + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + dmpk[8] = dmpi[8]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dampk5 = dampk2 * dampk3; + dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk; + dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk; + dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk; + dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk; + dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk; + } + + // valence-valence charge penetration damping for Gordon f1 + + if (diff < eps) { + dampi6 = dampi3 * dampi3; + dampi7 = dampi3 * dampi4; + dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + + dampi3/48.0)*expi; + dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + + 7.0*dampi3/48.0 + dampi4/48.0)*expi; + dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/144.0)*expi; + dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi; + dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0)*expi; + if (rorder >= 11) { + dampi8 = dampi4 * dampi4; + dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + + dampi7/5040.0 + dampi8/45360.0)*expi; + } + + } else { + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - + termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk; + dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - + termk2*(1.0+dampk+0.5*dampk2)*expk - + 2.0*termi2*termk*(1.0+dampi)*expi - + 2.0*termk2*termi*(1.0+dampk)*expk; + dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk; + dmpik[6] = 1.0 - termi2*(1.0 + dampi 
+ 0.5*dampi2 + + dampi3/6.0 + dampi4/30.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk; + dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 4.0*dampi4/105.0 + dampi5/210.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + + 4.0*dampk4/105.0 + dampk5/210.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + + 2.0*dampi3/21.0 + dampi4/105.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + + 2.0*dampk3/21.0 + dampk4/105.0)*expk; + + if (rorder >= 11) { + dampi6 = dampi3 * dampi3; + dampk6 = dampk3 * dampk3; + dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + + dampi6/1890.0)*expi - + termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + + 2.0*dampk5/315.0 + dampk6/1890.0)*expk - + 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + + dampi4/63.0 + dampi5/945.0)*expi - + 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + + dampk4/63.0 + dampk5/945.0)*expk; + } + } +} + + + +#endif diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index a6e7b9edc6..91465abb82 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -59,6 +59,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, @@ -191,10 +192,10 @@ void PairHippoGPU::init_style() pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, csix, adisp, atom->nlocal, - atom->nlocal+atom->nghost, mnf, maxspecial, - maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tq_size); + special_polar_pscale, csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) From 78ef0d631fefab0af68d22371271cb8935c5e3b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 25 Sep 2021 12:25:34 -0500 Subject: [PATCH 051/181] Working on the multipole real-space term of hippo --- lib/gpu/lal_base_amoeba.cpp | 10 ++- lib/gpu/lal_base_amoeba.h | 4 +- lib/gpu/lal_hippo.cpp | 133 +++++++++++++++++++++++++++++++----- lib/gpu/lal_hippo.cu | 83 ++++++++++++++-------- lib/gpu/lal_hippo.h | 16 ++++- lib/gpu/lal_hippo_ext.cpp | 4 +- src/GPU/pair_hippo_gpu.cpp | 20 +++--- 7 files changed, 207 insertions(+), 63 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index b8e927d6ce..1a299e902f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -757,7 +757,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp) { + double** uind, double** uinp, double* pval) { // signal that we need to transfer extra data from the host atom->extra_data_unavail(); @@ -812,6 +812,14 @@ void 
BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+2] = uinp[i][2]; } } + + if (pval) { + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = pval[i]; + } + } } template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 997e7b21ed..fc665ec731 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -131,7 +131,7 @@ class BaseAmoeba { bool &success); /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed - int** precompute(const int ago, const int inum_full, const int nall, + virtual int** precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, @@ -232,7 +232,7 @@ class BaseAmoeba { /// cast host arrays into a single array for atom->extra void cast_extra_data(int* amtype, int* amgroup, double** rpole, - double** uind, double** uinp); + double** uind, double** uinp, double* pval=nullptr); /// Per-atom arrays UCL_Vector _tep, _fieldp; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index fad749a185..10d75f2393 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -155,6 +155,102 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } +// --------------------------------------------------------------------------- +// Prepare for multiple kernel calls in a time step: +// - reallocate per-atom arrays, if needed +// - transfer extra data from host to device +// - build the full neighbor lists for use by different kernels +// --------------------------------------------------------------------------- + +template +int** HippoT::precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // ------------------- Resize 1-5 neighbor arrays ------------------------ + + if (nall>this->_nmax) { + this->_nmax = nall; + this->dev_nspecial15.clear(); + this->dev_special15.clear(); + this->dev_special15_t.clear(); + this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + } + + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + this->resize_atom(0,nall,success); + this->zero_timers(); + return nullptr; + } + + this->hd_balancer.balance(cpu_time); + int inum=this->hd_balancer.get_gpu_count(ago,inum_full); + this->ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if 
(ago==0) { + this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, nspecial15, special15, + success); + if (!success) + return nullptr; + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + } else { + this->atom->cast_x_data(host_x,host_type); + this->atom->cast_q_data(host_q); + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->hd_balancer.start_timer(); + this->atom->add_x_data(host_x,host_type); + } + this->atom->add_q_data(); + this->atom->add_extra_data(); + + *ilist=this->nbor->host_ilist.begin(); + *jnum=this->nbor->host_acc.begin(); + + this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); + + // re-allocate dev_short_nbor if necessary + if (inum_full*(2+this->_max_nbors) > this->dev_short_nbor.cols()) { + int _nmax=static_cast(static_cast(inum_full)*1.10); + this->dev_short_nbor.resize((2+this->_max_nbors)*this->_nmax); + } + + return this->nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- @@ -201,9 +297,9 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -270,19 +366,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int** HippoT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double* host_pval, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_mpole, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -311,9 +408,9 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. 
int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -380,7 +477,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval, + this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 56da15f8aa..bc5d9270d4 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -908,7 +908,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp *restrict pval, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, @@ -945,6 +944,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp4* polar1 = (numtyp4*)(&extra[0]); numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); if (ii { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); + /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed + int** precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double* host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **&ilist, int **&numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); + /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -69,8 +81,8 @@ class Hippo : public BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index fa09e7bce4..390f713d98 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -140,7 +140,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int 
*host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -148,7 +148,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, + host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 91465abb82..6ac22e0721 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -79,7 +79,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double *sublo, double *subhi, tagint *tag, + double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -135,7 +135,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // always false for HIPPO gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = true; // true for HIPPO when ready - gpu_multipole_real_ready = false; + gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; gpu_polar_real_ready = false; @@ -294,14 +294,14 @@ void PairHippoGPU::multipole_real() double felec = electric / am_dielectric; firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From f8bc091cb8336a486823b4df25c9339a18808cf5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 25 Sep 2021 13:17:06 -0500 Subject: [PATCH 052/181] Kept working on the multipole real-space term of hippo --- lib/gpu/lal_base_amoeba.cpp | 9 ++++--- lib/gpu/lal_hippo.cu | 43 ++++++++++++++++++--------------- src/AMOEBA/amoeba_multipole.cpp | 3 ++- src/GPU/pair_hippo_gpu.cpp | 2 +- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 
1a299e902f..c4fdb8c9e5 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -793,8 +793,8 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+3] = (numtyp)amgroup[i]; } + n += nstride*_nall; if (uind) { - n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = uind[i][0]; @@ -802,9 +802,9 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, pextra[idx+2] = uind[i][2]; } } - + + n += nstride*_nall; if (uinp) { - n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = uinp[i][0]; @@ -813,8 +813,9 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, } } + n += nstride*_nall; if (pval) { - n += nstride*_nall; + for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = pval[i]; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index bc5d9270d4..040ecf9308 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,6 +1032,9 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; + if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", + j, corei, corek, alphai, alphak, vali, valk); + // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -1149,22 +1152,22 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp dmpij[11]; damppole(r,11,alphai,alphak,dmpi,dmpj,dmpij); numtyp scalek = factor_mpole; - numtyp rr1i = bn[0] - (1.0-scalek*dmpi[0])*rr1; - numtyp rr3i = bn[1] - (1.0-scalek*dmpi[2])*rr3; - numtyp rr5i = bn[2] - (1.0-scalek*dmpi[4])*rr5; - numtyp rr7i = bn[3] - (1.0-scalek*dmpi[6])*rr7; - numtyp rr1k = bn[0] - (1.0-scalek*dmpj[0])*rr1; - numtyp rr3k = bn[1] - (1.0-scalek*dmpj[2])*rr3; - numtyp rr5k = bn[2] - (1.0-scalek*dmpj[4])*rr5; - numtyp rr7k = bn[3] - (1.0-scalek*dmpj[6])*rr7; - numtyp rr1ik = bn[0] - (1.0-scalek*dmpij[0])*rr1; - numtyp rr3ik = bn[1] - (1.0-scalek*dmpij[2])*rr3; - numtyp rr5ik = bn[2] - (1.0-scalek*dmpij[4])*rr5; - numtyp rr7ik = bn[3] - (1.0-scalek*dmpij[6])*rr7; - numtyp rr9ik = bn[4] - (1.0-scalek*dmpij[8])*rr9; - numtyp rr11ik = bn[5] - (1.0-scalek*dmpij[10])*rr11; - rr1 = bn[0] - (1.0-scalek)*rr1; - rr3 = bn[1] - (1.0-scalek)*rr3; + numtyp rr1i = bn[0] - ((numtyp)1.0-scalek*dmpi[0])*rr1; + numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; + numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; + numtyp rr7i = bn[3] - ((numtyp)1.0-scalek*dmpi[6])*rr7; + numtyp rr1k = bn[0] - ((numtyp)1.0-scalek*dmpj[0])*rr1; + numtyp rr3k = bn[1] - ((numtyp)1.0-scalek*dmpj[2])*rr3; + numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpj[4])*rr5; + numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpj[6])*rr7; + numtyp rr1ik = bn[0] - ((numtyp)1.0-scalek*dmpij[0])*rr1; + numtyp rr3ik = bn[1] - ((numtyp)1.0-scalek*dmpij[2])*rr3; + numtyp rr5ik = bn[2] - ((numtyp)1.0-scalek*dmpij[4])*rr5; + numtyp rr7ik = bn[3] - ((numtyp)1.0-scalek*dmpij[6])*rr7; + numtyp rr9ik = bn[4] - ((numtyp)1.0-scalek*dmpij[8])*rr9; + numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; + rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; + rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + @@ -1178,10 +1181,10 @@ __kernel void k_hippo_multipole(const 
__global numtyp4 *restrict x_, term3i*rr7i + term3k*rr7k + term3ik*rr7ik; term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; - term3 = 2.0 * rr5ik; - term4 = -2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); - term5 = -2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); - term6 = 4.0 * rr7ik; + term3 = (numtyp)2.0 * rr5ik; + term4 = (numtyp)-2.0 * (corek*rr5i+valk*rr5ik - dkr*rr7ik+qkr*rr9ik); + term5 = (numtyp)-2.0 * (corei*rr5k+vali*rr5ik + dir*rr7ik+qir*rr9ik); + term6 = (numtyp)4.0 * rr7ik; rr3 = rr3ik; energy += e; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 3f5c9082e7..8d9e0c101d 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -379,7 +379,8 @@ void PairAmoeba::multipole_real() corek = pcore[jclass]; alphak = palpha[jclass]; valk = pval[j]; - + if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", + j, corei, corek, alphai, alphak, vali, valk); /* printf("HIPPO MPOLE ij %d %d: pcore/alpha/val I %g %g %g: J %g %g %g\n", atom->tag[i],atom->tag[j],corei,alphai,vali,corek,alphak,valk); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 6ac22e0721..3bad2d4f52 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -292,7 +292,7 @@ void PairHippoGPU::multipole_real() // set the energy unit conversion factor for multipolar real-space calculation double felec = electric / am_dielectric; - + printf("hippo gpu multipole\n"); firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, pval, sublo, subhi, atom->tag, From edbed9c9c9c268701d7061dba651179931997c11 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 26 Sep 2021 00:13:40 -0500 Subject: [PATCH 053/181] Fixed bugs in HippoT::compute_dispersion_real and compute_multipole_real to ensure that answers only get copied back from device in the last kernel activated. 
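
Note on the staging idea behind this fix: the stages that are not last (dispersion real-space, and later multipole real-space) leave their forces, energies, and virial in the device-side answer buffers, and the copy back to the host is requested once, by the last kernel that actually runs (multipole_real at this point in the series, with polar_real intended to take over once that kernel is ported). The sketch below only illustrates that control flow with invented stand-in names (Stage, launch_kernel, copy_answers_to_host); it is not the LAMMPS_AL Answer/Device API.

#include <cstdio>
#include <vector>

// Illustration only: every stage accumulates into a device-resident buffer,
// and the single host transfer happens in whichever stage is flagged as last.
struct Stage {
  const char *name;
  bool last;                     // true only for the final real-space stage
};

static void launch_kernel(const char *name, std::vector<double> &dev_buf) {
  dev_buf.push_back(1.0);        // stands in for accumulating on the GPU
  std::printf("%s: answers stay on the device\n", name);
}

static void copy_answers_to_host(const std::vector<double> &dev_buf) {
  std::printf("one transfer back to the host: %zu blocks\n", dev_buf.size());
}

int main() {
  std::vector<double> device_answers;          // stands in for ans->force/engv
  const Stage stages[] = {{"dispersion_real", false},
                          {"multipole_real", false},
                          {"polar_real", true}};
  for (const Stage &s : stages) {
    launch_kernel(s.name, device_answers);
    if (s.last) copy_answers_to_host(device_answers);  // earlier stages skip this
  }
  return 0;
}
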
--- lib/gpu/lal_hippo.cpp | 11 +++++------ lib/gpu/lal_hippo.cu | 3 --- lib/gpu/lal_hippo.h | 2 -- src/AMOEBA/amoeba_multipole.cpp | 9 +++++---- src/GPU/pair_hippo_gpu.cpp | 2 +- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 10d75f2393..b4b84cc47d 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -72,7 +72,6 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, // specific to HIPPO k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); - _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY); // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; @@ -312,8 +311,8 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers // (forces, energies and virial) on the device until the last kernel - this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -430,9 +429,9 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const int red_blocks=multipole_real(eflag,vflag); // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (polar_real) - //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //device->add_ans_object(ans); + // only copy them back in the last kernel (this one, or polar_real once done) + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 040ecf9308..3bfd4f7019 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,9 +1032,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; - if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", - j, corei, corek, alphai, alphak, vali, valk); - // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index ae604e8401..251f909b78 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -130,8 +130,6 @@ class Hippo : public BaseAmoeba { UCL_Kernel k_dispersion; - UCL_Vector _pval; - protected: bool _allocated; int dispersion_real(const int eflag, const int vflag); diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 8d9e0c101d..945ee976eb 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -379,8 +379,7 @@ void PairAmoeba::multipole_real() corek = pcore[jclass]; alphak = palpha[jclass]; valk = pval[j]; - if (i==0 && j < 10) printf("j = %d: corei = %f; corek = %f; alphai = %f; alphak = %f; vali = %f; valk = %f\n", - j, corei, corek, alphai, alphak, vali, valk); + /* printf("HIPPO MPOLE ij %d %d: pcore/alpha/val I %g %g %g: J %g %g %g\n", atom->tag[i],atom->tag[j],corei,alphai,vali,corek,alphak,valk); @@ -421,6 +420,8 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; + + // find damped 
multipole intermediates for force and torque de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + @@ -527,14 +528,14 @@ void PairAmoeba::multipole_real() // increment force-based gradient and torque on second site // commenting out j parts for DEBUGGING - + fmpole[j][0] -= frcx; fmpole[j][1] -= frcy; fmpole[j][2] -= frcz; tq[j][0] += ttmk[0]; tq[j][1] += ttmk[1]; tq[j][2] += ttmk[2]; - + // increment the virial due to pairwise Cartesian forces vxx = -xr * frcx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 3bad2d4f52..6ac22e0721 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -292,7 +292,7 @@ void PairHippoGPU::multipole_real() // set the energy unit conversion factor for multipolar real-space calculation double felec = electric / am_dielectric; - printf("hippo gpu multipole\n"); + firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, pval, sublo, subhi, atom->tag, From 5193dcf8c558eaadd49d83a52d53869c3fe1a9cf Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 26 Sep 2021 00:56:29 -0500 Subject: [PATCH 054/181] Working on the polar real-space term of hippo --- lib/gpu/lal_hippo.cpp | 3 +- lib/gpu/lal_hippo.cu | 395 ++++++++++++++++++------------------ src/AMOEBA/amoeba_polar.cpp | 2 +- 3 files changed, 205 insertions(+), 195 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index b4b84cc47d..12bf9cfd3c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -597,7 +597,8 @@ int HippoT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 3bfd4f7019..afc3cf10af 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1642,7 +1642,8 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff_amtype, + const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, const __global int *dev_packed, @@ -1683,6 +1684,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); + numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); //numtyp4 xi__; @@ -1749,8 +1751,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, // debug: // xi__ = ix; xi__.w = itype; - numtyp pdi = coeff[itype].x; - numtyp pti = coeff[itype].y; + numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; + numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; + numtyp vali = polar6[i].x; for ( ; nbor Date: Sun, 26 Sep 2021 09:11:09 -0500 Subject: [PATCH 055/181] Fixed bugs in the polar real kernel in hippo, getting closer.. 
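
A note on the extra-array layout the polar kernel above relies on (polar1 = &extra[0], polar2 = &extra[4*nall], ..., polar6 = &extra[20*nall]): cast_extra_data() appears to write the per-atom quantities as consecutive blocks of nall four-wide slots, so block b for atom i starts at extra[4*b*nall + 4*i], with the first blocks holding the rpole components plus amtype/amgroup, followed by uind, uinp, and pval. The helper below is only a host-side sketch of that packing under the assumed stride of 4; pack_block and its arguments are illustrative names, not part of the library.

#include <cstddef>
#include <vector>

// Sketch of the strided packing implied by the kernel offsets above.
// Blocks 0..5 would correspond to polar1..polar6; block 5 carries pval in .x.
static void pack_block(std::vector<double> &extra, int block, int nall,
                       const double *v0, const double *v1,
                       const double *v2, const double *v3) {
  const int stride = 4;                                  // one 4-vector per atom
  const std::size_t base =
      static_cast<std::size_t>(block) * stride * nall;   // start of this block
  for (int i = 0; i < nall; i++) {
    const std::size_t idx = base + static_cast<std::size_t>(i) * stride;
    extra[idx]     = v0 ? v0[i] : 0.0;
    extra[idx + 1] = v1 ? v1[i] : 0.0;
    extra[idx + 2] = v2 ? v2[i] : 0.0;
    extra[idx + 3] = v3 ? v3[i] : 0.0;                   // padding when unused
  }
}

int main() {
  const int nall = 8, nblocks = 6;                       // room for block 5 (polar6)
  std::vector<double> extra(static_cast<std::size_t>(4) * nblocks * nall, 0.0);
  std::vector<double> pval(nall, 1.0);
  pack_block(extra, 5, nall, pval.data(), nullptr, nullptr, nullptr);  // pval slot
  return 0;
}
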
--- lib/gpu/lal_hippo.cpp | 92 +++++++++++++++++++- lib/gpu/lal_hippo.cu | 171 +++++++++++++------------------------ lib/gpu/lal_hippo.h | 13 +++ lib/gpu/lal_hippo_ext.cpp | 4 +- src/GPU/pair_hippo_gpu.cpp | 6 +- 5 files changed, 169 insertions(+), 117 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 12bf9cfd3c..0f87104832 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -568,6 +568,94 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute polar real-space +// --------------------------------------------------------------------------- +template +int** HippoT::compute_polar_real(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double felec, + const double off2_polar, double *host_q, + double *boxlo, double *prd, void **tep_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
+ + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + + this->_off2_polar = off2_polar; + this->_felec = felec; + this->_aewald = aewald; + const int red_blocks=polar_real(eflag,vflag); + + // only copy answers (forces, energies and virial) back from the device + // in the last kernel (which is polar_real here) + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); + + this->hd_balancer.stop_timer(); + + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); +/* + printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_tep[4*i]); + printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index afc3cf10af..1f9c14d4da 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1753,7 +1753,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; - numtyp vali = polar6[i].x; + numtyp vali = polar6[i].x; for ( ; nbor { const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute polar real-space with device neighboring + virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, double *sublo, double *subhi, + tagint *tag, int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + const double aewald, const double felec, const double off2_polar, + double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 390f713d98..1851c3aba3 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -194,7 +194,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint 
**special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, @@ -202,7 +202,7 @@ int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 6ac22e0721..23395e5fe3 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -108,7 +108,7 @@ int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -1089,7 +1089,7 @@ void PairHippoGPU::polar_real() firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, + rpole, uind, uinp, pval, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, From 2efd841a7e29248170820ef1b6a079fa156baf07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 11:35:35 -0500 Subject: [PATCH 056/181] Trying to find the difference in the neighbor list build in hippo vs amoeba --- lib/gpu/lal_hippo.cpp | 6 +-- lib/gpu/lal_hippo.cu | 17 +++++-- lib/gpu/lal_hippo_extra.h | 92 ++++++++++++++++++------------------- src/AMOEBA/amoeba_polar.cpp | 12 +++-- src/AMOEBA/pair_amoeba.cpp | 45 +++++++++++++++++- src/GPU/pair_hippo_gpu.cpp | 4 +- 6 files changed, 114 insertions(+), 62 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 0f87104832..77bbebbb9a 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -444,7 +444,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); 
} -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1f9c14d4da..95f18db7d2 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,6 +1032,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -1772,7 +1773,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = polar1[j].x; // rpole[j][0]; numtyp dkx = polar1[j].y; // rpole[j][1]; @@ -1800,6 +1801,11 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale; const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; factor_wscale = sp_pol.x; // special_polar_wscale[sbmask15(jextra)]; + // NOTE: for in.water_box/water_hexamer.hippo: there exist wscale = 0.2 + //if (factor_wscale < (numtyp)1.0) continue; //factor_wscale = (numtyp)0; + + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, r, factor_wscale); + if (igroup == jgroup) { factor_dscale = factor_pscale = sp_pol.y; // special_polar_piscale[sbmask15(jextra)]; factor_uscale = polar_uscale; @@ -1910,7 +1916,8 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + + // get the field gradient for direct polarization force numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; @@ -1929,7 +1936,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, term1k = rr3k - rr5k*xr*xr; term2k = (numtyp)2.0*rr5k*xr; term3k = rr7k*xr*xr - rr5k; - term4k = 2.0*rr5k; + term4k = (numtyp)2.0*rr5k; term5k = (numtyp)5.0*rr7k*xr; term6k = rr9k*xr*xr; tixx = vali*term1i + corei*term1core + dix*term2i - dir*term3i - @@ -2046,7 +2053,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2159,7 +2166,7 @@ __kernel void k_special15(__global int * dev_nbor, int which = sj >> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; - + if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag); if (!which) { int offset=ii; for (int k=0; k= 11) { r8 = r7 * r; dmpi27 = dmpi2 * dmpi26; - d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0; + d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/(numtyp)3.0) * expi / (numtyp)945.0; } // treat the case where alpha damping exponents are unequal @@ -97,12 +97,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, r3 = r2 * r; r4 = r3 * r; r5 = r4 * r; - dmpi2 = 0.5 * dmpi; - dmpk2 = 0.5 * dmpk; + dmpi2 = (numtyp)0.5 * dmpi; + dmpk2 = (numtyp)0.5 * dmpk; dampi = dmpi2 * r; dampk = dmpk2 * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); dmpi22 = dmpi2 * dmpi2; dmpi23 = dmpi22 * dmpi2; dmpi24 = dmpi23 * dmpi2; @@ -112,34 +112,34 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const 
numtyp rr1, dmpk24 = dmpk23 * dmpk2; dmpk25 = dmpk24 * dmpk2; term = dmpi22 - dmpk22; - pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0); - tmp = 4.0 * dmpi2 * dmpk2 / term; + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0); + tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - - (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + - (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + - 4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - - 4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + - (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + - 4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - - (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - - (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + - (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + - (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + (numtyp)4.0*dmpi2*dmpk2/term) * expi; + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + (4.0/105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/21.0)*dmpi24*dmpk2*r3/term + + (12.0/7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; if (rorder >= 11) { @@ -217,8 +217,8 @@ ucl_inline void damppole(const numtyp r, const int rorder, diff = fabs(alphai-alphak); dampi = alphai 
* r; dampk = alphak * r; - expi = exp(-dampi); - expk = exp(-dampk); + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); // core-valence charge penetration damping for Gordon f1 @@ -308,15 +308,15 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 5.0*dampi4/126.0 + 2.0*dampi5/315.0 + - dampi6/1890.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + - 2.0*dampk5/315.0 + dampk6/1890.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + - dampi4/63.0 + dampi5/945.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + - dampk4/63.0 + dampk5/945.0)*expk; + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/63.0 + dampk5/(numtyp)945.0)*expk; } } } diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e6b3e6ef70..4fa8a5d892 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -366,6 +366,7 @@ void PairAmoeba::polar_real() yr = x[j][1] - yi; zr = x[j][2] - zi; r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; jtype = amtype[j]; @@ -393,7 +394,7 @@ void PairAmoeba::polar_real() factor_uscale = 1.0; } } - + //if (i == 12 && j < 20) printf("j = %d: r = %f; factor_wscale = %f\n", j, sqrt(r2), factor_wscale); r = sqrt(r2); ck = rpole[j][0]; dkx = rpole[j][1]; @@ -567,7 +568,7 @@ void PairAmoeba::polar_real() ufld[j][0] += tkx3 + xr*tukr; ufld[j][1] += tky3 + yr*tukr; ufld[j][2] += tkz3 + zr*tukr; - + // get induced dipole field gradient used for quadrupole torques if (amoeba) { @@ -579,7 +580,6 @@ void PairAmoeba::polar_real() tkz5 = 2.0 * (psr5*uiz+dsr5*uizp); tuir = -psr7*ukr - dsr7*ukrp; tukr = -psr7*uir - dsr7*uirp; - // reached here... 
} else if (hippo) { tix5 = 4.0 * (rr5i*ukx); tiy5 = 4.0 * (rr5i*uky); @@ -597,7 +597,6 @@ void PairAmoeba::polar_real() dufld[i][3] += xr*tiz5 + zr*tix5 + 2.0*xr*zr*tuir; dufld[i][4] += yr*tiz5 + zr*tiy5 + 2.0*yr*zr*tuir; dufld[i][5] += zr*tiz5 + zr*zr*tuir; - dufld[j][0] -= xr*tkx5 + xr*xr*tukr; dufld[j][1] -= xr*tky5 + yr*tkx5 + 2.0*xr*yr*tukr; dufld[j][2] -= yr*tky5 + yr*yr*tukr; @@ -668,7 +667,7 @@ void PairAmoeba::polar_real() frcx = depx; frcy = depy; frcz = depz; - + // get the dEp/dR terms used for direct polarization force term1 = bn[2] - psc3*rr5; @@ -855,6 +854,7 @@ void PairAmoeba::polar_real() frcx = -2.0 * depx; frcy = -2.0 * depy; frcz = -2.0 * depz; + } // get the dtau/dr terms used for mutual polarization force @@ -1199,6 +1199,8 @@ void PairAmoeba::polar_real() torque2force(i,tep,fix,fiy,fiz,fpolar); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, tep[0], tep[1], tep[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 5157739f0e..1ff35e7ce1 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -242,6 +242,47 @@ void PairAmoeba::compute(int eflag, int vflag) time_induce = time_polar = time_qxfer = 0.0; } + { // DEBUGGING + double **x = atom->x; + int inum,jnum; + int *ilist,*jlist,*numneigh,**firstneigh; + + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + + if (use_ewald) choose(MPOLE_LONG); + else choose(MPOLE); + + int i,ii,j,jj; + for (ii = 0; ii < inum; ii++) { + i = ilist[ii]; + double xi = x[i][0]; + double yi = x[i][1]; + double zi = x[i][2]; + + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + double factor_mpole = special_mpole[sbmask15(j)]; + j &= NEIGHMASK15; + + double xr = x[j][0] - xi; + double yr = x[j][1] - yi; + double zr = x[j][2] - zi; + double r2 = xr*xr + yr*yr + zr*zr; + if (r2 > off2) continue; + double r = sqrt(r2); + if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); + } + } + + + } // DEBUGGING + double evdwl; evdwl = 0.0; @@ -973,8 +1014,8 @@ void PairAmoeba::init_style() int irequest = neighbor->request(this,instance_me); // for DEBUGGING with GPU - //neighbor->requests[irequest]->half = 0; - //neighbor->requests[irequest]->full = 1; + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 23395e5fe3..4da0056029 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = true; + gpu_polar_real_ready = false; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } @@ -1137,6 +1137,8 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, _tq[2] = tq_ptr[4*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); + //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]); + iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; From c6148938e5682075c10aff97f4cc9992ca5abc65 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 12:36:11 -0500 Subject: [PATCH 057/181] Debugging the neighbor list in hippo vs amoeba --- src/AMOEBA/pair_amoeba.cpp | 4 ++-- 
src/DIPOLE/pair_lj_cut_dipole_cut.cpp | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 1ff35e7ce1..59d85814ec 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -274,9 +274,9 @@ void PairAmoeba::compute(int eflag, int vflag) double yr = x[j][1] - yi; double zr = x[j][2] - zi; double r2 = xr*xr + yr*yr + zr*zr; - if (r2 > off2) continue; + //if (r2 > off2) continue; double r = sqrt(r2); - if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); + if (i == 0) printf("j = %d: tag = %d; r = %f; factor_mpole = %f\n", j, r, atom->tag[j], factor_mpole); } } diff --git a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp index 0f8a7317c6..e536d9d76e 100644 --- a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp +++ b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp @@ -19,6 +19,7 @@ #include "atom.h" #include "neighbor.h" #include "neigh_list.h" +#include "neigh_request.h" #include "comm.h" #include "force.h" #include "memory.h" @@ -90,6 +91,8 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) numneigh = list->numneigh; firstneigh = list->firstneigh; + int maxsize = 10; + // loop over neighbors of my atoms for (ii = 0; ii < inum; ii++) { @@ -102,6 +105,13 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) jlist = firstneigh[i]; jnum = numneigh[i]; + double scale_dipole = 1.0; + if (jnum > maxsize) { + scale_dipole = maxsize; //1.0/(double)maxsize; + } else { + scale_dipole = jnum; //1.0/(double)jnum; + } + for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; @@ -207,7 +217,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) // total force - fq = factor_coul*qqrd2e; + fq = scale_dipole*factor_coul*qqrd2e; fx = fq*forcecoulx + delx*forcelj; fy = fq*forcecouly + dely*forcelj; fz = fq*forcecoulz + delz*forcelj; @@ -221,7 +231,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) torque[i][1] += fq*tiycoul; torque[i][2] += fq*tizcoul; - if (newton_pair || j < nlocal) { + if (newton_pair) { f[j][0] -= fx; f[j][1] -= fy; f[j][2] -= fz; @@ -362,7 +372,9 @@ void PairLJCutDipoleCut::init_style() if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag) error->all(FLERR,"Pair dipole/cut requires atom attributes q, mu, torque"); - neighbor->request(this,instance_me); + int irequest = neighbor->request(this,instance_me); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; } /* ---------------------------------------------------------------------- From d27836952aa4a753c931935d4f818c734282b9f7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 27 Sep 2021 16:12:49 -0500 Subject: [PATCH 058/181] Fixed a bug in neighbor.cpp to make special_flag consistent between amoeba and hippo (to be 2 instead of 0), that caused missing neighbors with hippo --- lib/gpu/lal_hippo.cpp | 4 ++-- src/AMOEBA/pair_amoeba.cpp | 45 ++------------------------------------ src/GPU/pair_hippo_gpu.cpp | 2 +- src/neighbor.cpp | 2 ++ 4 files changed, 7 insertions(+), 46 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 77bbebbb9a..d31370be73 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -430,8 +430,8 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) - 
this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - this->device->add_ans_object(this->ans); + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 59d85814ec..5157739f0e 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -242,47 +242,6 @@ void PairAmoeba::compute(int eflag, int vflag) time_induce = time_polar = time_qxfer = 0.0; } - { // DEBUGGING - double **x = atom->x; - int inum,jnum; - int *ilist,*jlist,*numneigh,**firstneigh; - - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - - if (use_ewald) choose(MPOLE_LONG); - else choose(MPOLE); - - int i,ii,j,jj; - for (ii = 0; ii < inum; ii++) { - i = ilist[ii]; - double xi = x[i][0]; - double yi = x[i][1]; - double zi = x[i][2]; - - jlist = firstneigh[i]; - jnum = numneigh[i]; - - for (jj = 0; jj < jnum; jj++) { - j = jlist[jj]; - double factor_mpole = special_mpole[sbmask15(j)]; - j &= NEIGHMASK15; - - double xr = x[j][0] - xi; - double yr = x[j][1] - yi; - double zr = x[j][2] - zi; - double r2 = xr*xr + yr*yr + zr*zr; - //if (r2 > off2) continue; - double r = sqrt(r2); - if (i == 0) printf("j = %d: tag = %d; r = %f; factor_mpole = %f\n", j, r, atom->tag[j], factor_mpole); - } - } - - - } // DEBUGGING - double evdwl; evdwl = 0.0; @@ -1014,8 +973,8 @@ void PairAmoeba::init_style() int irequest = neighbor->request(this,instance_me); // for DEBUGGING with GPU - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; + //neighbor->requests[irequest]->half = 0; + //neighbor->requests[irequest]->full = 1; // open debug output files // names are hard-coded diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4da0056029..1067969c7b 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -138,7 +138,7 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; gpu_udirect2b_ready = false; gpu_umutual2b_ready = false; - gpu_polar_real_ready = false; + gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } diff --git a/src/neighbor.cpp b/src/neighbor.cpp index 0ee4051d4b..dcb6a21b7c 100644 --- a/src/neighbor.cpp +++ b/src/neighbor.cpp @@ -527,6 +527,7 @@ void Neighbor::init() int flag=0; for (int isub=0; isub < ph->nstyles; ++isub) { if (force->pair_match("amoeba",0,isub) + || force->pair_match("hippo",0,isub) || force->pair_match("coul/wolf",0,isub) || force->pair_match("coul/dsf",0,isub) || force->pair_match("coul/exclude",0) @@ -537,6 +538,7 @@ void Neighbor::init() special_flag[1] = special_flag[2] = special_flag[3] = 2; } else { if (force->pair_match("amoeba",0) + || force->pair_match("hippo",0) || force->pair_match("coul/wolf",0) || force->pair_match("coul/dsf",0) || force->pair_match("coul/exclude",0) From 8d54547bc0693abb68bd6aa033a3375e6506846e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 00:50:33 -0500 Subject: [PATCH 059/181] Commented out debugging commands in the hippo kernels, added (numtyp) to numerics in hippo_extra, replaced fabs with explicit func --- lib/gpu/lal_hippo.cu | 2 - lib/gpu/lal_hippo_extra.h | 129 +++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 66 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 95f18db7d2..45361ed1fb 100644 --- 
a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1032,7 +1032,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass]; numtyp valk = polar6[j].x; - if (i == 0 && j < 10) printf("j = %d: r = %f; factor_mpole = %f\n", j, r, factor_mpole); // intermediates involving moments and separation distance numtyp dir = dix*xr + diy*yr + diz*zr; @@ -2166,7 +2165,6 @@ __kernel void k_special15(__global int * dev_nbor, int which = sj >> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; - if (i == 0 && j < 20) printf("GPU: j = %d; jtag = %d\n", j, jtag); if (!which) { int offset=ii; for (int k=0; k= 11) { r6 = r5 * r; @@ -168,12 +168,12 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, d3s = d3s * rr7; d4s = d4s * rr9; d5s = d5s * rr11; - dmpik[0] = 0.5 * pre * s * s; + dmpik[0] = (numtyp)0.5 * pre * s * s; dmpik[2] = pre * s * ds; dmpik[4] = pre * (s*d2s + ds*ds); - dmpik[6] = pre * (s*d3s + 3.0*ds*d2s); - dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s); - if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s); + dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); + dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } /* ---------------------------------------------------------------------- @@ -213,8 +213,9 @@ ucl_inline void damppole(const numtyp r, const int rorder, // compute tolerance and exponential damping factors - eps = 0.001; - diff = fabs(alphai-alphak); + eps = (numtyp)0.001; + diff = alphai-alphak; + if (diff < (numtyp)0) diff = -diff; dampi = alphai * r; dampk = alphak * r; expi = ucl_exp(-dampi); @@ -226,12 +227,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dampi3 = dampi * dampi2; dampi4 = dampi2 * dampi2; dampi5 = dampi2 * dampi3; - dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi; - dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi; - dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi; - dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi; - dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 4.0*dampi4/105.0 + dampi5/210.0)*expi; + dmpi[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampi)*expi; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; if (diff < eps) { dmpk[0] = dmpi[0]; dmpk[2] = dmpi[2]; @@ -243,12 +244,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dampk3 = dampk * dampk2; dampk4 = dampk2 * dampk2; dampk5 = dampk2 * dampk3; - dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk; - dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk; - dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk; - dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk; - dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + - 4.0*dampk4/105.0 + dampk5/210.0)*expk; + dmpk[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)0.5*dampk)*expk; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + 
dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; } // valence-valence charge penetration damping for Gordon f1 @@ -256,22 +257,22 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (diff < eps) { dampi6 = dampi3 * dampi3; dampi7 = dampi3 * dampi4; - dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + - dampi3/48.0)*expi; - dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + - 7.0*dampi3/48.0 + dampi4/48.0)*expi; - dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/144.0)*expi; - dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi; - dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + - dampi7/5040.0)*expi; + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dampi3/(numtyp)48.0)*expi; + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0)*expi; if (rorder >= 11) { dampi8 = dampi4 * dampi4; - dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + - dampi7/5040.0 + dampi8/45360.0)*expi; + dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; } } else { @@ -281,29 +282,29 @@ ucl_inline void damppole(const numtyp r, const int rorder, termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - - termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk; - dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - - termk2*(1.0+dampk+0.5*dampk2)*expk - - 2.0*termi2*termk*(1.0+dampi)*expi - - 2.0*termk2*termi*(1.0+dampk)*expk; - dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk; - dmpik[6] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + - dampi3/6.0 + dampi4/30.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk; - dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + - 4.0*dampi4/105.0 + dampi5/210.0)*expi - - termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + - 4.0*dampk4/105.0 + dampk5/210.0)*expk - - 2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + - 2.0*dampi3/21.0 + dampi4/105.0)*expi - - 2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + - 2.0*dampk3/21.0 + dampk4/105.0)*expk; + 
dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; if (rorder >= 11) { dampi6 = dampi3 * dampi3; @@ -311,12 +312,12 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + dampi6/(numtyp)1890.0)*expi - - termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + 5.0*dampk4/(numtyp)126.0 + + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + - dampi4/63.0 + dampi5/(numtyp)945.0)*expi - - (numtyp)2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + - dampk4/63.0 + dampk5/(numtyp)945.0)*expk; + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; } } } From e80eea56ba0c4548f4ffadf47529100d7d16179f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 14:59:39 -0500 Subject: [PATCH 060/181] Added udirect2b and umutual2b for hippo --- cmake/Modules/Packages/GPU.cmake | 3 + lib/gpu/Opencl.makefile | 3 + lib/gpu/lal_amoeba.cu | 10 +- lib/gpu/lal_hippo.cpp | 155 ++++++++++++++++++++++- lib/gpu/lal_hippo.cu | 211 ++++++++++++++++--------------- lib/gpu/lal_hippo.h | 34 +++++ lib/gpu/lal_hippo_ext.cpp | 8 +- lib/gpu/lal_hippo_extra.h | 105 ++++++++++++++- src/AMOEBA/amoeba_induce.cpp | 2 +- src/GPU/pair_hippo_gpu.cpp | 26 ++-- 10 files changed, 426 insertions(+), 131 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake 
b/cmake/Modules/Packages/GPU.cmake index 2b6977005d..cf5bcd2ea2 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -172,6 +172,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu + ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu ) foreach(GPU_KERNEL ${GPU_LIB_CU}) @@ -188,6 +189,7 @@ elseif(GPU_API STREQUAL "OPENCL") GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu) GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu) GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu) + GenerateOpenCLHeader(hippo ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_hippo.cu) list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h @@ -197,6 +199,7 @@ elseif(GPU_API STREQUAL "OPENCL") ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h + ${CMAKE_CURRENT_BINARY_DIR}/gpu/hippo_cl.h ) add_library(gpu STATIC ${GPU_LIB_SOURCES}) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 2ff98827d4..64a2161f85 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -74,6 +74,9 @@ $(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra. $(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; +$(OBJ_DIR)/hippo_cl.h: lal_hippo.cu $(PRE1_H) lal_hippo_extra.h + $(BSH) ./geryon/file_to_cstr.sh hippo $(PRE1_H) lal_hippo_extra.h lal_hippo.cu $(OBJ_DIR)/hippo_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index e4d129214a..1deb3e3bb5 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1064,7 +1064,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; - numtyp tdipdip[6]; // the following tdipdip is incorrect!! 
needs work to store tdipdip + numtyp tdipdip[6]; tdipdip[0] = -bcn[0] + bcn[1]*xr*xr; tdipdip[1] = bcn[1]*xr*yr; tdipdip[2] = bcn[1]*xr*zr; @@ -1233,10 +1233,10 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; - numtyp ck = polar1[j].x; // rpole[j][0]; - numtyp dkx = polar1[j].y; // rpole[j][1]; - numtyp dky = polar1[j].z; // rpole[j][2]; - numtyp dkz = polar1[j].w; // rpole[j][3]; + numtyp ck = pol1j.x; // rpole[j][0]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; const numtyp4 pol2j = polar2[j]; numtyp qkxx = pol2j.x; // rpole[j][4]; numtyp qkxy = pol2j.y; // rpole[j][5]; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d31370be73..caf910863f 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -489,6 +489,81 @@ int HippoT::multipole_real(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the permanent field +// --------------------------------------------------------------------------- +template +int** HippoT::compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + double* host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (inum_full>this->_max_fieldp_size) { + this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); + this->_fieldp.resize(this->_max_fieldp_size*8); + } + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + const int red_blocks=udirect2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // 
--------------------------------------------------------------------------- // Calculate the real-space permanent field, returning field and fieldp // --------------------------------------------------------------------------- @@ -518,7 +593,8 @@ int HippoT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -529,6 +605,80 @@ int HippoT::udirect2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute the direct real space part +// of the induced field +// --------------------------------------------------------------------------- +template +int** HippoT::compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer extra data from the host + // and build the neighbor lists if needed + + int** firstneigh = nullptr; + firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + // ------------------- Resize _fieldp array ------------------------ + + if (inum_full>this->_max_fieldp_size) { + this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); + this->_fieldp.resize(this->_max_fieldp_size*8); + } + *fieldp_ptr=this->_fieldp.host.begin(); + + this->_off2_polar = off2_polar; + this->_aewald = aewald; + const int red_blocks=umutual2b(eflag,vflag); + + // copy field and fieldp from device to host (_fieldp store both arrays, one after another) + + this->_fieldp.update_host(this->_max_fieldp_size*8,false); +/* + printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", + this->_fieldp.cols(), _max_fieldp_size); + for (int i = 0; i < 10; i++) { + numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); + printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); + } +*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; +} + // --------------------------------------------------------------------------- // Calculate the real-space induced field, returning field and fieldp // --------------------------------------------------------------------------- @@ -558,7 +708,8 @@ int 
HippoT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 45361ed1fb..487e852baf 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -16,7 +16,6 @@ #if defined(NV_KERNEL) || defined(USE_HIP) #include #include "lal_hippo_extra.h" -//#include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int #endif @@ -985,10 +984,10 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp qiyz = pol3i.x; // rpole[i][9]; numtyp qizz = pol3i.y; // rpole[i][12]; itype = pol3i.z; // amtype[i]; - iclass = coeff_amtype[itype].w; // amtype2class[itype]; - numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; - numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; + iclass = coeff_amtype[itype].w; // amtype2class[itype]; + numtyp corei = coeff_amclass[iclass].z; // pcore[iclass]; + numtyp alphai = coeff_amclass[iclass].w; // palpha[iclass]; numtyp vali = polar6[i].x; for ( ; nbor { const double aewald, const double felec, const double off2_mpole, double *charge, double *boxlo, double *prd, void **tep_ptr); + /// Compute the real space part of the permanent field (udirect2b) with device neighboring + virtual int** compute_udirect2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr); + + /// Compute the real space part of the induced field (umutual2b) with device neighboring + virtual int** compute_umutual2b(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_polar, + double *host_q, double *boxlo, double *prd, + void** fieldp_ptr); + /// Compute polar real-space with device neighboring virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 1851c3aba3..16b697d88f 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -157,7 +157,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int 
*host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, @@ -166,7 +166,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, @@ -184,7 +184,7 @@ int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index a06ac4425c..cacee4ae72 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -59,7 +59,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, // compute tolerance value for damping exponents eps = (numtyp)0.001; - diff = dmpi-dmpk; + diff = dmpi-dmpk; // fabs(dmpi-dmpk) if (diff < (numtyp)0) diff = -diff; // treat the case where alpha damping exponents are equal @@ -322,6 +322,109 @@ ucl_inline void damppole(const numtyp r, const int rorder, } } +/* ---------------------------------------------------------------------- + dampdir = direct field damping coefficents + dampdir generates coefficients for the direct field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ +ucl_inline void dampdir(numtyp r, numtyp alphai, numtyp alphak, numtyp *dmpi, numtyp *dmpk) +{ + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampk2; + numtyp dampi3,dampk3; + numtyp dampi4,dampk4; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // core-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + dampi4 = 
dampi2 * dampi2; + dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; + dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; + dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; + if (diff < eps) { + dmpk[2] = dmpi[2]; + dmpk[4] = dmpi[4]; + dmpk[6] = dmpi[6]; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + dampk4 = dampk2 * dampk2; + dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; + dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; + dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/30.0)*expk; + } +} + +/* ---------------------------------------------------------------------- + dampmut = mutual field damping coefficents + dampmut generates coefficients for the mutual field damping + function for powers of the interatomic distance +------------------------------------------------------------------------- */ + +ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) +{ + numtyp termi,termk; + numtyp termi2,termk2; + numtyp alphai2,alphak2; + numtyp eps,diff; + numtyp expi,expk; + numtyp dampi,dampk; + numtyp dampi2,dampi3; + numtyp dampi4,dampi5; + numtyp dampk2,dampk3; + + // compute tolerance and exponential damping factors + + eps = (numtyp)0.001; + diff = alphai-alphak; // fabs(alphai-alphak); + if (diff < (numtyp)0) diff = -diff; + dampi = alphai * r; + dampk = alphak * r; + expi = ucl_exp(-dampi); + expk = ucl_exp(-dampk); + + // valence-valence charge penetration damping for Gordon f1 (HIPPO) + + dampi2 = dampi * dampi; + dampi3 = dampi * dampi2; + if (diff < eps) { + dampi4 = dampi2 * dampi2; + dampi5 = dampi2 * dampi3; + dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + + 7.0*dampi3/48.0 + dampi4/48.0)*expi; + dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + + dampi4/24.0 + dampi5/144.0)*expi; + } else { + dampk2 = dampk * dampk; + dampk3 = dampk * dampk2; + alphai2 = alphai * alphai; + alphak2 = alphak * alphak; + termi = alphak2 / (alphak2-alphai2); + termk = alphai2 / (alphai2-alphak2); + termi2 = termi * termi; + termk2 = termk * termk; + dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi - + termk2*(1.0+dampk+0.5*dampk2)*expk - + 2.0*termi2*termk*(1.0+dampi)*expi - 2.0*termk2*termi*(1.0+dampk)*expk; + dmpik[4] = 1.0 - termi2*(1.0+dampi+0.5*dampi2 + dampi3/6.0)*expi - + termk2*(1.0+dampk+0.5*dampk2 + dampk3/6.00)*expk - + 2.0*termi2*termk *(1.0+dampi+dampi2/3.0)*expi - + 2.0*termk2*termi *(1.0+dampk+dampk2/3.0)*expk; + } +} #endif diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 5b855abdd0..617eb89fcd 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -1900,7 +1900,7 @@ void PairAmoeba::dampmut(double r, double alphai, double alphak, double *dmpik) ------------------------------------------------------------------------- */ void PairAmoeba::dampdir(double r, double alphai, double alphak, - double *dmpi, double *dmpk) + double dmpi[7], double dmpk[7]) { double eps,diff; double expi,expk; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1067969c7b..f4cbf28561 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -89,7 +89,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, 
double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -98,7 +98,7 @@ int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -136,8 +136,8 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_repulsion_ready = false; // true for HIPPO when ready gpu_dispersion_real_ready = true; // true for HIPPO when ready gpu_multipole_real_ready = true; - gpu_udirect2b_ready = false; - gpu_umutual2b_ready = false; + gpu_udirect2b_ready = true; + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -791,7 +791,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, + uind, uinp, pval, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, @@ -1015,14 +1015,14 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); + atom->type, amtype, amgroup, rpole, + uind, uinp, pval, sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success,aewald, off2, atom->q, + domain->boxlo, domain->prd, &fieldp_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From bf88ab77fa6f07a1a8ffa94c2154b268965ffe7f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 15:06:30 -0500 Subject: [PATCH 061/181] Cleaned up unused variables in kernel (to be continued) --- lib/gpu/lal_hippo.cu | 14 -------------- lib/gpu/lal_hippo_extra.h | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 487e852baf..f643f2b994 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1692,19 +1692,10 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, if (ii Date: Tue, 28 Sep 2021 17:28:33 -0500 Subject: [PATCH 062/181] Removed trailing spaces --- lib/gpu/lal_amoeba.cpp | 8 +- lib/gpu/lal_amoeba.cu | 80 ++++++++++---------- lib/gpu/lal_amoeba.h | 6 
+- lib/gpu/lal_base_amoeba.cpp | 32 ++++---- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_hippo.cpp | 22 +++--- lib/gpu/lal_hippo.cu | 130 ++++++++++++++++---------------- lib/gpu/lal_hippo.h | 6 +- lib/gpu/lal_hippo_ext.cpp | 4 +- lib/gpu/lal_hippo_extra.h | 146 ++++++++++++++++++------------------ src/GPU/Install.sh | 2 + src/GPU/pair_amoeba_gpu.cpp | 62 +++++++-------- src/GPU/pair_hippo_gpu.cpp | 64 ++++++++-------- 13 files changed, 283 insertions(+), 281 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 8d9af4706e..917166c423 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -140,7 +140,7 @@ void AmoebaT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -169,7 +169,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -194,7 +194,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // --------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -216,7 +216,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1deb3e3bb5..fdb959f3e2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -492,7 +492,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = jx.x - ix.x; numtyp yr = jx.y - ix.y; @@ -500,7 +500,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; //if (r2>off2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -533,12 +533,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -585,11 +585,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - 
qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -650,20 +650,20 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -691,12 +691,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -898,7 +898,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; - + scalek = factor_pscale; bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; @@ -918,7 +918,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1044,7 +1044,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp scale3 = (numtyp)1.0; numtyp scale5 = (numtyp)1.0; numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] @@ -1056,7 +1056,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, scale3 = (numtyp)1.0 - expdamp; scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp); } - + } else { // damp == 0: ??? 
} @@ -1071,17 +1071,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -bcn[0] + bcn[1]*yr*yr; tdipdip[4] = bcn[1]*yr*zr; tdipdip[5] = -bcn[0] + bcn[1]*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] += fid[1]; _fieldp[2] += fid[2]; @@ -1093,7 +1093,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -1383,7 +1383,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy3 = psr3*uky + dsr3*ukyp; numtyp tiz3 = psr3*ukz + dsr3*ukzp; numtyp tuir = -psr5*ukr - dsr5*ukrp; - + ufld[0] += tix3 + xr*tuir; ufld[1] += tiy3 + yr*tuir; ufld[2] += tiz3 + zr*tuir; @@ -1394,14 +1394,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp); numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp); tuir = -psr7*ukr - dsr7*ukrp; - + dufld[0] += xr*tix5 + xr*xr*tuir; dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir; dufld[2] += yr*tiy5 + yr*yr*tuir; dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir; dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - + // get the dEd/dR terms used for direct polarization force term1 = bn[2] - dsc3*rr5; @@ -1473,7 +1473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp frcz = depz; // get the dEp/dR terms used for direct polarization force - + // tixx and tkxx term1 = bn[2] - psc3*rr5; term2 = bn[3] - psc5*rr7; @@ -1550,7 +1550,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // get the dtau/dr terms used for mutual polarization force // poltyp == MUTUAL && amoeba - + term1 = bn[2] - usc3*rr5; term2 = bn[3] - usc5*rr7; term3 = usr5 + term1; @@ -1617,7 +1617,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -97,7 +97,7 @@ class Amoeba : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c4fdb8c9e5..3728fbe85e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -106,7 +106,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, 
_threads_per_atom); if (success!=0) return success; - + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,7 +121,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _maxspecial=maxspecial; _maxspecial15=maxspecial15; - // allocate per-atom array tep + // allocate per-atom array tep int ef_nall=nlocal; //nall; if (ef_nall==0) @@ -250,7 +250,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, + bool &success, const double aewald, const double felec, const double off2_polar, double *host_q, const int nlocal, double *boxlo, double *prd, void **tep_ptr) { acc_timers(); @@ -280,7 +280,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } *tep_ptr=_tep.host.begin(); @@ -320,7 +320,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f _off2_polar = off2_polar; _felec = felec; const int red_blocks=polar_real(eflag,vflag); - + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -375,7 +375,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_special15_t.clear(); dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -462,7 +462,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -509,7 +509,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -560,7 +560,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>_max_fieldp_size) { @@ -698,7 +698,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -745,7 +745,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = 
(numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -809,7 +809,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int idx = n+i*nstride; pextra[idx] = uinp[i][0]; pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx+2] = uinp[i][2]; } } @@ -818,7 +818,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx] = pval[i]; } } } @@ -846,7 +846,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_special15.set_function(*pair_program,"k_special15"); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); - + _compiled=true; #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) @@ -874,13 +874,13 @@ int BaseAmoebaT::add_onefive_neighbors() { int _nall=atom->nall(); int ainum=ans->inum(); int nbor_pitch=nbor->nbor_pitch(); - + k_special15.set_size(GX,BX); k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), &atom->dev_tag, &dev_nspecial15, &dev_special15, &ainum, &_nall, &nbor_pitch, &_threads_per_atom); - + return GX; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index fc665ec731..bd30fc3fbb 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -287,7 +287,7 @@ class BaseAmoeba { virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - + }; } diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index caf910863f..ac221f8376 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -145,7 +145,7 @@ void HippoT::clear() { coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); - + this->clear_atomic(); } @@ -199,7 +199,7 @@ int** HippoT::precompute(const int ago, const int inum_full, const int nall, this->dev_special15_t.clear(); this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); } if (inum_full==0) { @@ -286,7 +286,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -339,7 +339,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_disp, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -397,7 +397,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the 
kernels are ready, precompute() is needed only once @@ -468,7 +468,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // Build the short neighbor list for the cutoff off2_mpole, // at this point mpole is the first kernel in a time step - + this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -537,7 +537,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - + // ------------------- Resize _fieldp array ------------------------ if (inum_full>this->_max_fieldp_size) { @@ -569,7 +569,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { - int ainum=this->ans->inum(); + int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -591,7 +591,7 @@ int HippoT::udirect2b(const int eflag, const int vflag) { &nbor_pitch, &this->_threads_per_atom); this->short_nbor_polar_avail = true; } - + this->k_udirect2b.set_size(GX,BX); this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &coeff_amclass, &sp_polar, @@ -756,7 +756,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed - // NOTE: + // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once @@ -803,7 +803,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, numtyp4* p = (numtyp4*)(&this->_tep[4*i]); printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); } -*/ +*/ return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f643f2b994..b282586efb 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -491,7 +491,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; //int jtype=jx.w; - + // Compute r12 numtyp xr = ix.x - jx.x; numtyp yr = ix.y - jx.y; @@ -499,7 +499,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp r2 = xr*xr + yr*yr + zr*zr; if (r2>off2) continue; - + const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -514,7 +514,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; int jtype = pol3j.z; // amtype[j]; - + numtyp sizk = coeff[jtype].x; // sizpr[jtype]; numtyp dmpk = coeff[jtype].y; // dmppr[jtype]; numtyp valk = coeff[jtype].z; // elepr[jtype]; @@ -534,12 +534,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -586,11 
+586,11 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -616,7 +616,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; numtyp term5 = qir*qkr; - numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + + numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; // compute the Pauli repulsion energy for this interaction @@ -626,7 +626,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // calculate intermediate terms for force and torque - numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + + numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + term4*dmpik[8] + term5*dmpik[10]; term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6]; term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6]; @@ -637,23 +637,23 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; // compute the torque components for this interaction - - numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - + + numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); ttmix = sizik * ttmix * rr1; ttmiy = sizik * ttmiy * rr1; @@ -706,7 +706,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + int jtype = polar3[j].z; // amtype[j]; int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; @@ -816,7 +816,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp dk = ak * r; numtyp expi = ucl_exp(-di); numtyp expk = 
ucl_exp(-dk); - + numtyp ai2,ak2; numtyp di4,di5; numtyp dk2,dk3; @@ -844,7 +844,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk; - ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + + ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0); } else { @@ -856,7 +856,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3; - + // apply damping and scaling factors for this interaction numtyp scale = factor_disp * damp*damp; @@ -892,7 +892,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, virial[4] += vzx; virial[5] += vzy; } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -1043,12 +1043,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; - + numtyp dik = dix*dkx + diy*dky + diz*dkz; numtyp qik = qix*qkx + qiy*qky + qiz*qkz; numtyp diqk = dix*qkx + diy*qky + diz*qkz; numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz; - numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + + numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + qixx*qkxx + qiyy*qkyy + qizz*qkzz; // additional intermediates involving moments and distance @@ -1095,11 +1095,11 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; - numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - + numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz); - numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - + numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz); - numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - + numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz); // get reciprocal distance terms for this interaction @@ -1164,16 +1164,16 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11; rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; - numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + - term1i*rr1i + term1k*rr1k + term1ik*rr1ik + - term2i*rr3i + term2k*rr3k + term2ik*rr3ik + + numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + + term1i*rr1i + term1k*rr1k + term1ik*rr1ik + + term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; // find damped multipole intermediates for force and torque - numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + - term1i*rr3i + term1k*rr3k + term1ik*rr3ik + - term2i*rr5i + term2k*rr5k + term2ik*rr5ik + + numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + + term1i*rr3i + term1k*rr3k + term1ik*rr3ik + + term2i*rr5i + term2k*rr5k + term2ik*rr5ik + term3i*rr7i + term3k*rr7k + term3ik*rr7ik; term1 = 
-corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik; term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik; @@ -1187,20 +1187,20 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, // compute the force components for this interaction - numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + + numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + term4*qix + term5*qkx + term6*(qixk+qkxi); - numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + + numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + term4*qiy + term5*qky + term6*(qiyk+qkyi); - numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + + numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); // compute the torque components for this interaction - numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - + numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - term4*qirx - term6*(qikrx+qikx); - numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - + numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - term4*qiry - term6*(qikry+qiky); - numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - + numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - term4*qirz - term6*(qikrz+qikz); // increment force-based gradient and torque on first site @@ -1228,12 +1228,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1408,7 +1408,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, // find the field components for charge penetration damping numtyp dmpi[7],dmpk[7]; dampdir(r,alphai,alphak,dmpi,dmpk); - + numtyp scalek = factor_dscale; numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3; numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5; @@ -1439,7 +1439,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, rr3k*dky + (numtyp)2.0*rr5k*qky; fip[2] = -zr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkz + (numtyp)2.0*rr5k*qkz; - + // find terms needed later to compute mutual polarization _fieldp[0] += fid[0]; @@ -1453,7 +1453,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1595,7 +1595,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // find terms needed later to compute mutual polarization - // if (poltyp != DIRECT) + // if (poltyp != DIRECT) numtyp dmpik[5]; dampmut(r,alphai,alphak,dmpik); numtyp scalek = factor_wscale; @@ -1610,17 +1610,17 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -rr3ik + rr5ik*yr*yr; tdipdip[4] = rr5ik*yr*zr; tdipdip[5] = -rr3ik + rr5ik*zr*zr; - //if (i==0 && j == 10) + //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; - + fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; - + _fieldp[0] += fid[0]; _fieldp[1] 
+= fid[1]; _fieldp[2] += fid[2]; @@ -1632,7 +1632,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } // iioff2) continue; - + numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1905,7 +1905,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, // get the field gradient for direct polarization force - + numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k; numtyp term1core; @@ -1987,7 +1987,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i; tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k + dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k; - + term2i = rr5i*xr; term1i = zr * term2i; term1core = rr5core*xr*zr; @@ -2039,7 +2039,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcx = (numtyp)-2.0 * depx; numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; - + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx @@ -2108,7 +2108,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, virial[5] += vyz; } } // nbor - + } // ii { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: + /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, /// sp_polar.z = special_polar_piscale /// sp_polar.w = special_mpole UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: + /// Special nonpolar values [0-4]: /// sp_nonpolar.x = special_hal /// sp_nonpolar.y special_repel /// sp_nonpolar.z = special_disp @@ -184,7 +184,7 @@ class Hippo : public BaseAmoeba { int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); int polar_real(const int eflag, const int vflag); - + }; } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 16b697d88f..982cf894a6 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -129,7 +129,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { + double *host_q, double *boxlo, double *prd) { return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint** special15, const bool eflag, const bool vflag, diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index 61bfebc17f..0b8f96f69b 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -116,46 +116,46 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + 
(dampk+tmp)*expi; - ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + + ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - - ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + - ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - + ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + + ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - - (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - - (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + - (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + - ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + + d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - + (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - + (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + + (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + + ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi; - d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + - dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - - ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - - ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - - (numtyp)4.0*dmpi2*dmpk2/term) * expk + - (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + - dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + - ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + - ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + + d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + + dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - + ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - + ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - + (numtyp)4.0*dmpi2*dmpk2/term) * expk + + (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + + dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + + ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + + ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi; - + if (rorder >= 11) { r6 = r5 * r; dmpi26 = dmpi25 * dmpi2; dmpk26 = dmpk25 * dmpk2; - d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + - dmpi2*dmpk23*r4/21.0 + 
dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - - (4.0/945.0)*dmpi2*dmpk26*r5/term - - (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - - (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - - 4.0*dmpi2*dmpk2/term) * expk + - (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + - dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + - (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + - (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + + d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + + dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - + (4.0/945.0)*dmpi2*dmpk26*r5/term - + (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - + (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - + 4.0*dmpi2*dmpk2/term) * expk + + (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + + dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + + (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + + (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi; } } @@ -214,7 +214,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, // compute tolerance and exponential damping factors eps = (numtyp)0.001; - diff = alphai-alphak; + diff = alphai-alphak; if (diff < (numtyp)0) diff = -diff; dampi = alphai * r; dampk = alphak * r; @@ -231,7 +231,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi; dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi; dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi; - dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi; if (diff < eps) { dmpk[0] = dmpi[0]; @@ -248,7 +248,7 @@ ucl_inline void damppole(const numtyp r, const int rorder, dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk; dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk; dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk; - dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk; } @@ -257,21 +257,21 @@ ucl_inline void damppole(const numtyp r, const int rorder, if (diff < eps) { dampi6 = dampi3 * dampi3; dampi7 = dampi3 * dampi4; - dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + + dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + dampi3/(numtyp)48.0)*expi; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; - dmpik[6] = 
(numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi; - dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0)*expi; if (rorder >= 11) { dampi8 = dampi4 * dampi4; dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + + dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi; } @@ -282,41 +282,41 @@ ucl_inline void damppole(const numtyp r, const int rorder, termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - + dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk; dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk; - dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + - dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - - termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - + dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - + termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk; - dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + - (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + - (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + + dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 
+ + (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + + (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + + (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk; - + if (rorder >= 11) { dampi6 = dampi3 * dampi3; dampk6 = dampk3 * dampk3; - dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + - (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + - dampi6/(numtyp)1890.0)*expi - - termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + - (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - - (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + - dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - - (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + + dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + + dampi6/(numtyp)1890.0)*expi - + termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + + (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - + (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + + dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - + (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk; } } @@ -404,9 +404,9 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) if (diff < eps) { dampi4 = dampi2 * dampi2; dampi5 = dampi2 * dampi3; - dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + + dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi; - dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + + dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi; } else { dampk2 = dampk * dampk; @@ -417,12 +417,12 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5]) termk = alphai2 / (alphai2-alphak2); termi2 = termi * termi; termk2 = termk * termk; - dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - + dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk; - dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - - termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - - (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - + dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - + termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - + (numtyp)2.0*termi2*termk 
*((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk; } } diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 9e231663c0..9da06cf636 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -91,6 +91,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp action pair_gauss_gpu.h pair_gauss.h action pair_gayberne_gpu.cpp pair_gayberne.cpp action pair_gayberne_gpu.h pair_gayberne.cpp +action pair_hippo_gpu.cpp pair_hippo.cpp +action pair_hippo_gpu.h pair_hippo.cpp action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp action pair_lj96_cut_gpu.h pair_lj96_cut.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 91bc679447..e1fe1f1097 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -76,7 +76,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -86,7 +86,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -170,7 +170,7 @@ void PairAmoebaGPU::init_style() maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -207,7 +207,7 @@ void PairAmoebaGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -239,7 +239,7 @@ void PairAmoebaGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -281,7 +281,7 @@ void PairAmoebaGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -317,7 +317,7 @@ void PairAmoebaGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. 
udirect2b) is CPU-only @@ -345,7 +345,7 @@ void PairAmoebaGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -353,7 +353,7 @@ void PairAmoebaGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? // since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -460,7 +460,7 @@ void PairAmoebaGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -486,7 +486,7 @@ void PairAmoebaGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -574,7 +574,7 @@ void PairAmoebaGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -629,7 +629,7 @@ void PairAmoebaGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -644,7 +644,7 @@ void PairAmoebaGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"AMOEBA induced dipoles did not converge"); @@ -652,7 +652,7 @@ void PairAmoebaGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -700,7 +700,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -753,7 +753,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -764,7 +764,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -802,7 +802,7 @@ void PairAmoebaGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? 
- + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -829,13 +829,13 @@ void PairAmoebaGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -844,7 +844,7 @@ void PairAmoebaGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -872,7 +872,7 @@ void PairAmoebaGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -891,7 +891,7 @@ void PairAmoebaGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -902,7 +902,7 @@ void PairAmoebaGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -973,7 +973,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1001,7 +1001,7 @@ void PairAmoebaGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1033,7 +1033,7 @@ void PairAmoebaGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1091,11 +1091,11 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index f4cbf28561..fbc1b6b238 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -88,7 +88,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, + double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -185,7 +185,7 @@ void PairHippoGPU::init_style() 
maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } - + int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, @@ -222,7 +222,7 @@ void PairHippoGPU::dispersion_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -250,7 +250,7 @@ void PairHippoGPU::dispersion_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, off2, atom->q, domain->boxlo, domain->prd); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); } @@ -270,7 +270,7 @@ void PairHippoGPU::multipole_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -302,7 +302,7 @@ void PairHippoGPU::multipole_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -344,7 +344,7 @@ void PairHippoGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - + if (use_ewald) { choose(POLAR_LONG); int nmine = p_kspace->nfft_owned; @@ -380,7 +380,7 @@ void PairHippoGPU::induce() memory->create(usump,nlocal,3,"ameoba/induce:usump"); // get the electrostatic field due to permanent multipoles - + dfield0c(field,fieldp); // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only @@ -408,7 +408,7 @@ void PairHippoGPU::induce() for (i = 0; i < 10; i++) { printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); + udirp[i][0], udirp[i][1], udirp[i][2]); } */ // get induced dipoles via the OPT extrapolation method @@ -416,7 +416,7 @@ void PairHippoGPU::induce() // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
// since no need to store optorder+1 values after these loops - if (poltyp == OPT) { + if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; @@ -523,7 +523,7 @@ void PairHippoGPU::induce() crstyle = FIELD; comm->reverse_comm_pair(this); } - + //error->all(FLERR,"STOP GPU"); // set initial conjugate gradient residual and conjugate vector @@ -549,7 +549,7 @@ void PairHippoGPU::induce() cfstyle = RSD; comm->forward_comm_pair(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -637,7 +637,7 @@ void PairHippoGPU::induce() if (pcgprec) { cfstyle = RSD; comm->forward_comm_pair(this); - uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); + uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm_pair(this); } @@ -692,7 +692,7 @@ void PairHippoGPU::induce() if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles - + if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; @@ -707,7 +707,7 @@ void PairHippoGPU::induce() // terminate the calculation if dipoles failed to converge // NOTE: could make this an error - + if (iter >= maxiter || eps > epsold) if (me == 0) error->warning(FLERR,"hippo induced dipoles did not converge"); @@ -715,7 +715,7 @@ void PairHippoGPU::induce() // DEBUG output to dump file - if (uind_flag) + if (uind_flag) dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp); // deallocation of arrays @@ -763,7 +763,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) PairAmoeba::udirect2b(field, fieldp); return; } - + int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; int inum, host_start; @@ -816,7 +816,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -827,7 +827,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - + } /* ---------------------------------------------------------------------- @@ -865,7 +865,7 @@ void PairHippoGPU::udirect2b_cpu() firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? 
- + aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); @@ -892,13 +892,13 @@ void PairHippoGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; ddi = dirdamp[itype]; - + // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; - + xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; @@ -907,7 +907,7 @@ void PairHippoGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - + factor_wscale = special_polar_wscale[sbmask15(jextra)]; if (igroup == jgroup) { factor_pscale = special_polar_piscale[sbmask15(jextra)]; @@ -935,7 +935,7 @@ void PairHippoGPU::udirect2b_cpu() aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } - + // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { @@ -954,7 +954,7 @@ void PairHippoGPU::udirect2b_cpu() scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; - + neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; @@ -965,7 +965,7 @@ void PairHippoGPU::udirect2b_cpu() } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } - + } // jj firstneigh_dipole[i] = neighptr; @@ -1036,7 +1036,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) int idx = 4*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; + field[i][2] += field_ptr[idx+2]; } double* fieldp_ptr = (double *)fieldp_pinned; @@ -1064,7 +1064,7 @@ void PairHippoGPU::polar_real() bool success = true; int *ilist, *numneigh, **firstneigh; - + double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1096,7 +1096,7 @@ void PairHippoGPU::polar_real() host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -1156,11 +1156,11 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; - vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); - vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); - vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] += vxx; From 98a2b6729299574aea7fb29f5f4f6fc3a253dce2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 17:39:55 -0500 Subject: [PATCH 063/181] Changed to the API of BaseAmoeba to reduce duplicates in hippo --- lib/gpu/lal_amoeba_ext.cpp | 27 +++++---------------------- lib/gpu/lal_base_amoeba.cpp | 18 +++++++++--------- lib/gpu/lal_base_amoeba.h | 10 +++++----- 3 files changed, 19 insertions(+), 36 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 565f16b627..b73f6c4ca6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -116,24 +116,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas void amoeba_gpu_clear() { AMOEBAMF.clear(); } -/* -int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full, 
- const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { - return AMOEBAMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd); -} -*/ + int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -145,7 +128,7 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, + host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); @@ -163,7 +146,7 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -181,7 +164,7 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, bool &success, const double aewald, const double off2, double *host_q, double *boxlo, double *prd, void **fieldp_ptr) { return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); @@ -199,7 +182,7 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, + host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3728fbe85e..7322dde5df 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ 
b/lib/gpu/lal_base_amoeba.cpp @@ -341,7 +341,7 @@ template int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -433,7 +433,7 @@ template int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -474,7 +474,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, sublo, subhi, tag, + nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -522,7 +522,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -555,7 +555,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -596,7 +596,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -629,7 +629,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, @@ -669,7 +669,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -710,7 +710,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, 
host_x, host_type, host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, sublo, subhi, tag, + host_uind, host_uinp, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index bd30fc3fbb..accb9a5900 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -134,7 +134,7 @@ class BaseAmoeba { virtual int** precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *sublo, double *subhi, + double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -145,7 +145,7 @@ class BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, + int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, @@ -158,7 +158,7 @@ class BaseAmoeba { virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -172,7 +172,7 @@ class BaseAmoeba { virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, + double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, @@ -186,7 +186,7 @@ class BaseAmoeba { virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *sublo, double *subhi, + double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag, const bool vflag, From 6286a119b354f31238bb0026fc44440fda7d6335 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:12:07 -0500 Subject: [PATCH 064/181] Removed precompute() in hippo --- lib/gpu/lal_base_amoeba.cpp | 4 +- lib/gpu/lal_hippo.cpp | 130 +++++------------------------------- lib/gpu/lal_hippo.cu | 36 +++++----- lib/gpu/lal_hippo.h | 12 ---- 4 files changed, 38 insertions(+), 144 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 7322dde5df..8b002c27e6 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -399,12 +399,12 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, 
host_rpole, host_uind, host_uinp); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index ac221f8376..5a6ac20633 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -154,120 +154,24 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } -// --------------------------------------------------------------------------- -// Prepare for multiple kernel calls in a time step: -// - reallocate per-atom arrays, if needed -// - transfer extra data from host to device -// - build the full neighbor lists for use by different kernels -// --------------------------------------------------------------------------- - -template -int** HippoT::precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - - // ------------------- Resize 1-5 neighbor arrays ------------------------ - - if (nall>this->_nmax) { - this->_nmax = nall; - this->dev_nspecial15.clear(); - this->dev_special15.clear(); - this->dev_special15_t.clear(); - this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - } - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return nullptr; - } - - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - this->_max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - success); - if (!success) - return nullptr; - this->atom->cast_q_data(host_q); - this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->atom->cast_q_data(host_q); - this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - this->atom->add_q_data(); - 
this->atom->add_extra_data(); - - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); - - this->device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, - boxlo, prd); - - // re-allocate dev_short_nbor if necessary - if (inum_full*(2+this->_max_nbors) > this->dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(inum_full)*1.10); - this->dev_short_nbor.resize((2+this->_max_nbors)*this->_nmax); - } - - return this->nbor->host_jlist.begin()-host_start; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template int** HippoT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_disp, + double *host_q, double *boxlo, double *prd) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -296,7 +200,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -407,7 +311,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. 
int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -530,7 +434,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -645,7 +549,7 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, @@ -766,7 +670,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index b282586efb..2e62d0703e 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -410,21 +410,21 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_nonpolar, - const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict ans, - __global acctyp *restrict engv, - __global acctyp4 *restrict tep, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - const int t_per_atom, const numtyp aewald, - const numtyp off2, const numtyp cut2, - const numtyp c0, const numtyp c1, const numtyp c2, - const numtyp c3, const numtyp c4, const numtyp c5) + const __global numtyp *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict sp_nonpolar, + const __global int *dev_nbor, + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + __global acctyp4 *restrict tep, + const int eflag, const int vflag, const int inum, + const int nall, const int nbor_pitch, + const int t_per_atom, const numtyp aewald, + const numtyp off2, const numtyp cut2, + const numtyp c0, const numtyp c1, const numtyp c2, + const numtyp c3, const numtyp c4, const numtyp c5) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); @@ -895,9 +895,11 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // ii { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); - /// Reallocate per-atom arrays if needed, and build neighbor lists once, if needed - int** 
precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double* host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **&ilist, int **&numj, const double cpu_time, bool &success, - double *charge, double *boxlo, double *prd); - /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, From b95508125b36cc004c62385ad0391bf25fd7c01d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:24:34 -0500 Subject: [PATCH 065/181] Adding the repulsion kernel for hippo --- lib/gpu/lal_hippo.cpp | 113 +++++++++++++++++++++++++++++++++++++++++- lib/gpu/lal_hippo.h | 18 ++++++- 2 files changed, 129 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 5a6ac20633..9a45ea6fc8 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -36,7 +36,9 @@ HippoT::Hippo() : BaseAmoeba(), template HippoT::~Hippo() { clear(); + k_repulsion.clear(); k_dispersion.clear(); + } template @@ -71,6 +73,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, return success; // specific to HIPPO + k_repulsion.set_function(*(this->pair_program),"k_hippo_repulsion"); k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion"); // If atom type constants fit in shared memory use fast kernel @@ -154,10 +157,118 @@ double HippoT::host_memory_usage() const { return this->host_memory_usage_atomic()+sizeof(Hippo); } +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary, and then compute repulsion +// --------------------------------------------------------------------------- +template +int** HippoT::compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd) { + this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); + + // reallocate per-atom arrays, transfer data from the host + // and build the neighbor lists if needed + // NOTE: + // For now we invoke precompute() again here, + // to be able to turn on/off the udirect2b kernel (which comes before this) + // Once all the kernels are ready, precompute() is needed only once + // in the first kernel in a time step. + // We only need to cast uind and uinp from host to device here + // if the neighbor lists are rebuilt and other per-atom arrays + // (x, type, amtype, amgroup, rpole) are ready on the device. 
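(For reference, the eflag/vflag setup at the top of this routine follows the convention used by all of the compute_* entry points in this series: 0 means that quantity is not accumulated, 1 means global energy/virial accumulation only, and 2 means per-atom accumulation; building with LAL_NO_BLOCK_REDUCE promotes any nonzero flag to 2. A small self-contained sketch of that encoding, illustrative only and not part of the patch:

    #include <cstdio>

    // 0 = off, 1 = global accumulation only, 2 = per-atom accumulation
    static int accumulation_flag(bool per_atom, bool global_requested) {
      if (per_atom) return 2;
      if (global_requested) return 1;
      return 0;
    }

    int main() {
      printf("eflag=%d vflag=%d\n",
             accumulation_flag(false, true),  // global energy only -> 1
             accumulation_flag(true, true));  // per-atom virial requested -> 2
      return 0;
    }
)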
+ + int** firstneigh = nullptr; + firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); + + this->_off2_repulse = off2_repulse; + this->_aewald = aewald; + const int red_blocks=repulsion(eflag,vflag); + + // only copy them back if this is the last kernel + // otherwise, commenting out these two lines to leave the answers + // (forces, energies and virial) on the device until the last kernel + //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + //this->device->add_ans_object(this->ans); + + this->hd_balancer.stop_timer(); + + return firstneigh; // nbor->host_jlist.begin()-host_start; +} + +// --------------------------------------------------------------------------- +// Calculate the repulsion term, returning tep +// --------------------------------------------------------------------------- +template +int HippoT::repulsion(const int eflag, const int vflag) { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->time_pair.start(); + + // Build the short neighbor list for the cutoff off2_disp, + // at this point mpole is the first kernel in a time step + + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->_off2_repulse, &ainum, + &nbor_pitch, &this->_threads_per_atom); + + k_repulsion.set_size(GX,BX); + k_repulsion.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_repulse); + this->time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- - template int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 78f85db7df..17e3a1b03f 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -54,6 +54,21 @@ class Hippo : public BaseAmoeba { const double gpu_split, FILE *_screen, const double polar_dscale, const double polar_uscale); + /// Compute repulsion with device neighboring + int** compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd); + /// Compute dispersion real-space 
with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -163,10 +178,11 @@ class Hippo : public BaseAmoeba { numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; - UCL_Kernel k_dispersion; + UCL_Kernel k_repulsion, k_dispersion; protected: bool _allocated; + int repulsion(const int eflag, const int vflag); int dispersion_real(const int eflag, const int vflag); int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); From 17edd797a7a57ee227ab44cba1861f4bbbdec798 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 28 Sep 2021 23:42:04 -0500 Subject: [PATCH 066/181] Adding API for the repulsion term to hippo/gpu --- lib/gpu/lal_hippo.cpp | 16 +++++++-- lib/gpu/lal_hippo.h | 2 +- lib/gpu/lal_hippo_ext.cpp | 17 +++++++++ src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 70 ++++++++++++++++++++++++++++++++++++++ src/GPU/pair_hippo_gpu.h | 1 + 6 files changed, 104 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9a45ea6fc8..80762b55aa 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -173,7 +173,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd) { + double *host_q, double *boxlo, double *prd, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -210,6 +210,14 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); + // ------------------- Resize _tep array ------------------------ + + if (inum_full>this->_max_tep_size) { + this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); + this->_tep.resize(this->_max_tep_size*4); + } + *tep_ptr=this->_tep.host.begin(); + this->_off2_repulse = off2_repulse; this->_aewald = aewald; const int red_blocks=repulsion(eflag,vflag); @@ -222,6 +230,10 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, this->hd_balancer.stop_timer(); + // copy tep from device to host + + this->_tep.update_host(this->_max_tep_size*4,false); + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -257,7 +269,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { &coeff_amtype, &coeff_amclass, &sp_nonpolar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, + &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, &this->_off2_repulse); diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 17e3a1b03f..374ca5d836 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -67,7 +67,7 @@ class Hippo : public BaseAmoeba { int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd); + double *host_q, double *boxlo, double *prd, void** tep_ptr); /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 982cf894a6..2f1a800589 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -120,6 +120,23 @@ 
void hippo_gpu_clear() { HIPPOMF.clear(); } +int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr) { + return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, sublo, subhi, + tag, nspecial, special, nspecial15, special15, + eflag, vflag, eatom, vatom, host_start, ilist, jnum, + cpu_time, success, aewald, off2, host_q, boxlo, prd, tep_ptr); +} + int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 8a2f09d443..5ba7aae981 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -343,7 +343,7 @@ class PairAmoeba : public Pair { void hal(); - void repulsion(); + virtual void repulsion(); void damprep(double, double, double, double, double, double, double, double, int, double, double, double *); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index fbc1b6b238..4852f75e08 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -66,6 +66,17 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double polar_dscale, const double polar_uscale, int& tq_size); void hippo_gpu_clear(); +int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, void **tep_ptr); + int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -209,6 +220,65 @@ void PairHippoGPU::init_style() /* ---------------------------------------------------------------------- */ +void PairHippoGPU::repulsion() +{ + if (!gpu_repulsion_ready) { + PairAmoeba::repulsion(); + return; + } + + int eflag=1, vflag=1; + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double sublo[3],subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); + } + inum = atom->nlocal; + + // select the correct cutoff for the term + + choose(REPULSE); + + // set the energy unit conversion factor for multipolar real-space calculation + + firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, 
inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); + + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + // reference to the tep array from GPU lib + + if (tq_single) { + float *tq_ptr = (float *)tq_pinned; + compute_force_from_torque(tq_ptr, frepulse, virrepulse); + } else { + double *tq_ptr = (double *)tq_pinned; + compute_force_from_torque(tq_ptr, frepulse, virrepulse); + } +} + +/* ---------------------------------------------------------------------- */ + void PairHippoGPU::dispersion_real() { if (!gpu_dispersion_real_ready) { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 9e961045eb..c7a4e75ebe 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -35,6 +35,7 @@ class PairHippoGPU : public PairAmoeba { virtual void induce(); + virtual void repulsion(); virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); From 4be44c386f2408cb68f7816d1a80544ba6a73b59 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 09:40:33 -0500 Subject: [PATCH 067/181] Added necessary arguments to the hippo repulsion kernel --- lib/gpu/lal_hippo.cpp | 30 +++++++++++++++++++++++++----- lib/gpu/lal_hippo.cu | 14 +++++++------- lib/gpu/lal_hippo.h | 30 ++++++++++++++++++------------ lib/gpu/lal_hippo_ext.cpp | 10 ++++++++-- src/GPU/pair_hippo_gpu.cpp | 11 ++++++++--- 5 files changed, 66 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 80762b55aa..6830847e98 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -57,6 +57,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -99,6 +100,16 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(coeff_amtype,host_write,false); + for (int i = 0; i < max_amtype; i++) { + host_write[i].x = host_sizpr[i]; + host_write[i].y = host_dmppr[i]; + host_write[i].z = host_elepr[i]; + host_write[i].w = (numtyp)0; + } + + coeff_rep.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff_rep,host_write,false); + UCL_H_Vec host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < max_amclass; i++) { host_write2[i].x = host_csix[i]; @@ -133,7 +144,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + coeff_amclass.row_bytes() + + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -145,6 +156,7 @@ void HippoT::clear() { _allocated=false; coeff_amtype.clear(); + coeff_rep.clear(); coeff_amclass.clear(); sp_polar.clear(); sp_nonpolar.clear(); @@ -173,7 
+185,9 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -219,7 +233,13 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, *tep_ptr=this->_tep.host.begin(); this->_off2_repulse = off2_repulse; - this->_aewald = aewald; + _cut2 = cut2; + _c0 = c0; + _c1 = c1; + _c2 = c2; + _c3 = c3; + _c4 = c4; + _c5 = c5; const int red_blocks=repulsion(eflag,vflag); // only copy them back if this is the last kernel @@ -266,13 +286,13 @@ int HippoT::repulsion(const int eflag, const int vflag) { k_repulsion.set_size(GX,BX); k_repulsion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &coeff_rep, &sp_nonpolar, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, - &this->_off2_repulse); + &this->_off2_repulse, &_cut2, &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 2e62d0703e..1b6344a163 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -411,7 +411,7 @@ _texture( q_tex,int2); __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, - const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff_rep, const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, const __global int *dev_packed, @@ -480,9 +480,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qiyz = pol3i.x; // rpole[i][9]; numtyp qizz = pol3i.y; // rpole[i][12]; int itype = pol3i.z; // amtype[i]; - numtyp sizi = coeff[itype].x; // sizpr[itype]; - numtyp dmpi = coeff[itype].y; // dmppr[itype]; - numtyp vali = coeff[itype].z; // elepr[itype]; + numtyp sizi = coeff_rep[itype].x; // sizpr[itype]; + numtyp dmpi = coeff_rep[itype].y; // dmppr[itype]; + numtyp vali = coeff_rep[itype].z; // elepr[itype]; for ( ; nbor { const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -56,18 +57,20 @@ class Hippo : public BaseAmoeba { /// Compute repulsion with device neighboring int** compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, void** tep_ptr); + const int nall, double **host_x, + int *host_type, int *host_amtype, + int 
*host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring int** compute_dispersion_real(const int ago, const int inum_full, const int nall, @@ -157,6 +160,8 @@ class Hippo : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + UCL_D_Vec coeff_rep; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale /// sp_polar.y special_polar_pscale, @@ -175,6 +180,7 @@ class Hippo : public BaseAmoeba { /// Number of atom types int _lj_types; + numtyp _cut2,_c0,_c1,_c2,_c3,_c4,_c5; numtyp _polar_dscale, _polar_uscale; numtyp _qqrd2e; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 2f1a800589..15cb53cdb1 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -37,6 +37,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -74,6 +75,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, host_csix, host_adisp, host_pcore, host_palpha, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -99,6 +101,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, host_csix, host_adisp, host_pcore, host_palpha, nlocal, nall, max_nbors, maxspecial, maxspecial15, cell_size, gpu_split, @@ -129,12 +132,15 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, tep_ptr); + cpu_time, success, aewald, off2, host_q, boxlo, prd, + cut2, c0, c1, c2, c3, c4, c5, tep_ptr); } int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, diff --git 
a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4852f75e08..d6a16c72fb 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -58,6 +58,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, const double *host_csix, const double *host_adisp, const double *host_pcore, const double *host_palpha, const int nlocal, const int nall, const int max_nbors, @@ -75,7 +76,9 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr); + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr); int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, @@ -203,7 +206,8 @@ void PairHippoGPU::init_style() pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, - special_polar_pscale, csix, adisp, pcore, palpha, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale, tq_size); @@ -261,7 +265,8 @@ void PairHippoGPU::repulsion() eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From 01381b7f54ac6d01f48444ec00997cacff775dbe Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 11:57:25 -0500 Subject: [PATCH 068/181] Fixed bugs in the repulsion kernel, now working correctly with the double precision mode --- lib/gpu/lal_base_amoeba.cpp | 2 +- lib/gpu/lal_hippo.cu | 22 +++++++++++++--------- lib/gpu/lal_hippo_extra.h | 30 +++++++++++++++--------------- src/GPU/pair_hippo_gpu.cpp | 2 +- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 8b002c27e6..5d1b7016da 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -78,7 +78,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool rot = false; bool vel = false; _extra_fields = 24; // round up to accomodate quadruples of numtyp values - // rpole 13; uind 3; uinp 3; amtype, amgroup + // rpole 13; uind 3; uinp 3; amtype, amgroup; pval int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1b6344a163..bf63652a47 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -493,9 +493,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, //int jtype=jx.w; // Compute r12 - numtyp xr = ix.x - jx.x; - numtyp yr = ix.y - jx.y; - numtyp zr = ix.z - jx.z; + numtyp xr = jx.x - ix.x; + numtyp yr = jx.y - ix.y; + numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; if (r2>off2) continue; @@ -521,6 +521,7 
@@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + if (factor_repel == (numtyp)0) continue; // intermediates involving moments and separation distance @@ -614,7 +615,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp term1 = vali*valk; numtyp term2 = valk*dir - vali*dkr + dik; numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); - numtyp term4 = dir*qkr - dkr*qir - 4.0*qik; + numtyp term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik; numtyp term5 = qir*qkr; numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8]; @@ -646,6 +647,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; + frcx = sizik * frcx; + frcy = sizik * frcy; + frcz = sizik * frcz; // compute the torque components for this interaction @@ -666,7 +670,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp r4 = r2 * r2; numtyp r5 = r2 * r3; numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0; - numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 + + numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp)4.0*c4*r3 + (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1; dtaper *= e * rr1; e *= taper; @@ -896,10 +900,10 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // ii Date: Wed, 29 Sep 2021 12:32:08 -0500 Subject: [PATCH 069/181] Fixed bugs with damprep where ucl_powr in mixed precision failed with a negative single-reprecision base --- lib/gpu/lal_hippo.cpp | 3 ++- lib/gpu/lal_hippo.cu | 5 +++-- lib/gpu/lal_hippo_extra.h | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 6830847e98..d1b61d5415 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -292,7 +292,8 @@ int HippoT::repulsion(const int eflag, const int vflag) { &this->ans->force, &this->ans->engv, &this->_tep, &eflag, &vflag, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, - &this->_off2_repulse, &_cut2, &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); + &this->_off2_repulse, &_cut2, + &_c0, &_c1, &_c2, &_c3, &_c4, &_c5); this->time_pair.stop(); return GX; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index bf63652a47..fae6cf1681 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -644,13 +644,14 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, term4*qiy + term5*qky + term6*(qiyk+qkyi); numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + term4*qiz + term5*qkz + term6*(qizk+qkzi); + frcx = frcx*rr1 + eterm*rr3*xr; frcy = frcy*rr1 + eterm*rr3*yr; frcz = frcz*rr1 + eterm*rr3*zr; frcx = sizik * frcx; frcy = sizik * frcy; frcz = sizik * frcz; - + // compute the torque components for this interaction numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - @@ -903,7 +904,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); + offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } /* ---------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo_extra.h 
b/lib/gpu/lal_hippo_extra.h index 2afcc963ec..ac02e2e9e8 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -112,7 +112,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, dmpk24 = dmpk23 * dmpk2; dmpk25 = dmpk24 * dmpk2; term = dmpi22 - dmpk22; - pre = (numtyp)8192.0 * dmpi23 * dmpk23 / ucl_powr(term,(numtyp)4.0); + pre = (numtyp)8192.0 * dmpi23 * dmpk23 / (term*term*term*term); //ucl_powr(term,(numtyp)4.0); tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term; s = (dampi-tmp)*expk + (dampk+tmp)*expi; @@ -173,6 +173,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, dmpik[4] = pre * (s*d2s + ds*ds); dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } From e0f91b96fe064a93bd478c250a768aa9eee70ff5 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 29 Sep 2021 13:07:20 -0500 Subject: [PATCH 070/181] Cleaned up and added necessary comments --- src/GPU/pair_amoeba_gpu.cpp | 12 +++++++-- src/GPU/pair_hippo_gpu.cpp | 51 +++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index e1fe1f1097..65a4af7d64 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -192,7 +192,10 @@ void PairAmoebaGPU::init_style() tq_single = true; } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ void PairAmoebaGPU::multipole_real() { @@ -257,6 +260,8 @@ void PairAmoebaGPU::multipole_real() /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; ------------------------------------------------------------------------- */ void PairAmoebaGPU::induce() @@ -986,7 +991,10 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ void PairAmoebaGPU::polar_real() { diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 533abef4d9..014b14471e 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -146,9 +146,9 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) fieldp_pinned = nullptr; tq_pinned = nullptr; - gpu_hal_ready = false; // always false for HIPPO - gpu_repulsion_ready = true; // true for HIPPO when ready - gpu_dispersion_real_ready = true; // true for HIPPO when ready + gpu_hal_ready = false; // always false for HIPPO + gpu_repulsion_ready = true; + gpu_dispersion_real_ready = true; gpu_multipole_real_ready = true; gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; @@ -222,7 +222,10 @@ void PairHippoGPU::init_style() tq_single = true; 
} -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + repulsion = Pauli repulsion interactions + adapted from Tinker erepel1b() routine +------------------------------------------------------------------------- */ void PairHippoGPU::repulsion() { @@ -258,15 +261,15 @@ void PairHippoGPU::repulsion() // set the energy unit conversion factor for multipolar real-space calculation firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, cut2, - c0, c1, c2, c3, c4, c5, &tq_pinned); + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -282,7 +285,10 @@ void PairHippoGPU::repulsion() } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + dispersion_real = real-space portion of Ewald dispersion + adapted from Tinker edreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::dispersion_real() { @@ -330,7 +336,10 @@ void PairHippoGPU::dispersion_real() error->one(FLERR,"Insufficient memory on accelerator"); } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + multipole_real = real-space portion of mulipole interactions + adapted from Tinker emreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::multipole_real() { @@ -395,6 +404,8 @@ void PairHippoGPU::multipole_real() /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine + NOTE: Almost the same in the CPU version, except that there is no need + to call reverse_comm() for crstyle = FIELD; ------------------------------------------------------------------------- */ void PairHippoGPU::induce() @@ -879,6 +890,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). 
// udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -1124,7 +1136,10 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) } } -/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + polar_real = real-space portion of induced dipole polarization + adapted from Tinker epreal1d() routine +------------------------------------------------------------------------- */ void PairHippoGPU::polar_real() { @@ -1187,7 +1202,7 @@ void PairHippoGPU::polar_real() } /* ---------------------------------------------------------------------- - compute atom forces from torques + compute atom forces from torques used by various terms ------------------------------------------------------------------------- */ template @@ -1212,8 +1227,6 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, _tq[2] = tq_ptr[4*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); - //if (i < 10) printf("i = %d: tep = %f %f %f\n", i, _tq[0], _tq[1], _tq[2]); - iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; From 3328ac0df2790aa2c4d16f8088d8450061d60c05 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Oct 2021 09:58:21 -0500 Subject: [PATCH 071/181] Attempted to remove some redundancy in data transfers in the amoeba kernels; keeping HIPPO independent of AMOEBA for now --- lib/gpu/lal_amoeba.cpp | 6 +++--- lib/gpu/lal_amoeba.cu | 5 +++-- lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++-------------- lib/gpu/lal_base_amoeba.h | 8 ++++---- lib/gpu/lal_hippo.cpp | 6 +++--- lib/gpu/lal_hippo.cu | 2 +- src/MAKE/Makefile.mpi | 4 ++-- 7 files changed, 38 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 917166c423..b92e1bfd55 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -62,9 +62,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,amoeba, - "k_amoeba_multipole", - "k_amoeba_udirect2b", "k_amoeba_umutual2b", - "k_amoeba_polar", "k_amoeba_short_nbor"); + "k_amoeba_multipole", "k_amoeba_udirect2b", + "k_amoeba_umutual2b", "k_amoeba_polar", + "k_amoeba_short_nbor", "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fdb959f3e2..befefa8dd0 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1637,12 +1637,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, else do nothing to IJ entry ------------------------------------------------------------------------- */ -__kernel void k_special15(__global int * dev_nbor, +__kernel void k_amoeba_special15(__global int * dev_nbor, const __global int * dev_packed, const __global tagint *restrict tag, const __global int *restrict nspecial15, const __global tagint *restrict special15, - const int inum, const int nall, const int nbor_pitch, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { int tid, ii, offset, n_stride, i; atom_info(t_per_atom,ii,tid,offset); diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5d1b7016da..bb5eb2d53b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -58,7 +58,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const char *k_name_udirect2b, const char *k_name_umutual2b, const char 
*k_name_polar, - const char *k_name_short_nbor) { + const char *k_name_short_nbor, + const char* k_name_special15) { screen=_screen; int gpu_nbor=0; @@ -91,7 +92,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, - k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor); + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -399,24 +401,22 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } atom->add_q_data(); + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, - boxlo, prd); - // re-allocate dev_short_nbor if necessary if (inum_full*(2+_max_nbors) > dev_short_nbor.cols()) { int _nmax=static_cast(static_cast(inum_full)*1.10); @@ -463,13 +463,8 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the kernels are ready, precompute() is needed only once // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
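(One recurring pattern worth noting here: the per-atom result buffers in this series, _tep, _fieldp and dev_short_nbor, are grown with roughly 10% slack, the 1.10 factor, so that small step-to-step fluctuations in the local atom count do not trigger a device reallocation every time. Below is a generic host-side sketch of that idea; grow_with_slack is a hypothetical helper written for illustration, not part of the library:

    #include <vector>

    // Grow a flat per-atom buffer only when needed, padding by ~10% so the
    // next few small increases in the atom count reuse the same allocation.
    template <class T>
    void grow_with_slack(std::vector<T> &buf, int natoms, int values_per_atom) {
      const std::size_t needed =
          static_cast<std::size_t>(natoms) * values_per_atom;
      if (needed <= buf.size()) return;
      const int padded = static_cast<int>(natoms * 1.10);
      buf.resize(static_cast<std::size_t>(padded) * values_per_atom);
    }

    int main() {
      std::vector<double> tep;        // e.g. 4 values per atom, as with _tep
      grow_with_slack(tep, 1000, 4);  // allocates room for ~1100 atoms
      grow_with_slack(tep, 1050, 4);  // still within the slack: no reallocation
      return 0;
    }
)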
int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, @@ -553,6 +548,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -560,6 +556,9 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -627,6 +626,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // and build the neighbor lists if needed int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -634,6 +634,9 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -708,6 +711,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; +/* firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, sublo, subhi, tag, @@ -715,6 +719,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); +*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -829,7 +836,8 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor) { + const char *kname_short_nbor, + const char* kname_special15) { if (_compiled) return; @@ -843,7 +851,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); k_short_nbor.set_function(*pair_program,kname_short_nbor); - k_special15.set_function(*pair_program,"k_special15"); + k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index accb9a5900..6b11e25786 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -56,7 +56,7 @@ class BaseAmoeba { const double gpu_split, FILE *screen, const void *pair_program, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor); + const char *kname_polar, const char *kname_short_nbor, const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void 
estimate_gpu_overhead(const int add_kernels=0); @@ -279,9 +279,9 @@ class BaseAmoeba { numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *kname_multipole, - const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_short_nbor, const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d1b61d5415..9a86be8f42 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,9 +67,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,hippo, - "k_hippo_multipole", - "k_hippo_udirect2b", "k_hippo_umutual2b", - "k_hippo_polar", "k_hippo_short_nbor"); + "k_hippo_multipole", "k_hippo_udirect2b", + "k_hippo_umutual2b", "k_hippo_polar", + "k_hippo_short_nbor", "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index fae6cf1681..cb11bd4022 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2135,7 +2135,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, else do nothing to IJ entry ------------------------------------------------------------------------- */ -__kernel void k_special15(__global int * dev_nbor, +__kernel void k_hippo_special15(__global int * dev_nbor, const __global int * dev_packed, const __global tagint *restrict tag, const __global int *restrict nspecial15, diff --git a/src/MAKE/Makefile.mpi b/src/MAKE/Makefile.mpi index 9776b0153e..e95d80d137 100644 --- a/src/MAKE/Makefile.mpi +++ b/src/MAKE/Makefile.mpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler CC = mpicxx -CCFLAGS = -g -O3 +CCFLAGS = -g -O3 -fopenmp SHFLAGS = -fPIC DEPFLAGS = -M LINK = mpicxx -LINKFLAGS = -g -O3 +LINKFLAGS = -g -O3 -fopenmp LIB = SIZE = size From f126f785a4a7f013d14264c024e100338b3971f8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Oct 2021 10:19:17 -0500 Subject: [PATCH 072/181] Removed duplicates in the amoeba kernels --- lib/gpu/lal_base_amoeba.cpp | 40 ++++++++++++++++++++----------------- lib/gpu/lal_base_amoeba.h | 2 ++ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index bb5eb2d53b..d0631442e0 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -353,20 +353,20 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; + //int eflag, vflag; + if (eatom) _eflag=2; + else if (eflag_in) _eflag=1; + else _eflag=0; + if (vatom) _vflag=2; + else if (vflag_in) _vflag=1; + else _vflag=0; #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; + if (_eflag) _eflag=2; + if (_vflag) _vflag=2; #endif - set_kernel(eflag,vflag); + set_kernel(_eflag,_vflag); // ------------------- Resize 1-5 neighbor arrays 
------------------------ @@ -444,6 +444,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -459,7 +460,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -486,7 +487,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _off2_mpole = off2_mpole; _felec = felec; _aewald = aewald; - const int red_blocks=multipole_real(eflag,vflag); + const int red_blocks=multipole_real(_eflag,_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (polar_real) @@ -528,6 +529,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -543,7 +545,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed @@ -570,7 +572,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=udirect2b(eflag,vflag); + const int red_blocks=udirect2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -606,6 +608,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -621,7 +624,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed @@ -648,7 +651,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=umutual2b(eflag,vflag); + const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -683,6 +686,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { +/* acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -698,7 +702,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, #endif set_kernel(eflag,vflag); - +*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -734,7 +738,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, _off2_polar = off2_polar; _felec = felec; _aewald = aewald; - const int red_blocks=polar_real(eflag,vflag); + const int red_blocks=polar_real(_eflag,_vflag); // only copy answers (forces, energies and virial) back from the device // in the last kernel (which is polar_real here) diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 
6b11e25786..cb040c630d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -278,6 +278,8 @@ class BaseAmoeba { numtyp _aewald,_felec; numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar; + int _eflag, _vflag; + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, From f4d3d3a2b591ac0dee4d982506ae8e880394a922 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 00:09:53 -0500 Subject: [PATCH 073/181] Gradually cleaned up and removed redundancy in amoeba and hippo --- lib/gpu/lal_base_amoeba.cpp | 156 +++----------------------------- lib/gpu/lal_base_amoeba.h | 20 ++-- lib/gpu/lal_hippo.cpp | 176 +++++++----------------------------- lib/gpu/lal_hippo.cu | 4 +- lib/gpu/lal_hippo.h | 1 - lib/gpu/lal_hippo_ext.cpp | 7 +- src/GPU/pair_hippo_gpu.cpp | 6 +- 7 files changed, 61 insertions(+), 309 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d0631442e0..7cd410b6b8 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -245,8 +245,8 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, @@ -353,7 +353,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); - //int eflag, vflag; if (eatom) _eflag=2; else if (eflag_in) _eflag=1; else _eflag=0; @@ -401,12 +400,10 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall if (!success) return nullptr; atom->cast_q_data(host_q); - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); } else { atom->cast_x_data(host_x,host_type); atom->cast_q_data(host_q); - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } @@ -444,23 +441,6 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -499,13 +479,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f 
%f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -529,36 +503,11 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -577,14 +526,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -608,36 +550,11 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -656,14 +573,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -686,44 +596,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void 
**tep_ptr) { -/* - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); -*/ - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. int** firstneigh = nullptr; -/* - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ + cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -750,13 +625,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -826,7 +695,6 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, n += nstride*_nall; if (pval) { - for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; pextra[idx] = pval[i]; @@ -889,9 +757,9 @@ int BaseAmoebaT::add_onefive_neighbors() { k_special15.set_size(GX,BX); k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(), - &atom->dev_tag, &dev_nspecial15, &dev_special15, - &ainum, &_nall, &nbor_pitch, - &_threads_per_atom); + &atom->dev_tag, &dev_nspecial15, &dev_special15, + &ainum, &_nall, &nbor_pitch, + &_threads_per_atom); return GX; } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index cb040c630d..dc3467f692 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -145,14 +145,14 @@ class BaseAmoeba { /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double felec, const double off2_mpole, double *charge, - double *boxlo, double *prd, void **tep_ptr); + int *host_amgroup, double **host_rpole, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, int *nspecial15, tagint **special15, + const bool eflag, const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **numj, const double 
cpu_time, + bool &success, const double aewald, const double felec, + const double off2_mpole, double *charge, double *boxlo, + double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, @@ -165,8 +165,8 @@ class BaseAmoeba { const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, double *charge, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, + double *charge, double *boxlo, double *prd, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9a86be8f42..a5e3be5974 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -48,22 +48,20 @@ int HippoT::bytes_per_atom(const int max_nbors) const { template int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, - const double *host_pdamp, const double *host_thole, - const double *host_dirdamp, const int *host_amtype2class, - const double *host_special_hal, - const double *host_special_repel, - const double *host_special_disp, - const double *host_special_mpole, - const double *host_special_polar_wscale, - const double *host_special_polar_piscale, - const double *host_special_polar_pscale, - const double *host_sizpr, const double *host_dmppr, const double *host_elepr, - const double *host_csix, const double *host_adisp, - const double *host_pcore, const double *host_palpha, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const int maxspecial15, - const double cell_size, const double gpu_split, FILE *_screen, - const double polar_dscale, const double polar_uscale) { + const double *host_pdamp, const double *host_thole, + const double *host_dirdamp, const int *host_amtype2class, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, + const double *host_special_polar_wscale, + const double *host_special_polar_piscale, + const double *host_special_polar_pscale, + const double *host_sizpr, const double *host_dmppr, const double *host_elepr, + const double *host_csix, const double *host_adisp, + const double *host_pcore, const double *host_palpha, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const int maxspecial15, + const double cell_size, const double gpu_split, FILE *_screen, + const double polar_dscale, const double polar_uscale) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15, cell_size,gpu_split,_screen,hippo, @@ -133,9 +131,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<5; i++) { - dview[i].x=host_special_hal[i]; - dview[i].y=host_special_repel[i]; - dview[i].z=host_special_disp[i]; + dview[i].x=host_special_repel[i]; + dview[i].y=host_special_disp[i]; + dview[i].z=(numtyp)0; dview[i].w=(numtyp)0; } ucl_copy(sp_nonpolar,dview,5,false); @@ -211,7 +209,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, // to be able to turn on/off the udirect2b kernel (which comes before this) // Once all the 
kernels are ready, precompute() is needed only once // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here + // We only need to cast the necessary from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. @@ -240,7 +238,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, _c3 = c3; _c4 = c4; _c5 = c5; - const int red_blocks=repulsion(eflag,vflag); + const int red_blocks=repulsion(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -316,32 +314,14 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const double cpu_time, bool &success, const double aewald, const double off2_disp, double *host_q, double *boxlo, double *prd) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. + // We only need to cast necesary data arrays from host to device here + // because the neighbor lists are rebuilt and other per-atom arrays + // (x, type) are ready on the device. 
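The deletions just above are the flag-caching side of this cleanup: precompute() now decides the energy/virial mode once per step and stores it in the _eflag/_vflag members, so every later launch (repulsion, dispersion_real, and the rest) simply reads the cached values instead of repeating the eatom/vatom logic. A minimal stand-alone sketch of that pattern, using simplified stand-in names (BaseSketch, launch_polar_real, main) that are not part of the library:

// Sketch of the flag-caching pattern, with plain types instead of the
// library's template machinery.
#include <cstdio>

class BaseSketch {
 public:
  // Called once at the start of a step: decide the energy/virial mode a
  // single time and cache it for every kernel launched afterwards.
  void precompute(bool eflag_in, bool vflag_in, bool eatom, bool vatom) {
    _eflag = eatom ? 2 : (eflag_in ? 1 : 0);
    _vflag = vatom ? 2 : (vflag_in ? 1 : 0);
  }

  // Later per-term entry points reuse the cached flags instead of
  // recomputing them from the host arguments.
  void compute_polar_real() { launch_polar_real(_eflag, _vflag); }

 private:
  // stand-in for an actual kernel launch
  void launch_polar_real(int eflag, int vflag) {
    std::printf("polar_real launched with eflag=%d vflag=%d\n", eflag, vflag);
  }
  int _eflag = 0, _vflag = 0;
};

int main() {
  BaseSketch b;
  b.precompute(true, false, false, false);  // energy requested, no per-atom tallies
  b.compute_polar_real();
}

Keeping the decision in one place also keeps the LAL_NO_BLOCK_REDUCE override consistent across all of the kernels instead of repeating it in each entry point.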
int** firstneigh = nullptr; firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, @@ -350,11 +330,11 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, nspecial, special, nspecial15, special15, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + success, host_q, boxlo, prd); this->_off2_disp = off2_disp; this->_aewald = aewald; - const int red_blocks=dispersion_real(eflag,vflag); + const int red_blocks=dispersion_real(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -427,22 +407,6 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: @@ -474,7 +438,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_off2_mpole = off2_mpole; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=multipole_real(eflag,vflag); + const int red_blocks=multipole_real(this->_eflag,this->_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (this one, or polar_real once done) @@ -486,13 +450,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } @@ -558,22 +516,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed @@ -596,19 +538,12 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=udirect2b(eflag,vflag); + const int red_blocks=udirect2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -673,22 +608,6 @@ 
int** HippoT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer extra data from the host // and build the neighbor lists if needed @@ -711,19 +630,12 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=umutual2b(eflag,vflag); + const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); -/* - printf("GPU lib: _fieldp size = %d: max fieldp size = %d\n", - this->_fieldp.cols(), _max_fieldp_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_fieldp[4*i]); - printf("i = %d; field = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; //nbor->host_jlist.begin()-host_start; } @@ -786,29 +698,11 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - this->acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host // and build the neighbor lists if needed // NOTE: // For now we invoke precompute() again here, // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. // We only need to cast uind and uinp from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. 
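For context on the sp_nonpolar repacking a few hunks up (and the matching .x/.y reads in lal_hippo.cu further below): the scale factors for unscreened, 1-2, 1-3, 1-4, and 1-5 pairs are packed into one vec4-style table so a single load per neighbor yields the factors for that neighbor's special class, which the kernels select with the mask that sbmask15() recovers from the packed neighbor index. A small host-only illustration; the scale values and the bit layout of the mask below are invented for the example (the real encoding is set up during the neighbor build, not here):

#include <array>
#include <cstdio>

struct vec4 { double x, y, z, w; };

int main() {
  // host tables indexed by special class: 0 = not special, 1..4 = 1-2, 1-3, 1-4, 1-5
  // (example values only)
  const double special_repel[5] = {1.0, 0.0, 0.0, 1.0, 1.0};
  const double special_disp[5]  = {1.0, 0.0, 0.0, 0.4, 0.8};

  // pack the two factor tables into one vec4 table, as sp_nonpolar does after
  // this change: .x = repel, .y = disp (the .z/.w slots are left unused)
  std::array<vec4, 5> sp_nonpolar{};
  for (int i = 0; i < 5; ++i)
    sp_nonpolar[i] = {special_repel[i], special_disp[i], 0.0, 0.0};

  // each packed neighbor entry carries its special class in high bits; here the
  // class is simply stored in bits 29..31 for the sake of the example
  auto sbmask15_demo = [](int jextra) { return (jextra >> 29) & 7; };

  int jextra = (3 << 29) | 1234;  // neighbor index 1234, special class 3 (a 1-4 pair)
  const vec4 sp = sp_nonpolar[sbmask15_demo(jextra)];
  std::printf("factor_repel=%g factor_disp=%g\n", sp.x, sp.y);
}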
@@ -833,7 +727,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, this->_off2_polar = off2_polar; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=polar_real(eflag,vflag); + const int red_blocks=polar_real(this->_eflag,this->_vflag); // only copy answers (forces, energies and virial) back from the device // in the last kernel (which is polar_real here) @@ -845,13 +739,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); -/* - printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size); - for (int i = 0; i < 10; i++) { - numtyp4* p = (numtyp4*)(&this->_tep[4*i]); - printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z); - } -*/ + return firstneigh; // nbor->host_jlist.begin()-host_start; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index cb11bd4022..f38a9f4ac0 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -520,7 +520,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp valk = coeff_rep[jtype].z; // elepr[jtype]; const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; - numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)]; + numtyp factor_repel = sp_nonpol.x; // factor_repel = special_repel[sbmask15(j)]; if (factor_repel == (numtyp)0) continue; // intermediates involving moments and separation distance @@ -830,7 +830,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp damp3,damp5; numtyp ddamp; const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)]; - numtyp factor_disp = sp_nonpol.z; // factor_disp = special_disp[sbmask15(j)]; + numtyp factor_disp = sp_nonpol.y; // factor_disp = special_disp[sbmask15(j)]; if (ai != ak) { ai2 = ai * ai; diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index ceab20d17b..9941460bff 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -41,7 +41,6 @@ class Hippo : public BaseAmoeba { const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_mpole, - const double *host_special_hal, const double *host_special_repel, const double *host_special_disp, const double *host_special_polar_wscale, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 15cb53cdb1..4152833320 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -30,7 +30,6 @@ static Hippo HIPPOMF; int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, - const double *host_special_hal, const double *host_special_repel, const double *host_special_disp, const double *host_special_mpole, @@ -71,8 +70,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass if (world_me==0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_hal, - host_special_repel, host_special_disp, + host_amtype2class, host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_sizpr, host_dmppr, host_elepr, @@ -97,8 +95,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass if (gpu_rank==i && world_me!=0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, 
host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_hal, - host_special_repel, host_special_disp, + host_amtype2class, host_special_repel, host_special_disp, host_special_mpole, host_special_polar_wscale, host_special_polar_piscale, host_special_polar_pscale, host_sizpr, host_dmppr, host_elepr, diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 014b14471e..dcdac836bd 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -53,8 +53,8 @@ enum{GORDON1,GORDON2}; int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int* host_amtype2class, - const double *host_special_hal, const double *host_special_repel, - const double *host_special_disp, const double *host_special_mpole, + const double *host_special_repel, const double *host_special_disp, + const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, @@ -203,7 +203,7 @@ void PairHippoGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, special_hal, + pdamp, thole, dirdamp, amtype2class, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, sizpr, dmppr, elepr, From 5a6426bf96b2aa8d69d8e4580460b82a48d7573c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 00:56:15 -0500 Subject: [PATCH 074/181] Only transfer data arrays that are needed in each kernel --- lib/gpu/lal_base_amoeba.cpp | 53 ++++++++++--------- lib/gpu/lal_hippo.cpp | 102 ++++++++++-------------------------- 2 files changed, 55 insertions(+), 100 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 7cd410b6b8..c56cb77aa3 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -350,8 +350,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd) { + bool &success, double *host_q, double *boxlo, double *prd) { acc_timers(); if (eatom) _eflag=2; else if (eflag_in) _eflag=1; @@ -509,7 +508,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -647,30 +646,34 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, int n = 0; int nstride = 4; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; - } + if (rpole) { + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][0]; + pextra[idx+1] = rpole[i][1]; + pextra[idx+2] = rpole[i][2]; + pextra[idx+3] = rpole[i][3]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = rpole[i][6]; - pextra[idx+3] = rpole[i][8]; - } + n += nstride*_nall; 
+ for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][4]; + pextra[idx+1] = rpole[i][5]; + pextra[idx+2] = rpole[i][6]; + pextra[idx+3] = rpole[i][8]; + } - n += nstride*_nall; - for (int i = 0; i < _nall; i++) { - int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + n += nstride*_nall; + for (int i = 0; i < _nall; i++) { + int idx = n+i*nstride; + pextra[idx] = rpole[i][9]; + pextra[idx+1] = rpole[i][12]; + pextra[idx+2] = (numtyp)amtype[i]; + pextra[idx+3] = (numtyp)amgroup[i]; + } + } else { + n += 2*nstride*_nall; } n += nstride*_nall; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index a5e3be5974..5a348c9272 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -314,23 +314,12 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, const double cpu_time, bool &success, const double aewald, const double off2_disp, double *host_q, double *boxlo, double *prd) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast necesary data arrays from host to device here - // because the neighbor lists are rebuilt and other per-atom arrays - // (x, type) are ready on the device. - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr); + this->atom->add_extra_data(); this->_off2_disp = off2_disp; this->_aewald = aewald; @@ -344,7 +333,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full, this->hd_balancer.stop_timer(); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -407,25 +396,11 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. 
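The cast_extra_data() hunk above is what makes the nullptr calls in this commit safe: when an array is already resident on the device its section of the host-side staging buffer is left untouched, but the running offset still advances past it, so the sections that do get refilled (uind/uinp, pval) keep their fixed positions. A reduced sketch of that offset bookkeeping, with the 13-component rpole packing collapsed to a single placeholder value per atom:

#include <vector>
#include <cstdio>

// rpole occupies three 4-wide sections when present, uind one section;
// skipped sections advance the offset without being written.
void pack_extra(std::vector<double> &pextra, int nall,
                const double *rpole, const double *uind) {
  const int nstride = 4;
  int n = 0;

  if (rpole) {
    for (int s = 0; s < 3; ++s)
      for (int i = 0; i < nall; ++i)
        pextra[n + s*nstride*nall + i*nstride] = rpole[i];  // placeholder packing
  }
  n += 3*nstride*nall;   // advance past the rpole sections either way

  if (uind)
    for (int i = 0; i < nall; ++i)
      pextra[n + i*nstride] = uind[i];
  n += nstride*nall;
}

int main() {
  const int nall = 2;
  std::vector<double> pextra(4*4*nall, 0.0);
  double uind[nall] = {7.0, 8.0};
  pack_extra(pextra, nall, nullptr, uind);   // rpole already on the device: skipped
  std::printf("uind lands at offset %d -> %g\n", 3*4*nall, pextra[3*4*nall]);
}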
- int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, nullptr, nullptr, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -451,7 +426,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -516,17 +491,11 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // all the necessary data arrays are already copied from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -544,7 +513,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -608,17 +577,11 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, const double aewald, const double off2_polar, double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer extra data from the host - // and build the neighbor lists if needed - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _fieldp array ------------------------ @@ -636,7 +599,7 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, this->_fieldp.update_host(this->_max_fieldp_size*8,false); - return firstneigh; //nbor->host_jlist.begin()-host_start; + return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -698,23 +661,12 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, const double aewald, const double 
felec, const double off2_polar, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // We only need to cast uind and uinp from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - host_uind, host_uinp, host_pval, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); + // cast necessary data arrays from host to device + + //this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); + this->atom->add_extra_data(); // ------------------- Resize _tep array ------------------------ @@ -740,7 +692,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; + return nullptr; } // --------------------------------------------------------------------------- From 0f0f6a51de796caeeece16b9eb77f299a4672866 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 2 Oct 2021 16:02:44 -0500 Subject: [PATCH 075/181] Renamed sp_polar to sp_amoeba, and replaced special_wscale with special_hal for amoeba --- lib/gpu/lal_amoeba.cpp | 41 ++++++++++++++++------------------------- lib/gpu/lal_amoeba.cu | 24 ++++++++++++------------ lib/gpu/lal_amoeba.h | 17 ++++++----------- 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index b92e1bfd55..924a175cfe 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -103,30 +103,21 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, ucl_copy(coeff_amclass,host_write2,false); UCL_H_Vec dview(5, *(this->ucl_device), UCL_WRITE_ONLY); - sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); + sp_amoeba.alloc(5,*(this->ucl_device),UCL_READ_ONLY); for (int i=0; i<5; i++) { - dview[i].x=host_special_polar_wscale[i]; + dview[i].x=host_special_hal[i]; dview[i].y=host_special_polar_piscale[i]; dview[i].z=host_special_polar_pscale[i]; dview[i].w=host_special_mpole[i]; } - ucl_copy(sp_polar,dview,5,false); - - sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY); - for (int i=0; i<5; i++) { - dview[i].x=host_special_hal[i]; - dview[i].y=host_special_repel[i]; - dview[i].z=host_special_disp[i]; - dview[i].w=(numtyp)0; - } - ucl_copy(sp_nonpolar,dview,5,false); + ucl_copy(sp_amoeba,dview,5,false); _polar_dscale = polar_dscale; _polar_uscale = polar_uscale; _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + + sp_amoeba.row_bytes() + this->_tep.row_bytes(); return 0; } @@ -138,8 +129,7 @@ void AmoebaT::clear() { coeff_amtype.clear(); coeff_amclass.clear(); - sp_polar.clear(); - sp_nonpolar.clear(); + sp_amoeba.clear(); this->clear_atomic(); } @@ -177,13 +167,14 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { 
&nbor_pitch, &this->_threads_per_atom); this->k_multipole.set_size(GX,BX); - this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &this->_tep, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, &this->_felec, - &this->_off2_mpole, &_polar_dscale, &_polar_uscale); + this->k_multipole.run(&this->atom->x, &this->atom->extra, + &coeff_amtype, &sp_amoeba, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, &this->_tep, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, &this->_felec, + &this->_off2_mpole, &_polar_dscale, &_polar_uscale); this->time_pair.stop(); return GX; @@ -218,7 +209,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } this->k_udirect2b.set_size(GX,BX); - this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, @@ -258,7 +249,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } this->k_umutual2b.set_size(GX,BX); - this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall, &nbor_pitch, &this->_threads_per_atom, &this->_aewald, @@ -297,7 +288,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { } this->k_polar.set_size(GX,BX); - this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar, + this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_amoeba, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &this->_tep, diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index befefa8dd0..f29522084d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include +//#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -412,7 +412,7 @@ _texture( q_tex,int2); __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -518,7 +518,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, int jtype = pol3j.z; // amtype[j]; int jgroup = pol3j.w; // amgroup[j]; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; // intermediates involving moments and separation distance @@ -713,7 +713,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict 
sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -824,12 +824,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, int jgroup = pol3j.w; // amgroup[j]; numtyp factor_dscale, factor_pscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_pscale = sp_pol.y; // sp_amoeba_piscale[sbmask15(jextra)]; factor_dscale = polar_dscale; } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_pscale = sp_pol.z; // sp_amoeba_pscale[sbmask15(jextra)]; factor_dscale = (numtyp)1.0; } @@ -931,7 +931,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -1105,7 +1105,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, const __global numtyp *restrict extra, const __global numtyp4 *restrict coeff, - const __global numtyp4 *restrict sp_polar, + const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, const __global int *dev_packed, const __global int *dev_short_nbor, @@ -1257,13 +1257,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ukzp = pol5j.z; // uinp[j][2]; numtyp factor_dscale, factor_pscale, factor_uscale; - const numtyp4 sp_pol = sp_polar[sbmask15(jextra)]; + const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; if (igroup == jgroup) { - factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)]; + factor_pscale = sp_pol.y; // sp_amoeba_piscale[sbmask15(jextra)]; factor_dscale = polar_dscale; factor_uscale = polar_uscale; } else { - factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)]; + factor_pscale = sp_pol.z; // sp_amoeba_pscale[sbmask15(jextra)]; factor_dscale = factor_uscale = (numtyp)1.0; } diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index df72435b81..d12b79719f 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -70,17 +70,12 @@ class Amoeba : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// Special polar values [0-4]: - /// sp_polar.x = special_polar_wscale - /// sp_polar.y special_polar_pscale, - /// sp_polar.z = special_polar_piscale - /// sp_polar.w = special_mpole - UCL_D_Vec sp_polar; - /// Special nonpolar values [0-4]: - /// sp_nonpolar.x = special_hal - /// sp_nonpolar.y special_repel - /// sp_nonpolar.z = special_disp - UCL_D_Vec sp_nonpolar; + /// Special amoeba values [0-4]: + /// sp_amoeba.x = special_hal + /// sp_amoeba.y = special_polar_pscale, + /// sp_amoeba.z = special_polar_piscale + /// sp_amoeba.w = special_mpole + UCL_D_Vec sp_amoeba; /// If atom type constants fit in shared memory, use fast kernels bool shared_types; From 79fbbd4f33ad0eb42c5f182929c509d162a2d0d9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Oct 2021 14:40:58 -0500 Subject: [PATCH 076/181] Cleaned up the API of amoeba and hippo to remove unncessary arguments --- 
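The cleanup below follows from the earlier commits in this series: once precompute() is the only entry point that builds neighbor lists and ships positions, tags, and special lists to the device, the per-term wrappers only need the AMOEBA-specific per-atom arrays plus a few scalars, so their argument lists shrink accordingly. A rough, illustrative view of the resulting per-step call order on the host, using simplified stand-in functions rather than the real signatures in lal_amoeba_ext.cpp:

#include <cstdio>

// stand-ins for the slimmed GPU entry points
void gpu_precompute()     { std::printf("neighbor build + x/type/q transfer\n"); }
void gpu_multipole_real() { std::printf("multipole real space\n"); }
void gpu_udirect2b()      { std::printf("direct permanent field\n"); }
void gpu_umutual2b()      { std::printf("mutual induced field (per solver iteration)\n"); }
void gpu_polar_real()     { std::printf("polar real space, forces copied back\n"); }

int main() {
  // one precompute per step supplies neighbors and per-atom data for every
  // later kernel, so the per-term calls no longer re-send positions, tags,
  // or special-neighbor lists
  gpu_precompute();
  gpu_multipole_real();
  gpu_udirect2b();
  for (int iter = 0; iter < 3; ++iter)  // induced-dipole iterations (count is illustrative)
    gpu_umutual2b();
  gpu_polar_real();
}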
lib/gpu/lal_amoeba_ext.cpp | 64 +++++------------- lib/gpu/lal_base_amoeba.cpp | 84 ++++------------------- lib/gpu/lal_base_amoeba.h | 39 +++-------- lib/gpu/lal_hippo.cpp | 98 +++++---------------------- lib/gpu/lal_hippo.h | 63 ++++-------------- lib/gpu/lal_hippo_ext.cpp | 85 +++++++----------------- src/GPU/pair_amoeba_gpu.cpp | 90 +++++-------------------- src/GPU/pair_hippo_gpu.cpp | 129 ++++++++---------------------------- 8 files changed, 135 insertions(+), 517 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index b73f6c4ca6..18e1cf22f8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -134,58 +134,28 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } -int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); } -int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const double aewald, const double off2, void **fieldp_ptr) { + AMOEBAMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + aewald, off2, fieldp_ptr); } -int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double 
**host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + AMOEBAMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, nullptr, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c56cb77aa3..5b396a641e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -487,35 +487,15 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // of the permanent field // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, +void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - - int** firstneigh = nullptr; + // all the necessary data arrays are already copied from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - _fieldp.resize(_max_fieldp_size*8); - } + *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; @@ -525,8 +505,6 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - - return firstneigh; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -534,35 +512,15 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, // of the induced field // --------------------------------------------------------------------------- 
template -int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { - // reallocate per-atom arrays, transfer extra data from the host - // and build the neighbor lists if needed - - int** firstneigh = nullptr; + // all the necessary data arrays are already copied from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>_max_fieldp_size) { - _max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - _fieldp.resize(_max_fieldp_size*8); - } *fieldp_ptr=_fieldp.host.begin(); _off2_polar = off2_polar; @@ -572,41 +530,25 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) _fieldp.update_host(_max_fieldp_size*8,false); - - return firstneigh; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_polar, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { +void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, + double **host_rpole, double **host_uind, + double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - // ------------------- Resize _tep array ------------------------ - - if (inum_full>_max_tep_size) { - _max_tep_size=static_cast(static_cast(inum_full)*1.10); - _tep.resize(_max_tep_size*4); - } *tep_ptr=_tep.host.begin(); _off2_polar = off2_polar; @@ -624,8 +566,6 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); - - return firstneigh; // nbor->host_jlist.begin()-host_start; } template diff --git a/lib/gpu/lal_base_amoeba.h 
b/lib/gpu/lal_base_amoeba.h index dc3467f692..7f9777061c 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -155,45 +155,22 @@ class BaseAmoeba { double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *charge, double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, void **fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_polar, double *charge, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2_polar, void **fieldp_ptr); /// Compute polar real-space with device neighboring - virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, + const bool eatom, const bool vatom, const double aewald, const double felec, const double off2_polar, - double *charge, double *boxlo, double *prd, void **tep_ptr); + void **tep_ptr); /// Compute polar real-space with host neighboring (not active for now) void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 5a348c9272..f62c46aaec 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -301,19 +301,9 @@ int HippoT::repulsion(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int 
*host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_disp, - double *host_q, double *boxlo, double *prd) { +int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp) { // cast necessary data arrays from host to device @@ -475,21 +465,9 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // of the permanent field // --------------------------------------------------------------------------- template -int** HippoT::compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double* host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, +void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double* host_pval, const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, void** fieldp_ptr) { // all the necessary data arrays are already copied from host to device @@ -497,12 +475,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, host_pval); this->atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>this->_max_fieldp_size) { - this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - this->_fieldp.resize(this->_max_fieldp_size*8); - } *fieldp_ptr=this->_fieldp.host.begin(); this->_off2_polar = off2_polar; @@ -512,8 +484,6 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); - - return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -562,33 +532,16 @@ int HippoT::udirect2b(const int eflag, const int vflag) { // of the induced field // --------------------------------------------------------------------------- template -int** HippoT::compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr) { +void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double 
**host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr) { // cast necessary data arrays from host to device this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - // ------------------- Resize _fieldp array ------------------------ - - if (inum_full>this->_max_fieldp_size) { - this->_max_fieldp_size=static_cast(static_cast(inum_full)*1.10); - this->_fieldp.resize(this->_max_fieldp_size*8); - } *fieldp_ptr=this->_fieldp.host.begin(); this->_off2_polar = off2_polar; @@ -598,8 +551,6 @@ int** HippoT::compute_umutual2b(const int ago, const int inum_full, // copy field and fieldp from device to host (_fieldp store both arrays, one after another) this->_fieldp.update_host(this->_max_fieldp_size*8,false); - - return nullptr; //nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -646,34 +597,17 @@ int HippoT::umutual2b(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_polar, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { - +void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, + const double off2_polar, void **tep_ptr) { // cast necessary data arrays from host to device - //this->cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - // ------------------- Resize _tep array ------------------------ - - if (inum_full>this->_max_tep_size) { - this->_max_tep_size=static_cast(static_cast(inum_full)*1.10); - this->_tep.resize(this->_max_tep_size*4); - } *tep_ptr=this->_tep.host.begin(); this->_off2_polar = off2_polar; @@ -691,8 +625,6 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - - return nullptr; } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 9941460bff..492712eb85 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -72,16 +72,9 @@ class Hippo : public BaseAmoeba { double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint 
**special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, - const double aewald, const double off2_disp, double *charge, - double *boxlo, double *prd); + int** compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2_disp); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -96,51 +89,23 @@ class Hippo : public BaseAmoeba { double *boxlo, double *prd, void **tep_ptr); /// Compute the real space part of the permanent field (udirect2b) with device neighboring - virtual int** compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, + virtual void compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double* host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr); + const double aewald, const double off2_polar, void** fieldp_ptr); /// Compute the real space part of the induced field (umutual2b) with device neighboring - virtual int** compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_polar, - double *host_q, double *boxlo, double *prd, - void** fieldp_ptr); + virtual void compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2_polar, + void** fieldp_ptr); /// Compute polar real-space with device neighboring - virtual int** compute_polar_real(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, double *host_pval, double *sublo, double *subhi, - tagint *tag, int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success, + virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, const double aewald, const double felec, const double off2_polar, - double *charge, double *boxlo, double *prd, void **tep_ptr); + void **tep_ptr); /// Clear all host and device data /** \note This is called 
at the beginning of the init() routine **/ diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 4152833320..9d3d845ad0 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -140,21 +140,11 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, cut2, c0, c1, c2, c3, c4, c5, tep_ptr); } -int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd) { - return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, sublo, subhi, - tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd); +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, + double **host_rpole, const double aewald, + const double off2) { + HIPPOMF.compute_dispersion_real(host_amtype, host_amgroup, host_rpole, + aewald, off2); } int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, @@ -174,58 +164,29 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); } -int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - return HIPPOMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_udirect2b(host_amtype, host_amgroup, host_rpole, + host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); } -int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr) { - 
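// Editor's note -- not part of the patch: a minimal sketch of how a caller reads
// the pinned buffer handed back through the void** argument of the trimmed
// compute_udirect2b()/compute_umutual2b() interfaces above.  The library packs
// field and fieldp back to back, 4 doubles per atom each (only the first three
// components are read back); _tep is packed the same way with 4 doubles per atom.
// The names nlocal, fieldp_pinned, field and fieldp are the caller-side variables
// used later in pair_amoeba_gpu.cpp / pair_hippo_gpu.cpp.
double *buf = static_cast<double *>(fieldp_pinned);
for (int i = 0; i < nlocal; i++) {          // first block: field
  field[i][0] += buf[4*i];
  field[i][1] += buf[4*i+1];
  field[i][2] += buf[4*i+2];
}
buf += 4*nlocal;                            // second block: fieldp
for (int i = 0; i < nlocal; i++) {
  fieldp[i][0] += buf[4*i];
  fieldp[i][1] += buf[4*i+1];
  fieldp[i][2] += buf[4*i+2];
}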
return HIPPOMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr); +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const double aewald, const double off2, void **fieldp_ptr) { + HIPPOMF.compute_umutual2b(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + aewald, off2, fieldp_ptr); } -int** hippo_gpu_compute_polar_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, - double **host_rpole, double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return HIPPOMF.compute_polar_real(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, - sublo, subhi, tag, nspecial, special, nspecial15, special15, - eflag, vflag, eatom, vatom, host_start, ilist, jnum, - cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr); +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + const double aewald, const double felec, const double off2, + void **tep_ptr) { + HIPPOMF.compute_polar_real(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval, + eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } double hippo_gpu_bytes() { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 65a4af7d64..ea7c40793c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -74,35 +74,19 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tq_ptr); -int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, 
double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tq_ptr); + const double aewald, const double felec, const double off2, + void **tq_ptr); double amoeba_gpu_bytes(); @@ -345,14 +329,7 @@ void PairAmoebaGPU::induce() } } } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 10; i++) { - printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", - i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); - } -*/ + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -731,17 +708,8 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs @@ -933,10 +901,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -956,17 +921,8 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + 
amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, + off2, &fieldp_pinned); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1005,10 +961,7 @@ void PairAmoebaGPU::polar_real() int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1032,18 +985,9 @@ void PairAmoebaGPU::polar_real() double felec = 0.5 * electric / am_dielectric; - firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); // reference to the tep array from GPU lib diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index dcdac836bd..0d77c67e10 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -80,16 +80,8 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr); -int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd); +void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, + const double aewald, const double off2); int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, @@ -100,35 +92,19 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tq_ptr); -int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, - double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + double *host_pval, const double aewald, const double off2, void **fieldp_ptr); -int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, 
const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, double *host_q, - double *boxlo, double *prd, void **fieldp_ptr); + const double aewald, const double off2, void **fieldp_ptr); -int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *host_amtype, int *host_amgroup, +void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double felec, const double off2, - double *host_q, double *boxlo, double *prd, void **tq_ptr); + const double aewald, const double felec, const double off2, + void **tq_ptr); double hippo_gpu_bytes(); @@ -301,7 +277,6 @@ void PairHippoGPU::dispersion_real() int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; int *ilist, *numneigh, **firstneigh; double sublo[3],subhi[3]; @@ -322,18 +297,7 @@ void PairHippoGPU::dispersion_real() if (use_dewald) choose(DISP_LONG); else choose(DISP); - firstneigh = hippo_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_dispersion_real(amtype, amgroup, rpole, aewald, off2); } /* ---------------------------------------------------------------------- @@ -377,15 +341,15 @@ void PairHippoGPU::multipole_real() double felec = electric / am_dielectric; - firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, pval, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, pval, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -854,9 +818,6 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; - int *ilist, *numneigh, **firstneigh; - 
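// Editor's note (assumption summarizing the refactor around these declarations
// and call sites; nothing below is new patch code): positions, tags, special-bond
// lists and the device neighbor list are now transferred and built once per step
// by the precompute/multipole_real path, so the later real-space terms only pass
// the per-atom AMOEBA/HIPPO data plus the Ewald parameters.  The former per-call
// guard of the form
//
//   if (!success) error->one(FLERR,"Insufficient memory on accelerator");
//
// is dropped along with the bool because these trimmed calls no longer
// (re)allocate device buffers themselves.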
double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -875,17 +836,8 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, pval, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs @@ -1078,10 +1030,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1101,17 +1050,9 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - firstneigh = hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - uind, uinp, pval, sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success,aewald, off2, atom->q, - domain->boxlo, domain->prd, &fieldp_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, + aewald, off2, &fieldp_pinned); + // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1150,10 +1091,7 @@ void PairHippoGPU::polar_real() int eflag=1, vflag=1; int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - bool success = true; - int *ilist, *numneigh, **firstneigh; + int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1177,18 +1115,9 @@ void PairHippoGPU::polar_real() double felec = 0.5 * electric / am_dielectric; - firstneigh = hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, - rpole, uind, uinp, pval, sublo, subhi, - atom->tag, atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); - - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + hippo_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, pval, + eflag, vflag, eflag_atom, vflag_atom, + aewald, felec, off2, &tq_pinned); // reference to the tep array from GPU lib From f4900d131ac828fca7b811fd98f85e276e6a0f70 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 1 Jul 2022 16:26:25 -0500 Subject: [PATCH 077/181] Working on the multipole term on the gpu side, incorrect virials --- lib/gpu/lal_amoeba.cu | 24 +++++++-------- src/GPU/pair_amoeba_gpu.cpp | 58 +++++++------------------------------ 2 files changed, 23 insertions(+), 59 deletions(-) diff --git 
a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f29522084d..f91e973c9b 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -668,9 +668,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, // increment force-based gradient and torque on first site - f.x += frcx; - f.y += frcy; - f.z += frcz; + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; tq.x += ttmix; tq.y += ttmiy; tq.z += ttmiz; @@ -683,12 +683,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor @@ -1597,9 +1597,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, frcy = frcy + depy; frcz = frcz + depz; - f.x -= frcx; - f.y -= frcy; - f.z -= frcz; + f.x += frcx; + f.y += frcy; + f.z += frcz; if (EVFLAG && vflag) { numtyp vxx = xr * frcx; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index dcbbc01185..fb9e8ef7e3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,9 +104,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // true for AMOEBA when ready gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA - gpu_multipole_real_ready = true; - gpu_udirect2b_ready = true; - gpu_umutual2b_ready = true; + gpu_multipole_real_ready = false; + gpu_udirect2b_ready = false; + gpu_umutual2b_ready = false; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); @@ -262,26 +262,16 @@ void PairAmoebaGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - double *poli; - double **conj,**conjp; - double **vec,**vecp; - double **udir,**usum,**usump; - int debug = 1; // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - if (use_ewald) { - choose(POLAR_LONG); - int nmine = p_kspace->nfft_owned; - memory->create(qfac,nmine,"ameoba/induce:qfac"); - } else choose(POLAR); + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); // owned atoms - double **x = atom->x; - double **f = atom->f; int nlocal = atom->nlocal; // zero out the induced dipoles at each site @@ -293,19 +283,6 @@ void PairAmoebaGPU::induce() } } - // allocation of arrays - // NOTE: not all are used by all methods - // NOTE: could be re-allocated dynamically - - memory->create(poli,nlocal,"ameoba/induce:poli"); - memory->create(conj,nlocal,3,"ameoba/induce:conj"); - memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); - memory->create(vec,nlocal,3,"ameoba/induce:vec"); - memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); - memory->create(udir,nlocal,3,"ameoba/induce:udir"); - memory->create(usum,nlocal,3,"ameoba/induce:usum"); - memory->create(usump,nlocal,3,"ameoba/induce:usump"); - // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); @@ -572,8 +549,6 @@ void PairAmoebaGPU::induce() } } - // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
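// Editor's sketch (assumption about how the gpu_*_ready flags set above are meant
// to be used; the override below is hypothetical, not patch code): each flag lets
// one real-space term fall back to the reference CPU implementation in the
// PairAmoeba base class while its kernel is still being debugged, e.g.
//
//   void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
//   {
//     if (!gpu_udirect2b_ready) {              // term not ready: use the CPU path
//       PairAmoeba::udirect2b(field, fieldp);
//       return;
//     }
//     ...                                      // otherwise call the GPU library
//   }
//
// which is why individual flags are toggled between commits in this series
// without affecting the correctness of the overall polarization solve.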
- reduce[0] = b; reduce[1] = bp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); @@ -633,17 +608,6 @@ void PairAmoebaGPU::induce() error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } - // deallocation of arrays - - memory->destroy(poli); - memory->destroy(conj); - memory->destroy(conjp); - memory->destroy(vec); - memory->destroy(vecp); - memory->destroy(udir); - memory->destroy(usum); - memory->destroy(usump); - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -1047,12 +1011,12 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - virial_comp[0] += vxx; - virial_comp[1] += vyy; - virial_comp[2] += vzz; - virial_comp[3] += vxy; - virial_comp[4] += vxz; - virial_comp[5] += vyz; + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; } } From 5dab809522927bc98a727f92a479c9c935d892c7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Jul 2022 01:38:22 -0500 Subject: [PATCH 078/181] Flipped force sign in polar_real, made sure that multipole_real is true for precompute() to be invoked, ubdirect2b() is segfault and needs work --- lib/gpu/lal_amoeba.cu | 12 ++++++------ src/GPU/pair_amoeba_gpu.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index f91e973c9b..a40f8314a5 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1609,12 +1609,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz); numtyp vzz = zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fb9e8ef7e3..56c621c4dc 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -104,9 +104,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_hal_ready = false; // true for AMOEBA when ready gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA - gpu_multipole_real_ready = false; - gpu_udirect2b_ready = false; - gpu_umutual2b_ready = false; + gpu_multipole_real_ready = true; // need to be true for precompute() + gpu_udirect2b_ready = false; // NEED work + gpu_umutual2b_ready = true; gpu_polar_real_ready = true; GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); From ee5afdc1468a79614b1743f62ab09baea9887814 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 4 Jul 2022 23:24:31 -0500 Subject: [PATCH 079/181] Updated all the gpu ready terms --- src/GPU/pair_amoeba_gpu.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 56c621c4dc..3d601fef88 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -105,9 +105,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; // need to be 
true for precompute() - gpu_udirect2b_ready = false; // NEED work + gpu_udirect2b_ready = true; gpu_umutual2b_ready = true; - gpu_polar_real_ready = true; + gpu_polar_real_ready = true; // need to be true for copying data from device back to host GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } From 83288666117b11b78d5094d74e3c2866266b1f8e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Jul 2022 11:02:31 -0500 Subject: [PATCH 080/181] Added checks for the gpu variant of pair amoeba/hippo in improper/amoeba and fix amoeba/bitorsion --- src/AMOEBA/fix_amoeba_bitorsion.cpp | 2 ++ src/AMOEBA/improper_amoeba.cpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index 85a87b0452..28e055a0fe 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -195,7 +195,9 @@ void FixAmoebaBiTorsion::init() pair = nullptr; pair = force->pair_match("amoeba",1,0); + if (!pair) pair = force->pair_match("amoeba/gpu",1,0); if (!pair) pair = force->pair_match("hippo",1,0); + if (!pair) pair = force->pair_match("hippo/gpu",1,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index 2b39214642..3ff4978f0f 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -286,7 +286,9 @@ void ImproperAmoeba::init_style() Pair *pair = NULL; pair = force->pair_match("amoeba",1,0); + if (!pair) pair = force->pair_match("amoeba/gpu",1,0); if (!pair) pair = force->pair_match("hippo",1,0); + if (!pair) pair = force->pair_match("hippo/gpu",1,0); if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; From 675c2d38a3017217b662e2c516d39ea5e64ac13a Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 5 Jul 2022 14:37:26 -0500 Subject: [PATCH 081/181] Flipped sign of forces and virial terms in the hippo kernels --- lib/gpu/lal_amoeba.cu | 3 +- lib/gpu/lal_hippo.cu | 73 ++++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index a40f8314a5..3b50feb6ed 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1642,8 +1642,7 @@ __kernel void k_amoeba_special15(__global int * dev_nbor, const __global tagint *restrict tag, const __global int *restrict nspecial15, const __global tagint *restrict special15, - const int inum, const int nall, - const int nbor_pitch, + const int inum, const int nall, const int nbor_pitch, const int t_per_atom) { int tid, ii, offset, n_stride, i; atom_info(t_per_atom,ii,tid,offset); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index f38a9f4ac0..b47e2d50e3 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -687,9 +687,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, // increment force-based gradient and torque on atom I - f.x += frcx; - f.y += frcy; - f.z += frcz; + f.x -= frcx; + f.y -= frcy; + f.z -= frcz; tq.x += ttmix; tq.y += ttmiy; tq.z += ttmiz; @@ -703,12 +703,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz); numtyp vzz = -zr * frcz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vxy; - virial[4] += vxz; - virial[5] += vyz; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vxy; + virial[4] -= vxz; + virial[5] -= vyz; } } // nbor @@ 
-877,9 +877,9 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp dedx = de * xr; numtyp dedy = de * yr; numtyp dedz = de * zr; - f.x += dedx; - f.y += dedy; - f.z += dedz; + f.x -= dedx; + f.y -= dedy; + f.z -= dedz; // increment the internal virial tensor components @@ -890,12 +890,12 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp vzy = zr * dedy; numtyp vzz = zr * dedz; - virial[0] += vxx; - virial[1] += vyy; - virial[2] += vzz; - virial[3] += vyx; - virial[4] += vzx; - virial[5] += vzy; + virial[0] -= vxx; + virial[1] -= vyy; + virial[2] -= vzz; + virial[3] -= vyx; + virial[4] -= vzx; + virial[5] -= vzy; } // nbor } // ii> SBBITS & 3; int j = sj & NEIGHMASK; tagint jtag = tag[j]; + if (!which) { int offset=ii; for (int k=0; k Date: Wed, 6 Jul 2022 11:17:08 -0500 Subject: [PATCH 082/181] Removed temporary arrays in hippo/gpu induce, flipped sign of the viriral terms in torque2force in hippo/gpu --- src/GPU/pair_amoeba_gpu.cpp | 8 +++--- src/GPU/pair_hippo_gpu.cpp | 51 +++++++------------------------------ 2 files changed, 12 insertions(+), 47 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3d601fef88..fd9d99e56c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -647,9 +647,6 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) int nall = atom->nlocal + atom->nghost; int inum, host_start; - bool success = true; - int *ilist, *numneigh, **firstneigh; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -674,6 +671,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() + // so no need to call ubdirect2b_cpu(). 
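// Editor's note on the packed neighbor indices used by k_hippo_special15 /
// k_amoeba_special15 above (explanatory restatement of existing conventions,
// not new patch code): the GPU neighbor build stores the special-bond class of
// each neighbor in the top bits of its index, so kernels unpack both pieces as
//
//   int which = sj >> SBBITS & 3;   // 0 = ordinary, 1/2/3 = 1-2 / 1-3 / 1-4 neighbor
//   int j     = sj & NEIGHMASK;     // low bits: the actual neighbor index
//
// The special15 kernel appears to walk the nspecial15/special15 lists and tag the
// matching entries so that the polar kernels can apply the separate 1-5 scale
// factors without a host-side pass over the neighbor list.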
// udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib @@ -881,8 +879,8 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) if (use_ewald) choose(POLAR_LONG); else choose(POLAR); - amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, - off2, &fieldp_pinned); + amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, + aewald, off2, &fieldp_pinned); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 55712b3250..535be0c160 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -388,11 +388,6 @@ void PairHippoGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - double *poli; - double **conj,**conjp; - double **vec,**vecp; - double **udir,**usum,**usump; - int debug = 1; // set cutoffs, taper coeffs, and PME params @@ -419,24 +414,11 @@ void PairHippoGPU::induce() } } - // allocation of arrays - // NOTE: not all are used by all methods - // NOTE: could be re-allocated dynamically - - memory->create(poli,nlocal,"ameoba/induce:poli"); - memory->create(conj,nlocal,3,"ameoba/induce:conj"); - memory->create(conjp,nlocal,3,"ameoba/induce:conjp"); - memory->create(vec,nlocal,3,"ameoba/induce:vec"); - memory->create(vecp,nlocal,3,"ameoba/induce:vecp"); - memory->create(udir,nlocal,3,"ameoba/induce:udir"); - memory->create(usum,nlocal,3,"ameoba/induce:usum"); - memory->create(usump,nlocal,3,"ameoba/induce:usump"); - // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); - // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only + // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only if (!gpu_udirect2b_ready) { crstyle = FIELD; @@ -705,8 +687,6 @@ void PairHippoGPU::induce() } } - // NOTE: comp of b,bp and allreduce only needed if pcgprec ? 
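// Editor's sketch (assumption; the exact communication call depends on the LAMMPS
// version, so treat this as illustrative only): when dfield0c()/udirect2b() runs
// on the CPU it also accumulates field contributions on ghost atoms, which have to
// be summed back onto the owning ranks before the induced-dipole iteration
// continues -- hence the guard begun above:
//
//   if (!gpu_udirect2b_ready) {
//     crstyle = FIELD;
//     comm->reverse_comm(this);   // hypothetical: fold ghost contributions back
//   }
//
// With the GPU path enabled the pinned buffer is only read for owned atoms, so
// this reverse communication can be skipped.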
- reduce[0] = b; reduce[1] = bp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); @@ -763,20 +743,9 @@ void PairHippoGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"hippo induced dipoles did not converge"); + error->warning(FLERR,"HIPPO induced dipoles did not converge"); } - // deallocation of arrays - - memory->destroy(poli); - memory->destroy(conj); - memory->destroy(conjp); - memory->destroy(vec); - memory->destroy(vecp); - memory->destroy(udir); - memory->destroy(usum); - memory->destroy(usump); - // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used @@ -835,7 +804,6 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) else choose(POLAR); double *pval = atom->dvector[index_pval]; - hippo_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); @@ -1051,10 +1019,9 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) else choose(POLAR); double *pval = atom->dvector[index_pval]; - hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); - + // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1183,12 +1150,12 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr, vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); - virial_comp[0] += vxx; - virial_comp[1] += vyy; - virial_comp[2] += vzz; - virial_comp[3] += vxy; - virial_comp[4] += vxz; - virial_comp[5] += vyz; + virial_comp[0] -= vxx; + virial_comp[1] -= vyy; + virial_comp[2] -= vzz; + virial_comp[3] -= vxy; + virial_comp[4] -= vxz; + virial_comp[5] -= vyz; } } From 0c44bd10862016f21870bc585ac148db3996cd38 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 8 Jul 2022 14:45:31 -0500 Subject: [PATCH 083/181] Rearranged the order of real-space and kspace part of ufield0c(), delayed device-host transfer from umutual2b() to overlap with kspace part --- lib/gpu/lal_amoeba_ext.cpp | 4 +++ lib/gpu/lal_base_amoeba.cpp | 4 +-- lib/gpu/lal_base_amoeba.h | 7 ++++ lib/gpu/lal_hippo.cpp | 4 +-- lib/gpu/lal_hippo_ext.cpp | 4 +++ src/GPU/pair_amoeba_gpu.cpp | 71 ++++++++++++++++++++++++++++++++++++- src/GPU/pair_amoeba_gpu.h | 1 + src/GPU/pair_hippo_gpu.cpp | 71 ++++++++++++++++++++++++++++++++++++- src/GPU/pair_hippo_gpu.h | 1 + 9 files changed, 161 insertions(+), 6 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 18e1cf22f8..63ed683833 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -148,6 +148,10 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double ** aewald, off2, fieldp_ptr); } +void amoeba_gpu_update_fieldp(void **fieldp_ptr) { + AMOEBAMF.update_fieldp(fieldp_ptr); +} + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag_in, const bool vflag_in, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5b396a641e..781945b77b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -528,8 +528,8 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - - 
_fieldp.update_host(_max_fieldp_size*8,false); + // NOTE: move this step to update_fieldp() to delay device-host transfer + //_fieldp.update_host(_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 7f9777061c..f439e2945f 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -183,6 +183,13 @@ class BaseAmoeba { const double off2_polar, double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr); + // copy field and fieldp from device to host after umutual2b + virtual void update_fieldp(void **fieldp_ptr) { + *fieldp_ptr=_fieldp.host.begin(); + // _fieldp store both arrays, one after another + _fieldp.update_host(_max_fieldp_size*8,false); + } + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f62c46aaec..3065bfefd4 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -549,8 +549,8 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) - - this->_fieldp.update_host(this->_max_fieldp_size*8,false); + // NOTE: move this step to update_fieldp() to delay device-host transfer + //this->_fieldp.update_host(this->_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 9d3d845ad0..e7deaddbf3 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -179,6 +179,10 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **h aewald, off2, fieldp_ptr); } +void hippo_gpu_update_fieldp(void **fieldp_ptr) { + HIPPOMF.update_fieldp(fieldp_ptr); +} + void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const bool eflag_in, const bool vflag_in, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fd9d99e56c..1376a6bd12 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -82,6 +82,8 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const double aewald, const double off2, void **fieldp_ptr); +void amoeba_gpu_update_fieldp(void **fieldp_ptr); + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -844,6 +846,72 @@ void PairAmoebaGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::ufield0c(double **field, double **fieldp) +{ + int i,j; + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the 
real space portion of the mutual field first + + if (polar_rspace_flag) umutual2b(field,fieldp); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*uind[i][j]; + fieldp[i][j] += term*uinp[i][j]; + } + } + + // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + amoeba_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced @@ -881,7 +949,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); - +/* // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -903,6 +971,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } +*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e0210faa68..e419ccd1a1 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void ufield0c(double **, double **); virtual void polar_real(); private: diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 535be0c160..41c1355fbb 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -100,6 +100,8 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2, void **fieldp_ptr); +void hippo_gpu_update_fieldp(void **fieldp_ptr); + void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -983,6 +985,72 @@ void PairHippoGPU::udirect2b_cpu() } } +/* ---------------------------------------------------------------------- + ufield0c = mutual induction via Ewald sum + ufield0c computes the mutual electrostatic field due to + induced dipole moments via Ewald summation +------------------------------------------------------------------------- */ + +void PairHippoGPU::ufield0c(double **field, double **fieldp) +{ + int i,j; + double term; + + // zero field,fieldp for owned and ghost atoms + + int nlocal = 
atom->nlocal; + int nall = nlocal + atom->nghost; + + for (i = 0; i < nall; i++) { + for (j = 0; j < 3; j++) { + field[i][j] = 0.0; + fieldp[i][j] = 0.0; + } + } + + // get the real space portion of the mutual field first + + if (polar_rspace_flag) umutual2b(field,fieldp); + + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + + // add the self-energy portion of the mutual field + + term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] += term*uind[i][j]; + fieldp[i][j] += term*uinp[i][j]; + } + } + + // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU + // field and fieldp may already have some nonzero values from kspace (umutual1 and self) + + hippo_gpu_update_fieldp(&fieldp_pinned); + + int inum = atom->nlocal; + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced @@ -1021,7 +1089,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) double *pval = atom->dvector[index_pval]; hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); - +/* // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (umutual1) @@ -1043,6 +1111,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } +*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index c7a4e75ebe..1ed1c3299d 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -40,6 +40,7 @@ class PairHippoGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual2b(double **, double **); + virtual void ufield0c(double **, double **); virtual void polar_real(); private: From 66ee2bf98973519cb8711d1732879905d8180a2d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 14 Jul 2022 11:01:30 -0500 Subject: [PATCH 084/181] Cleaned up --- lib/gpu/lal_base_amoeba.cpp | 5 ++--- lib/gpu/lal_hippo.cpp | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 781945b77b..6f65c8c934 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -521,15 +521,14 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - *fieldp_ptr=_fieldp.host.begin(); - _off2_polar = off2_polar; _aewald = aewald; const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer - 
//_fieldp.update_host(_max_fieldp_size*8,false); + // *fieldp_ptr=_fieldp.host.begin(); + // _fieldp.update_host(_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 3065bfefd4..79a8772c3e 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -542,15 +542,14 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos this->cast_extra_data(nullptr, nullptr, nullptr, host_uind, host_uinp, nullptr); this->atom->add_extra_data(); - *fieldp_ptr=this->_fieldp.host.begin(); - this->_off2_polar = off2_polar; this->_aewald = aewald; const int red_blocks=umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer - //this->_fieldp.update_host(this->_max_fieldp_size*8,false); + // *fieldp_ptr=this->_fieldp.host.begin(); + // this->_fieldp.update_host(this->_max_fieldp_size*8,false); } // --------------------------------------------------------------------------- From 288fd5add4de3f2aa4c7c8d98990a0fa92af440c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 19 Jul 2022 15:18:17 -0500 Subject: [PATCH 085/181] Updated the python scripts under tools/tinker to the latest version in develop --- tools/tinker/data.py | 146 ++++++++++++++++++------------------- tools/tinker/tinker2lmp.py | 113 ++++++++++++++-------------- 2 files changed, 130 insertions(+), 129 deletions(-) diff --git a/tools/tinker/data.py b/tools/tinker/data.py index 40d6582814..b75536da93 100644 --- a/tools/tinker/data.py +++ b/tools/tinker/data.py @@ -3,16 +3,17 @@ # # Copyright (2005) Sandia Corporation. Under the terms of Contract # DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains -# certain rights in this software. This software is distributed under +# certain rights in this software. This software is distributed under # the GNU General Public License. 
# data tool +from __future__ import print_function oneline = "Read, write, manipulate LAMMPS data files" docstr = """ d = data("data.poly") read a LAMMPS data file, can be gzipped -d = data() create an empty data file +d = data() create an empty data file d.map(1,"id",3,"x") assign names to atom columns (1-N) @@ -26,17 +27,17 @@ d.reorder("Atoms",1,3,2,4,5) reorder columns (1-N) in a data file section 1,3,2,4,5 = new order of previous columns, can delete columns this way -d.title = "My LAMMPS data file" set title of the data file +d.title = "My LAMMPS data file" set title of the data file d.headers["atoms"] = 1500 set a header value d.sections["Bonds"] = lines set a section to list of lines (with newlines) -d.delete("bonds") delete a keyword or section of data file +d.delete("bonds") delete a keyword or section of data file d.delete("Bonds") -d.replace("Atoms",5,vec) replace Nth column of section with vector -d.newxyz(dmp,1000) replace xyz in Atoms with xyz of snapshot N +d.replace("Atoms",5,vec) replace Nth column of section with vector +d.newxyz(dmp,1000) replace xyz in Atoms with xyz of snapshot N newxyz assumes id,x,y,z are defined in both data and dump files also replaces ix,iy,iz if they are defined - + index,time,flag = d.iterator(0/1) loop over single data file snapshot time,box,atoms,bonds,tris,lines = d.viz(index) return list of viz objects @@ -53,7 +54,7 @@ time,box,atoms,bonds,tris,lines = d.viz(index) return list of viz objects NULL if bonds do not exist tris = NULL lines = NULL - + d.write("data.new") write a LAMMPS data file """ @@ -65,7 +66,7 @@ d.write("data.new") write a LAMMPS data file # Variables # title = 1st line of data file -# names = dictionary with atom attributes as keys, col #s as values +# names = dictionary with atom attributes as keys, col #s as values # headers = dictionary with header name as key, value or tuple as values # sections = dictionary with section name as key, array of lines as values # nselect = 1 = # of snapshots @@ -79,13 +80,13 @@ except: PIZZA_GUNZIP = "gunzip" # Class definition -class data: +class data(object): # -------------------------------------------------------------------- def __init__(self,*list): self.nselect = 1 - + if len(list) == 0: self.title = "LAMMPS data file" self.names = {} @@ -99,7 +100,7 @@ class data: self.title = f.readline() self.names = {} - + headers = {} while 1: line = f.readline() @@ -109,16 +110,16 @@ class data: found = 0 for keyword in hkeywords: if line.find(keyword) >= 0: - found = 1 - words = line.split() - if keyword == "xlo xhi" or keyword == "ylo yhi" or \ - keyword == "zlo zhi": - headers[keyword] = (float(words[0]),float(words[1])) - elif keyword == "xy xz yz": - headers[keyword] = \ + found = 1 + words = line.split() + if keyword == "xlo xhi" or keyword == "ylo yhi" or \ + keyword == "zlo zhi": + headers[keyword] = (float(words[0]),float(words[1])) + elif keyword == "xy xz yz": + headers[keyword] = \ (float(words[0]),float(words[1]),float(words[2])) else: - headers[keyword] = int(words[0]) + headers[keyword] = int(words[0]) if not found: break @@ -128,22 +129,21 @@ class data: for pair in skeywords: keyword,length = pair[0],pair[1] if keyword == line: - found = 1 - if not headers.has_key(length): - raise StandardError, \ - "data section %s has no matching header value" % line - f.readline() + found = 1 + if length not in headers: + raise (Exception, "data section %s has no matching header value" % line) + f.readline() list = [] - for i in xrange(headers[length]): list.append(f.readline()) + for i 
in range(headers[length]): list.append(f.readline()) sections[keyword] = list if not found: - raise StandardError,"invalid section %s in data file" % line + raise (Exception,"invalid section %s in data file" % line) f.readline() line = f.readline() if not line: break line = line.strip() - + f.close() self.headers = headers self.sections = sections @@ -153,7 +153,7 @@ class data: def map(self,*pairs): if len(pairs) % 2 != 0: - raise StandardError, "data map() requires pairs of mappings" + raise Exception("data map() requires pairs of mappings") for i in range(0,len(pairs),2): j = i + 1 self.names[pairs[j]] = pairs[i]-1 @@ -168,7 +168,7 @@ class data: lines = self.sections[field] for line in lines: words = line.split() - values = map(float,words) + values = list(map(float,words)) array.append(values) return array elif len(list) == 2: @@ -181,7 +181,7 @@ class data: vec.append(float(words[n])) return vec else: - raise StandardError, "invalid arguments for data.get()" + raise Exception("invalid arguments for data.get()") # -------------------------------------------------------------------- # reorder columns in a data file field @@ -192,10 +192,10 @@ class data: oldlines = self.sections[name] newlines = natoms*[""] for index in order: - for i in xrange(len(newlines)): + for i in range(len(newlines)): words = oldlines[i].split() newlines[i] += words[index-1] + " " - for i in xrange(len(newlines)): + for i in range(len(newlines)): newlines[i] += "\n" self.sections[name] = newlines @@ -206,7 +206,7 @@ class data: lines = self.sections[name] newlines = [] j = icol - 1 - for i in xrange(len(lines)): + for i in range(len(lines)): line = lines[i] words = line.split() words[j] = str(vector[i]) @@ -228,48 +228,48 @@ class data: self.replace("Atoms",self.names['x']+1,x) self.replace("Atoms",self.names['y']+1,y) self.replace("Atoms",self.names['z']+1,z) - - if dm.names.has_key("ix") and self.names.has_key("ix"): + + if "ix" in dm.names and "ix" in self.names: ix,iy,iz = dm.vecs(ntime,"ix","iy","iz") self.replace("Atoms",self.names['ix']+1,ix) self.replace("Atoms",self.names['iy']+1,iy) self.replace("Atoms",self.names['iz']+1,iz) - + # -------------------------------------------------------------------- # delete header value or section from data file def delete(self,keyword): - if self.headers.has_key(keyword): del self.headers[keyword] - elif self.sections.has_key(keyword): del self.sections[keyword] - else: raise StandardError, "keyword not found in data object" + if keyword in self.headers: del self.headers[keyword] + elif keyword in self.sections: del self.sections[keyword] + else: raise Exception("keyword not found in data object") # -------------------------------------------------------------------- # write out a LAMMPS data file def write(self,file): f = open(file,"w") - print >>f,self.title - + print(self.title, file=f) + # write any keywords in standard list hkeywords # in the order they are in hkeywords # then write any extra keywords at end of header section - + for keyword in hkeywords: - if self.headers.has_key(keyword): + if keyword in self.headers: if keyword == "xlo xhi" or keyword == "ylo yhi" or \ keyword == "zlo zhi": - pair = self.headers[keyword] - print >>f,pair[0],pair[1],keyword + pair = self.headers[keyword] + print(pair[0],pair[1],keyword, file=f) elif keyword == "xy xz yz": - triple = self.headers[keyword] - print >>f,triple[0],triple[1],triple[2],keyword + triple = self.headers[keyword] + print(triple[0],triple[1],triple[2],keyword, file=f) else: - print 
>>f,self.headers[keyword],keyword + print(self.headers[keyword],keyword, file=f) - for keyword in self.headers.keys(): + for keyword in list(self.headers.keys()): if keyword not in hkeywords: - print >>f,self.headers[keyword],keyword + print(self.headers[keyword],keyword, file=f) # write any sections in standard list skeywords # in the order they are in skeywords @@ -277,18 +277,18 @@ class data: for pair in skeywords: keyword = pair[0] - if self.sections.has_key(keyword): - print >>f,"\n%s\n" % keyword + if keyword in self.sections: + print("\n%s\n" % keyword, file=f) for line in self.sections[keyword]: - print >>f,line, + print(line, end='', file=f) skeyfirst = [pair[0] for pair in skeywords] - - for keyword in self.sections.keys(): + + for keyword in list(self.sections.keys()): if keyword not in skeyfirst: - print >>f,"\n%s\n" % keyword + print("\n%s\n" % keyword, file=f) for line in self.sections[keyword]: - print >>f,line, + print(line, end='', file=f) f.close() @@ -304,20 +304,20 @@ class data: def findtime(self,n): if n == 0: return 0 - raise StandardError, "no step %d exists" % (n) - + raise(Exception, "no step %d exists" % (n)) + # -------------------------------------------------------------------- # return list of atoms and bonds to viz for data object def viz(self,isnap): - if isnap: raise StandardError, "cannot call data.viz() with isnap != 0" - + if isnap: raise Exception("cannot call data.viz() with isnap != 0") + id = self.names["id"] type = self.names["type"] x = self.names["x"] y = self.names["y"] z = self.names["z"] - + xlohi = self.headers["xlo xhi"] ylohi = self.headers["ylo yhi"] zlohi = self.headers["zlo zhi"] @@ -336,7 +336,7 @@ class data: # assumes atoms are sorted so can lookup up the 2 atoms in each bond bonds = [] - if self.sections.has_key("Bonds"): + if "Bonds" in self.sections: bondlines = self.sections["Bonds"] for line in bondlines: words = line.split() @@ -349,8 +349,8 @@ class data: float(atom1words[z]), float(atom2words[x]),float(atom2words[y]), float(atom2words[z]), - float(atom1words[type]),float(atom2words[type])]) - + float(atom1words[type]),float(atom2words[type])]) + tris = [] lines = [] return 0,box,atoms,bonds,tris,lines @@ -375,8 +375,8 @@ class data: hkeywords = ["atoms","ellipsoids","lines","triangles","bodies", "bonds","angles","dihedrals","impropers", - "atom types","bond types","angle types","dihedral types", - "improper types", + "atom types","bond types","angle types","dihedral types", + "improper types", "xlo xhi","ylo yhi","zlo zhi","xy xz yz"] skeywords = [["Masses","atom types"], @@ -384,14 +384,14 @@ skeywords = [["Masses","atom types"], ["Lines","lines"],["Triangles","triangles"],["Bodies","bodies"], ["Velocities","atoms"], ["Bonds","bonds"], - ["Angles","angles"], + ["Angles","angles"], ["Dihedrals","dihedrals"], - ["Impropers","impropers"], + ["Impropers","impropers"], ["Pair Coeffs","atom types"], - ["Bond Coeffs","bond types"], + ["Bond Coeffs","bond types"], ["Angle Coeffs","angle types"], - ["Dihedral Coeffs","dihedral types"], - ["Improper Coeffs","improper types"], + ["Dihedral Coeffs","dihedral types"], + ["Improper Coeffs","improper types"], ["BondBond Coeffs","angle types"], ["BondAngle Coeffs","angle types"], ["MiddleBondTorsion Coeffs","dihedral types"], diff --git a/tools/tinker/tinker2lmp.py b/tools/tinker/tinker2lmp.py index 565d3f23fe..fe80be9a14 100644 --- a/tools/tinker/tinker2lmp.py +++ b/tools/tinker/tinker2lmp.py @@ -15,6 +15,7 @@ # Author: Steve Plimpton +from __future__ import print_function import 
sys,os,math from data import data @@ -29,20 +30,20 @@ DELTA = 0.001 # delta on LAMMPS shrink-wrap box size, in Angstroms def error(txt=""): if not txt: - print "Syntax: tinker2lmp.py -switch args ..." - print " -xyz file" - print " -amoeba file" - print " -hippo file" - print " -data file" - print " -bitorsion file" - print " -nopbc" - print " -pbc xhi yhi zhi" - else: print "ERROR:",txt + print("Syntax: tinker2lmp.py -switch args ...") + print(" -xyz file") + print(" -amoeba file") + print(" -hippo file") + print(" -data file") + print(" -bitorsion file") + print(" -nopbc") + print(" -pbc xhi yhi zhi") + else: print("ERROR:",txt) #sys.exit() # read and store values from a Tinker xyz file -class XYZfile: +class XYZfile(object): def __init__(self,file): lines = open(file,'r').readlines() header = lines[0] @@ -212,7 +213,7 @@ class XYZfile: def output(self,outfile): fp = open(outfile,'w') words = self.header.split() - print >>fp,self.natoms,"replicated",' '.join(words[1:]) + print(self.natoms,"replicated",' '.join(words[1:]), file=fp) id = self.id label = self.label @@ -225,9 +226,9 @@ class XYZfile: # NOTE: worry about formatting of line for i in range(self.natoms): - print >>fp,i+1,label[i],x[i],y[i],z[i],type[i], - for j in bonds[i]: print >>fp,j, - print >>fp + print(i+1,label[i],x[i],y[i],z[i],type[i], end=' ', file=fp) + for j in bonds[i]: print(j, end=' ', file=fp) + print(file=fp) fp.close() @@ -255,7 +256,7 @@ class XYZfile: # scalar force field params in Force Field Definition section # bond, angle, dihedral coeffs indexed by Tinker classes -class PRMfile: +class PRMfile(object): def __init__(self,file): lines = open(file,'r').readlines() self.nlines = len(lines) @@ -519,7 +520,7 @@ class PRMfile: error("torsion does not have triplets of params: %d %d %d %d" % \ (class1,class2,class3,class4)) - mfourier = (len(words)-5) / 3 + mfourier = int((len(words)-5)/3) oneparams = [class1,class2,class3,class4,mfourier] for iset in range(mfourier): @@ -743,7 +744,7 @@ if pbcflag: else: xlo = ylo = zlo = BIG xhi = yhi = zhi = -BIG - for i in xrange(natoms): + for i in range(natoms): xlo = min(xlo,float(x[i])) ylo = min(ylo,float(y[i])) zlo = min(zlo,float(z[i])) @@ -1097,11 +1098,11 @@ for i,one in enumerate(alist): nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) if nbonds != 3: - print "Center angle atom has wrong bond count" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount + print("Center angle atom has wrong bond count") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) #sys.exit() NOTE: allow this for now if hcount == 0: which = 1 @@ -1109,22 +1110,22 @@ for i,one in enumerate(alist): which = 2 m += 1 - print "3-bond angle" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount - print " which:",which,m + print("3-bond angle") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) + print(" which:",which,m) elif len(params[3]) == 3: nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) 
if nbonds != 4: - print "Center angle atom has wrong bond count" - print " angle atom IDs:",atom1,atom2,atom3 - print " angle atom classes:",c1,c2,c3 - print " Tinker FF file param options:",len(params[3]) - print " Nbonds and hydrogen count:",nbonds,hcount + print("Center angle atom has wrong bond count") + print(" angle atom IDs:",atom1,atom2,atom3) + print(" angle atom classes:",c1,c2,c3) + print(" Tinker FF file param options:",len(params[3])) + print(" Nbonds and hydrogen count:",nbonds,hcount) #sys.exit() NOTE: allow this for now if hcount == 0: which = 1 @@ -1170,7 +1171,7 @@ for itype in range(len(aparams)): elif (c3,c2,c1) in badict: n1,n2,r1,r2 = badict[(c3,c2,c1)] else: - print "Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3) + print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) n1,n2,r1,r2 = 4*[0.0] baparams.append((n1,n2,r1,r2)) @@ -1600,17 +1601,17 @@ if nbitorsions: nbitorsions) fp = open(bitorsionfile,'w') - print >>fp,"Tinker BiTorsion parameter file for fix bitorsion\n" - print >>fp,"%d bitorsion types" % len(bitorsionparams) + print("Tinker BiTorsion parameter file for fix bitorsion\n", file=fp) + print("%d bitorsion types" % len(bitorsionparams), file=fp) itype = 0 for nx,ny,array in bitorsionparams: itype += 1 - print >>fp - print >>fp,itype,nx,ny + print(file=fp) + print(itype,nx,ny, file=fp) for ix in range(nx): for iy in range(ny): xgrid,ygrid,value = array[ix][iy] - print >>fp," ",xgrid,ygrid,value + print(" ",xgrid,ygrid,value, file=fp) fp.close() lines = [] @@ -1624,21 +1625,21 @@ d.write(datafile) # print stats to screen -print "Natoms =",natoms -print "Ntypes =",ntypes -print "Tinker XYZ types =",len(tink2lmp) -print "Tinker PRM types =",prm.ntypes +print("Natoms =",natoms) +print("Ntypes =",ntypes) +print("Tinker XYZ types =",len(tink2lmp)) +print("Tinker PRM types =",prm.ntypes) #print "Tinker groups =",ngroups -print "Nmol =",nmol -print "Nbonds =",nbonds -print "Nangles =",nangles -print "Ndihedrals =",ndihedrals -print "Nimpropers =",nimpropers -print "Npitorsions =",npitorsions -print "Nbitorsions =",nbitorsions -print "Nbondtypes =",len(bparams) -print "Nangletypes =",len(aparams) -print "Ndihedraltypes =",len(dparams) -print "Nimpropertypes =",len(oparams) -print "Npitorsiontypes =",len(pitorsionparams) -print "Nbitorsiontypes =",len(bitorsionparams) +print("Nmol =",nmol) +print("Nbonds =",nbonds) +print("Nangles =",nangles) +print("Ndihedrals =",ndihedrals) +print("Nimpropers =",nimpropers) +print("Npitorsions =",npitorsions) +print("Nbitorsions =",nbitorsions) +print("Nbondtypes =",len(bparams)) +print("Nangletypes =",len(aparams)) +print("Ndihedraltypes =",len(dparams)) +print("Nimpropertypes =",len(oparams)) +print("Npitorsiontypes =",len(pitorsionparams)) +print("Nbitorsiontypes =",len(bitorsionparams)) From 93784f35e329c4068e7b904b0da27edf0b6a2bdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 25 Jul 2022 15:34:44 -0500 Subject: [PATCH 086/181] Added ucl_erfc to the opencl, cuda and hip backends; reverted to using erfc instead of approximation to ensure double-precision matches --- lib/gpu/lal_amoeba.cu | 22 ++++++++++++++++------ lib/gpu/lal_hippo.cu | 20 +++++++++++++++----- lib/gpu/lal_pre_cuda_hip.h | 8 +++++--- lib/gpu/lal_preprocessor.h | 1 + src/GPU/pair_amoeba_gpu.cpp | 23 ----------------------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 3b50feb6ed..d445305bb2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ 
-607,10 +607,13 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -800,7 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -850,10 +853,12 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1005,7 +1010,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1031,10 +1036,12 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1298,10 +1305,13 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index b47e2d50e3..4f31650f73 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1124,10 +1124,13 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -1400,10 +1403,12 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1551,7 +1556,7 @@ __kernel void k_hippo_umutual2b(const __global 
numtyp4 *restrict x_, //if (r2>off2) continue; numtyp r = ucl_sqrt(r2); - numtyp rinv = ucl_recip(r); + numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; numtyp rr1 = rinv; numtyp rr3 = rr1 * r2inv; @@ -1589,10 +1594,12 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; for (int m = 1; m <= 3; m++) { @@ -1838,10 +1845,13 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - //bn[0] = erfc(ralpha) / r; bn[0] = _erfc * rinv; + */ + bn[0] = ucl_erfc(ralpha) * rinv; + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h index 47a005b998..03c4fce85e 100644 --- a/lib/gpu/lal_pre_cuda_hip.h +++ b/lib/gpu/lal_pre_cuda_hip.h @@ -179,12 +179,15 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrt #define ucl_sqrt sqrt -#define ucl_recip(x) ((numtyp)1.0/(x)) +#define ucl_erfc erfc #else +#define ucl_exp expf +#define ucl_powr powf #define ucl_atan atanf #define ucl_cbrt cbrtf #define ucl_ceil ceilf @@ -192,8 +195,7 @@ #define ucl_recip(x) ((numtyp)1.0/(x)) #define ucl_rsqrt rsqrtf #define ucl_sqrt sqrtf -#define ucl_exp expf -#define ucl_powr powf +#define ucl_erfc erfcf #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 2ef8af0911..c734e67b98 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -166,6 +166,7 @@ #define ucl_cbrt cbrt #define ucl_ceil ceil #define ucl_abs fabs +#define ucl_erfc erfc #if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 1376a6bd12..3b0268f6b4 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -949,29 +949,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); -/* - // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (umutual1) - - int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } - - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } -*/ } /* ---------------------------------------------------------------------- From a6066bab4d7dce10f985c1ec24b3bf45f0de0b82 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 29 Jul 2022 13:01:57 -0500 Subject: [PATCH 087/181] Called the induce real-space term before the kspace term --- src/AMOEBA/amoeba_induce.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index d78e2d3262..1fce9be736 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -576,14 +576,14 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } - // get the reciprocal space part of the mutual field - - if (polar_kspace_flag) umutual1(field,fieldp); - // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + // get the reciprocal space part of the mutual field + + if (polar_kspace_flag) umutual1(field,fieldp); + // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; From e980838ae2a6c8218175edb77d003f5801abe2ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 2 Aug 2022 16:45:06 -0500 Subject: [PATCH 088/181] Added timings for real-space and k-space portions for the terms --- src/AMOEBA/amoeba_induce.cpp | 24 +++++++++++++++++++ src/AMOEBA/amoeba_multipole.cpp | 12 ++++++++++ src/AMOEBA/amoeba_polar.cpp | 12 ++++++++++ src/AMOEBA/pair_amoeba.cpp | 42 +++++++++++++++++++++++++++++++++ src/AMOEBA/pair_amoeba.h | 5 ++++ src/GPU/pair_amoeba_gpu.cpp | 12 ++++++++++ 6 files changed, 107 insertions(+) diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 1fce9be736..01491a8708 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -564,6 +564,8 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -576,13 +578,18 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } + MPI_Barrier(world); + time0 = MPI_Wtime(); + // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field @@ -593,6 +600,11 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) fieldp[i][j] += term*uinp[i][j]; } } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- @@ -801,6 +813,8 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero out field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -815,7 +829,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_kspace_flag) udirect1(field); + time1 = MPI_Wtime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -826,6 +844,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); + time2 = MPI_Wtime(); // get the self-energy portion of the permanent field @@ -836,6 +855,11 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) fieldp[i][j] += term*rpole[i][j+1]; } } + + // accumulate timing information + + time_direct_kspace += time1 - time0; + time_direct_rspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 886a64f150..603de5884d 100644 --- 
a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -54,6 +54,8 @@ void PairAmoeba::multipole() double qixx,qixy,qixz,qiyy,qiyz,qizz; double cii,dii,qii; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(MPOLE_LONG); @@ -77,13 +79,18 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; + MPI_Barrier(world); + time0 = MPI_Wtime(); + // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); + time1 = MPI_Wtime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); + time2 = MPI_Wtime(); // compute the Ewald self-energy term over all the atoms @@ -108,6 +115,11 @@ void PairAmoeba::multipole() e = fterm * (cii + term*(dii/3.0+2.0*term*qii/5.0)); empole += e; } + + // accumulate timing information + + time_mpole_rspace += time1 - time0; + time_mpole_kspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index 646d045504..6312de77e9 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -52,6 +52,8 @@ void PairAmoeba::polar() double fix[3],fiy[3],fiz[3]; double tep[3]; + double time0,time1,time2; + // set cutoffs, taper coeffs, and PME params if (use_ewald) choose(POLAR_LONG); @@ -73,11 +75,16 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) polar_real(); + time1 = MPI_Wtime(); // compute the reciprocal space part of dipole interactions if (polar_kspace_flag) polar_kspace(); + time2 = MPI_Wtime(); // compute the Ewald self-energy torque and virial terms @@ -130,6 +137,11 @@ void PairAmoeba::polar() virpolar[4] -= vxz; virpolar[5] -= vyz; } + + // accumulate timing information + + time_polar_rspace += time1 - time0; + time_polar_kspace += time2 - time1; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index bb0734cf41..c62aac87e9 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -342,6 +342,11 @@ void PairAmoeba::compute(int eflag, int vflag) if (update->ntimestep <= update->beginstep+1) { time_init = time_hal = time_repulse = time_disp = time_mpole = 0.0; time_induce = time_polar = time_qxfer = 0.0; + + time_mpole_rspace = time_mpole_kspace = 0.0; + time_direct_rspace = time_direct_kspace = 0.0; + time_mutual_rspace = time_mutual_kspace = 0.0; + time_polar_rspace = time_polar_kspace = 0.0; } double time0,time1,time2,time3,time4,time5,time6,time7,time8; @@ -511,6 +516,32 @@ void PairAmoeba::finish() MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + // real-space/kspace breakdown + + MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mpole_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mpole_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_direct_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_direct_kspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_mutual_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_kspace = 
ave/comm->nprocs; + + MPI_Allreduce(&time_polar_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_rspace = ave/comm->nprocs; + + MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_polar_kspace = ave/comm->nprocs; + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -529,6 +560,17 @@ void PairAmoeba::finish() if (!amoeba) utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + + utils::logmesg(lmp," Real-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 2431b99859..8195b1d16f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -88,6 +88,11 @@ class PairAmoeba : public Pair { double time_init,time_hal,time_repulse,time_disp; double time_mpole,time_induce,time_polar,time_qxfer; + double time_mpole_rspace,time_mpole_kspace; + double time_direct_rspace,time_direct_kspace; + double time_mutual_rspace,time_mutual_kspace; + double time_polar_rspace,time_polar_kspace; + // energy/virial components double ehal,erepulse,edisp,epolar,empole,eqxfer; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 3b0268f6b4..582eb7b595 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -857,6 +857,8 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -871,11 +873,16 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual field first + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field @@ -910,6 +917,11 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- From a54f0b684dbda1adf3b1d918302ef5540fb5a24f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 3 Aug 2022 10:56:52 -0500 Subject: [PATCH 089/181] Moved temp variables inside the loop over neighbors --- lib/gpu/lal_amoeba.cu | 34 
++++++++++++++++++++++++---------- lib/gpu/lal_hippo.cu | 15 +++++++++++---- 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d445305bb2..173770f666 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -448,11 +448,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + int m; for (m = 1; m < 6; m++) { bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; @@ -625,6 +627,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, } for (m = 0; m < 6; m++) bn[m] *= felec; + numtyp term1,term2,term3; + numtyp term4,term5,term6; + term1 = ci*ck; term2 = ck*dir - ci*dkr + dik; term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk); @@ -757,8 +762,8 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; const numtyp4 pol1i = polar1[i]; numtyp dix = pol1i.y; // rpole[i][1]; @@ -853,6 +858,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4],bcn[3]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -900,6 +906,8 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + + numtyp fid[3]; fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; @@ -908,6 +916,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7; + numtyp fip[3]; fip[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx; fip[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky; fip[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz; @@ -980,8 +989,8 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1036,6 +1045,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1068,6 +1078,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } numtyp scalek = factor_uscale; + numtyp bcn[3]; bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5; @@ -1081,10 +1092,13 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + + numtyp 
fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + numtyp fip[3]; fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 4f31650f73..3897a9e5ad 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1303,8 +1303,8 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; const numtyp4 pol1i = polar1[i]; numtyp dix = pol1i.y; // rpole[i][1]; @@ -1403,6 +1403,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4],bcn[3]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1429,6 +1430,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp rr5k = bn[2] - ((numtyp)1.0-scalek*dmpk[4])*rr5; numtyp rr7k = bn[3] - ((numtyp)1.0-scalek*dmpk[6])*rr7; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp fid[3]; fid[0] = -xr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkx + (numtyp)2.0*rr5k*qkx; fid[1] = -yr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - @@ -1445,6 +1447,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, rr5k = bn[2] - ((numtyp)1.0-scalek*dmpk[4])*rr5; rr7k = bn[3] - ((numtyp)1.0-scalek*dmpk[6])*rr7; rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3; + numtyp fip[3]; fip[0] = -xr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - rr3k*dkx + (numtyp)2.0*rr5k*qkx; fip[1] = -yr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) - @@ -1524,8 +1527,8 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - numtyp bn[4],bcn[3]; - numtyp fid[3],fip[3]; + //numtyp bn[4],bcn[3]; + //numtyp fid[3],fip[3]; itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1594,6 +1597,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); + numtyp bn[4]; /* numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; @@ -1627,10 +1631,13 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, //if (i==0 && j == 10) // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + + numtyp fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz; fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz; + numtyp fip[3]; fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp; fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp; fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp; From aad4e417f9a9adfeceade97d66e9e36e26ea5aac Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 3 Aug 2022 12:33:48 -0500 Subject: [PATCH 090/181] Moved temp variables inside neighbor loops --- lib/gpu/lal_amoeba.cu | 25 ++++++++++++++++++++++--- lib/gpu/lal_hippo.cu | 28 ++++++++++++++++++++-------- src/AMOEBA/pair_amoeba.cpp | 20 ++++++++++---------- 3 files changed, 52 insertions(+), 21 
deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 173770f666..6f0c7c8433 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -621,7 +621,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, int m; for (m = 1; m < 6; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } @@ -1170,7 +1170,8 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m <= 4; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 3897a9e5ad..5b88ac4955 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -955,10 +955,10 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m < 6; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } for (m = 0; m < 6; m++) bn[m] *= felec; + numtyp term1,term2,term3; + numtyp term4,term5,term6; + term1 = corei*corek; numtyp term1i = corek*vali; numtyp term2i = corek*dir; @@ -1711,13 +1715,15 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, //numtyp4 xi__; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); for (m = 1; m <= 4; m++) { - bfac = (numtyp) (m+m-1); + numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv; } @@ -1875,6 +1883,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp sc3 = (numtyp)1.0; numtyp sc5 = (numtyp)1.0; numtyp sc7 = (numtyp)1.0; + numtyp rc3[3],rc5[3],rc7[3]; for (k = 0; k < 3; k++) { rc3[k] = (numtyp)0.0; rc5[k] = (numtyp)0.0; @@ -2064,6 +2073,9 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcy = (numtyp)-2.0 * depy; numtyp frcz = (numtyp)-2.0 * depz; + numtyp term1,term2,term3; + //numtyp term4,term5,term6,term7; + // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo // tixx and tkxx diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index c62aac87e9..d5270af450 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -561,16 +561,16 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); - utils::logmesg(lmp," Real-space timing breakdown:\n"); - utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); - utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); - utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); - utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); - utils::logmesg(lmp," K-space timing breakdown:\n"); - utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); - utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); - utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); - utils::logmesg(lmp," Polar time: 
{:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + utils::logmesg(lmp," Real-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); + utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); + utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } } From 538aa13693bb5a9d9749e6361eda70ade3ef208c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 10 Aug 2022 16:21:30 -0500 Subject: [PATCH 091/181] Only transfer data that is needed for umutual2b; allowed convolution and kspace term umutual1 to be overridden by the gpu counterparts --- lib/gpu/lal_base_amoeba.cpp | 3 ++- src/AMOEBA/amoeba_convolution.h | 6 +++--- src/AMOEBA/pair_amoeba.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 6f65c8c934..3b67ee31a1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -518,7 +518,8 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double void** fieldp_ptr) { // all the necessary data arrays are already copied from host to device - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); atom->add_extra_data(); _off2_polar = off2_polar; diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 270a501a71..00f2b8ed91 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -47,7 +47,7 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); - private: + protected: int which; // caller name for convolution being performed int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick int nbrick_owned; // owned grid points in brick decomp @@ -71,9 +71,9 @@ class AmoebaConvolution : protected Pointers { void *zero_3d(); void *zero_4d(); FFT_SCALAR *pre_convolution_3d(); - FFT_SCALAR *pre_convolution_4d(); + virtual FFT_SCALAR *pre_convolution_4d(); void *post_convolution_3d(); - void *post_convolution_4d(); + virtual void *post_convolution_4d(); void kspacebbox(double, double *); void procs2grid2d(int, int, int, int &, int &); diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 8195b1d16f..93978ab1f2 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -381,7 +381,7 @@ class PairAmoeba : public Pair { void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); - void umutual1(double **, double **); + virtual void umutual1(double **, double **); virtual void umutual2b(double **, 
double **); void udirect1(double **); virtual void udirect2b(double **, double **); From c13f825648ea36523da2194691cc1be1cd8eca63 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 10 Aug 2022 16:24:20 -0500 Subject: [PATCH 092/181] Added AmoebaConvolutionGPU class: need to replace fft compute with the GPU-accelerated backend --- src/GPU/amoeba_convolution_gpu.cpp | 140 +++++++++++++++++++++++++++++ src/GPU/amoeba_convolution_gpu.h | 34 +++++++ 2 files changed, 174 insertions(+) create mode 100644 src/GPU/amoeba_convolution_gpu.cpp create mode 100644 src/GPU/amoeba_convolution_gpu.h diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp new file mode 100644 index 0000000000..976a115fe1 --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -0,0 +1,140 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "amoeba_convolution_gpu.h" +#include "comm.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "gridcomm.h" + +using namespace LAMMPS_NS; + +#define SCALE 0 + +// External functions from GPU library + +//int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int size, const int flag); + +/* ---------------------------------------------------------------------- + partition an FFT grid across processors + both for a brick and FFT x pencil decomposition + nx,nz,nz = global FFT grid size + order = size of stencil in each dimension that maps atoms to grid + adapted from PPPM::set_grid_local() +------------------------------------------------------------------------- */ + +AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, + int nx_caller, int ny_caller, int nz_caller, + int order_caller, int which_caller) : + AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, + which_caller) +{ +} + +/* ---------------------------------------------------------------------- + perform pre-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() +{ + int ix,iy,iz,n; + + // reverse comm for 4d brick grid + ghosts + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE GridComm"); +#endif + + gc->reverse_comm(GridComm::PAIR,amoeba,2,sizeof(FFT_SCALAR),which, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm"); + debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm"); +#endif + // copy owned 4d brick grid values to FFT grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cfft[n++] = cgrid_brick[iz][iy][ix][0]; + cfft[n++] = cgrid_brick[iz][iy][ix][1]; + } + + // remap FFT grid from brick to x pencil partitioning + // NOTE: could just setup FFT to start from brick decomp and skip remap + + remap->perform(cfft,cfft,remap_buf); + +#if DEBUG_AMOEBA + 
debug_scalar(FFT,"PRE Convo / POST Remap"); + debug_file(FFT,"pre.convo.post.remap"); +#endif + // perform forward FFT + + fft1->compute(cfft,cfft,FFT3d::FORWARD); + + if (SCALE) { + double scale = 1.0/nfft_global; + for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; + } + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"PRE Convo / POST FFT"); + debug_file(CFFT1,"pre.convo.post.fft"); +#endif + return cfft; +} + +/* ---------------------------------------------------------------------- + perform post-convolution grid operations for 4d cgrid_brick array +------------------------------------------------------------------------- */ + +void *AmoebaConvolutionGPU::post_convolution_4d() +{ + int ix,iy,iz,n; + + // perform backward FFT + +#if DEBUG_AMOEBA + debug_scalar(CFFT1,"POST Convo / PRE FFT"); + debug_file(CFFT1,"post.convo.pre.fft"); +#endif + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + +#if DEBUG_AMOEBA + debug_scalar(CFFT2,"POST Convo / POST FFT"); + debug_file(CFFT2,"post.convo.post.fft"); +#endif + // copy 1d complex values into 4d complex grid + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) { + cgrid_brick[iz][iy][ix][0] = cfft[n++]; + cgrid_brick[iz][iy][ix][1] = cfft[n++]; + } + + // forward comm to populate ghost grid values + +#if DEBUG_AMOEBA + debug_scalar(GRIDBRICK_IN,"POST Convo / PRE gridcomm"); + debug_file(GRIDBRICK_IN,"post.convo.pre.gridcomm"); +#endif + gc->forward_comm(GridComm::PAIR,amoeba,2,sizeof(FFT_SCALAR),which, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + return (void *) cgrid_brick; +} diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h new file mode 100644 index 0000000000..33c3a4aac1 --- /dev/null +++ b/src/GPU/amoeba_convolution_gpu.h @@ -0,0 +1,34 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/ Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifndef LMP_AMOEBA_CONVOLUTION_GPU_H +#define LMP_AMOEBA_CONVOLUTION_GPU_H + +#include "amoeba_convolution.h" + + +namespace LAMMPS_NS { + +class AmoebaConvolutionGPU : public AmoebaConvolution { + public: + AmoebaConvolutionGPU(class LAMMPS *, class Pair *, + int, int, int, int, int); + + virtual FFT_SCALAR *pre_convolution_4d(); + virtual void *post_convolution_4d(); + +}; + +} + +#endif From f1112ab6b6225692df5e96ea685c33ca4c039adf Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 15 Aug 2022 14:28:46 -0500 Subject: [PATCH 093/181] Working on the gpu kspace induce term: dipole spreading and/or fft calls --- src/GPU/pair_amoeba_gpu.cpp | 135 +++++++++++++++++++++++++++++++++++- src/GPU/pair_amoeba_gpu.h | 2 + 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 582eb7b595..734ca53bba 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -18,7 +18,7 @@ #include "pair_amoeba_gpu.h" -#include "amoeba_convolution.h" +#include "amoeba_convolution_gpu.h" #include "atom.h" #include "comm.h" #include "domain.h" @@ -46,6 +46,8 @@ enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; +enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; + #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye // External functions from cuda library for atom decomposition @@ -108,6 +110,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; + gpu_umutual1_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -176,6 +179,17 @@ void PairAmoebaGPU::init_style() tq_single = false; else tq_single = true; + + // replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + + } } /* ---------------------------------------------------------------------- @@ -924,6 +938,125 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) time_mutual_kspace += time2 - time1; } +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::umutual1(double **field, double **fieldp) +{ + int i,j,k,m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre = 
(double ****) ic_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) ic_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; + dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + + a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; + } + } + + // increment the field at each multipole site + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + field[i][j] -= dipfield1[i][j]; + fieldp[i][j] -= dipfield2[i][j]; + } + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e419ccd1a1..e0563cd8b5 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -38,6 +38,7 @@ class PairAmoebaGPU : public PairAmoeba { //virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); + virtual void umutual1(double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -54,6 +55,7 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; From 46b8b00a4faf716c1bad0139a37461138c572094 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 15 Aug 2022 15:51:43 -0500 Subject: [PATCH 094/181] Working on fft on the device --- lib/gpu/lal_amoeba_ext.cpp | 4 ++++ lib/gpu/lal_base_amoeba.cpp | 22 ++++++++++++++++++++++ lib/gpu/lal_base_amoeba.h | 5 ++++- src/GPU/amoeba_convolution_gpu.cpp | 5 ++++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 63ed683833..be183b284d 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int 
*host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_compute_fft1d(void** in, void** out, const int mode) { + AMOEBAMF.compute_fft1d(in, out, mode); +} + double amoeba_gpu_bytes() { return AMOEBAMF.host_memory_usage(); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3b67ee31a1..b0d6ecee68 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -568,12 +568,30 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, _tep.update_host(_max_tep_size*4,false); } +// --------------------------------------------------------------------------- +// Return the memory bytes allocated on the host and device +// --------------------------------------------------------------------------- + template double BaseAmoebaT::host_memory_usage_atomic() const { return device->atom.host_memory_usage()+nbor->host_memory_usage()+ 4*sizeof(numtyp)+sizeof(BaseAmoeba); } +// --------------------------------------------------------------------------- +// Compute FFT +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode) +{ + +} + +// --------------------------------------------------------------------------- +// Copy the extra data from host to device +// --------------------------------------------------------------------------- + template void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, double** uind, double** uinp, double* pval) { @@ -645,6 +663,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, } } +// --------------------------------------------------------------------------- +// Compile (load) the kernel strings and set the kernels +// --------------------------------------------------------------------------- + template void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_multipole, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f439e2945f..cf767be96e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -189,7 +189,10 @@ class BaseAmoeba { // _fieldp store both arrays, one after another _fieldp.update_host(_max_fieldp_size*8,false); } - + + /// compute forward/backward FFT on the device + void compute_fft1d(void** in, void** out, const int mode); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index 976a115fe1..ad52df3d4b 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -23,7 +23,8 @@ using namespace LAMMPS_NS; // External functions from GPU library -//int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int size, const int flag); +int amoeba_setup_fft(const int size); +int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode); /* ---------------------------------------------------------------------- partition an FFT grid across processors @@ -39,6 +40,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, which_caller) { + } /* ---------------------------------------------------------------------- @@ -81,6 +83,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); 
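Interface note: the amoeba_compute_fft1d(void**, void**, mode) entry point added at this stage carries no size information, so the GPU library cannot yet create an FFT plan from it, and BaseAmoebaT::compute_fft1d() is still an empty stub. Later commits in this series widen the prototypes to roughly the following shape (numel counts the interleaved re/im doubles of the owned FFT data):

    // revised prototypes adopted later in this series
    void amoeba_setup_fft(const int numel, const int element_type);
    void amoeba_compute_fft1d(void *in, void *out, const int numel, const int mode);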
debug_file(FFT,"pre.convo.post.remap"); #endif + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); From 28dabb9687db9f30232cde5929debba6b8ef0396 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 16 Aug 2022 15:37:49 -0500 Subject: [PATCH 095/181] Cleaned up unused variables in the amoeba kernels, made room for convolution gpu --- lib/gpu/lal_amoeba.cu | 68 +++--------------------------- lib/gpu/lal_amoeba_ext.cpp | 4 ++ lib/gpu/lal_base_amoeba.cpp | 12 +++++- lib/gpu/lal_base_amoeba.h | 5 +++ src/GPU/amoeba_convolution_gpu.cpp | 9 +++- src/GPU/pair_amoeba_gpu.cpp | 3 +- 6 files changed, 34 insertions(+), 67 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6f0c7c8433..1b2900f97f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -515,8 +515,8 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, const numtyp4 pol3j = polar3[j]; numtyp qkyz = pol3j.x; // rpole[j][9]; numtyp qkzz = pol3j.y; // rpole[j][12]; - int jtype = pol3j.z; // amtype[j]; - int jgroup = pol3j.w; // amgroup[j]; + //int jtype = pol3j.z; // amtype[j]; + //int jgroup = pol3j.w; // amgroup[j]; const numtyp4 sp_pol = sp_amoeba[sbmask15(jextra)]; numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)]; @@ -546,18 +546,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp dirx = diy*zr - diz*yr; numtyp diry = diz*xr - dix*zr; numtyp dirz = dix*yr - diy*xr; - numtyp dkrx = dky*zr - dkz*yr; - numtyp dkry = dkz*xr - dkx*zr; - numtyp dkrz = dkx*yr - dky*xr; numtyp dikx = diy*dkz - diz*dky; numtyp diky = diz*dkx - dix*dkz; numtyp dikz = dix*dky - diy*dkx; numtyp qirx = qiz*yr - qiy*zr; numtyp qiry = qix*zr - qiz*xr; numtyp qirz = qiy*xr - qix*yr; - numtyp qkrx = qkz*yr - qky*zr; - numtyp qkry = qkx*zr - qkz*xr; - numtyp qkrz = qky*xr - qkx*yr; numtyp qikx = qky*qiz - qkz*qiy; numtyp qiky = qkz*qix - qkx*qiz; numtyp qikz = qkx*qiy - qky*qix; @@ -570,18 +564,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp qikrx = qizk*yr - qiyk*zr; numtyp qikry = qixk*zr - qizk*xr; numtyp qikrz = qiyk*xr - qixk*yr; - numtyp qkirx = qkzi*yr - qkyi*zr; - numtyp qkiry = qkxi*zr - qkzi*xr; - numtyp qkirz = qkyi*xr - qkxi*yr; numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; - numtyp diqkrx = diqkz*yr - diqky*zr; - numtyp diqkry = diqkx*zr - diqkz*xr; - numtyp diqkrz = diqky*xr - diqkx*yr; numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; @@ -735,7 +723,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -751,8 +739,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -762,21 +748,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, nbor_mem = dev_short_nbor; } - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - - const numtyp4 pol1i = polar1[i]; - 
numtyp dix = pol1i.y; // rpole[i][1]; - numtyp diy = pol1i.z; // rpole[i][2]; - numtyp diz = pol1i.w; // rpole[i][3]; - const numtyp4 pol2i = polar2[i]; - numtyp qixx = pol2i.x; // rpole[i][4]; - numtyp qixy = pol2i.y; // rpole[i][5]; - numtyp qixz = pol2i.z; // rpole[i][6]; - numtyp qiyy = pol2i.w; // rpole[i][8]; const numtyp4 pol3i = polar3[i]; - numtyp qiyz = pol3i.x; // rpole[i][9]; - numtyp qizz = pol3i.y; // rpole[i][12]; int itype = pol3i.z; // amtype[i]; int igroup = pol3i.w; // amgroup[i]; @@ -843,11 +815,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, // intermediates involving moments and separation distance - numtyp dir = dix*xr + diy*yr + diz*zr; - numtyp qix = qixx*xr + qixy*yr + qixz*zr; - numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr; - numtyp qiz = qixz*xr + qiyz*yr + qizz*zr; - numtyp qir = qix*xr + qiy*yr + qiz*zr; numtyp dkr = dkx*xr + dky*yr + dkz*zr; numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr; numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr; @@ -959,7 +926,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_charge(); + //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -977,8 +944,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -989,9 +954,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, } int itype,igroup; - //numtyp bn[4],bcn[3]; - //numtyp fid[3],fip[3]; - itype = polar3[i].z; // amtype[i]; igroup = polar3[i].w; // amgroup[i]; @@ -1008,7 +970,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = jx.x - ix.x; @@ -1171,23 +1132,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, if (ii +void BaseAmoebaT::setup_fft(const int size, const int element_type) +{ + +} + +// --------------------------------------------------------------------------- +// Compute FFT on the device // --------------------------------------------------------------------------- template diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index cf767be96e..2bff362f29 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -190,7 +190,12 @@ class BaseAmoeba { _fieldp.update_host(_max_fieldp_size*8,false); } + /// setup a plan for FFT, where size is the number of elements + + void setup_fft(const int size, const int element_type=0); + /// compute forward/backward FFT on the device + void compute_fft1d(void** in, void** out, const int mode); // -------------------------- DEVICE DATA ------------------------- diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index ad52df3d4b..f514a50620 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,10 +21,12 @@ using namespace LAMMPS_NS; #define SCALE 0 +enum {FORWARD,BACKWARD}; + // External functions from GPU library -int amoeba_setup_fft(const int size); -int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int mode); +int amoeba_setup_fft(const int size, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int mode); /* 
---------------------------------------------------------------------- partition an FFT grid across processors @@ -64,6 +66,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_scalar(GRIDBRICK_IN,"PRE Convo / POST GridComm"); debug_file(GRIDBRICK_IN,"pre.convo.post.gridcomm"); #endif + // copy owned 4d brick grid values to FFT grid n = 0; @@ -88,6 +91,8 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() fft1->compute(cfft,cfft,FFT3d::FORWARD); + //amoeba_compute_fft1d(cfft,cfft,FORWARD); + if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 734ca53bba..29db1b4c1b 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -38,6 +38,7 @@ using namespace LAMMPS_NS; using namespace MathConst; +// same as in amoeba_induce.cpp enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; @@ -46,6 +47,7 @@ enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; +// same as in pair_amoeba.cpp enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye @@ -188,7 +190,6 @@ void PairAmoebaGPU::init_style() ic_kspace = new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); } - } } From 921796a15f012659aa0b0bca57be71b547ec905f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 16 Aug 2022 16:29:38 -0500 Subject: [PATCH 096/181] Cleaned up unused variables in the hippo kernels --- lib/gpu/lal_hippo.cu | 152 ++++++------------------------------------- 1 file changed, 20 insertions(+), 132 deletions(-) diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 5b88ac4955..be8d2c0701 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -467,7 +467,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, } const numtyp4 pol1i = polar1[i]; - numtyp ci = pol1i.x; // rpole[i][0]; + //numtyp ci = pol1i.x; // rpole[i][0]; numtyp dix = pol1i.y; // rpole[i][1]; numtyp diy = pol1i.z; // rpole[i][2]; numtyp diz = pol1i.w; // rpole[i][3]; @@ -501,7 +501,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, if (r2>off2) continue; const numtyp4 pol1j = polar1[j]; - numtyp ck = pol1j.x; // rpole[j][0]; + //numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; numtyp dky = pol1j.z; // rpole[j][2]; numtyp dkz = pol1j.w; // rpole[j][3]; @@ -548,18 +548,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp dirx = diy*zr - diz*yr; numtyp diry = diz*xr - dix*zr; numtyp dirz = dix*yr - diy*xr; - numtyp dkrx = dky*zr - dkz*yr; - numtyp dkry = dkz*xr - dkx*zr; - numtyp dkrz = dkx*yr - dky*xr; numtyp dikx = diy*dkz - diz*dky; numtyp diky = diz*dkx - dix*dkz; numtyp dikz = dix*dky - diy*dkx; numtyp qirx = qiz*yr - qiy*zr; numtyp qiry = qix*zr - qiz*xr; numtyp qirz = qiy*xr - qix*yr; - numtyp qkrx = qkz*yr - qky*zr; - numtyp qkry = qkx*zr - qkz*xr; - numtyp qkrz = qky*xr - qkx*yr; numtyp qikx = qky*qiz - qkz*qiy; numtyp qiky = qkz*qix - qkx*qiz; numtyp qikz = qkx*qiy - qky*qix; @@ -572,18 +566,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, numtyp qikrx = qizk*yr - qiyk*zr; numtyp qikry = qixk*zr - qizk*xr; numtyp qikrz = qiyk*xr - qixk*yr; - numtyp qkirx = qkzi*yr - qkyi*zr; - 
numtyp qkiry = qkxi*zr - qkzi*xr; - numtyp qkirz = qkyi*xr - qkxi*yr; numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz; numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz; numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz; numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz; numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz; numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz; - numtyp diqkrx = diqkz*yr - diqky*zr; - numtyp diqkry = diqkx*zr - diqkz*xr; - numtyp diqkrz = diqky*xr - diqkx*yr; numtyp dkqirx = dkqiz*yr - dkqiy*zr; numtyp dkqiry = dkqix*zr - dkqiz*xr; numtyp dkqirz = dkqiy*xr - dkqix*yr; @@ -768,8 +756,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -955,10 +941,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, if (ii Date: Tue, 23 Aug 2022 15:42:05 -0500 Subject: [PATCH 097/181] First attempt to port the forward FFT in the k-space induce term to the GPU, not working yet --- lib/gpu/Makefile.lammps.standard | 2 +- lib/gpu/lal_amoeba_ext.cpp | 8 +-- lib/gpu/lal_base_amoeba.cpp | 84 +++++++++++++++++++++++++++++- lib/gpu/lal_base_amoeba.h | 21 +++++++- src/AMOEBA/amoeba_convolution.cpp | 38 +++++++++++++- src/AMOEBA/amoeba_convolution.h | 2 + src/AMOEBA/pair_amoeba.cpp | 10 ++++ src/GPU/amoeba_convolution_gpu.cpp | 30 +++++++++-- 8 files changed, 181 insertions(+), 14 deletions(-) diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 304159e571..7d9d836b29 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_setup_fft(const int size, const int element_type) { - AMOEBAMF.setup_fft(size, element_type); +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); } -void amoeba_compute_fft1d(void** in, void** out, const int mode) { - AMOEBAMF.compute_fft1d(in, out, mode); +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + AMOEBAMF.compute_fft1d(in, out, numel, mode); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05a48f9588..2f3c04c7f1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -15,6 +15,7 @@ ***************************************************************************/ #include "lal_base_amoeba.h" + namespace LAMMPS_AL { #define BaseAmoebaT BaseAmoeba @@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); + + //if (cufft_plan_created) cufftDestroy(plan); + if (pair_program) delete pair_program; } @@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_thetai_size = 0; 
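Context for the cuFFT experiment this commit starts (the implementation of compute_fft1d() with cufftPlan1d/cufftExecZ2Z appears further down): the commit message already flags it as not working yet, and one structural gap to close is that a single 1D Z2Z transform over the whole owned pencil buffer is not the same operation as FFT3d's distributed 3D transform, which chains per-line 1D FFTs with inter-process remaps. A device-side equivalent would need either batched 1D plans (cufftPlanMany) combined with the existing remaps, or, on a single rank, one 3D plan. A minimal single-rank illustration, not the code in this series, with d_cfft a placeholder device buffer of nx*ny*nz cufftDoubleComplex values packed x-fastest:

    // illustrative only: full-grid 3D transform on one rank
    cufftHandle plan3d;
    cufftPlan3d(&plan3d, nz, ny, nx, CUFFT_Z2Z);    // slowest-varying dim first
    cufftExecZ2Z(plan3d, d_cfft, d_cfft, CUFFT_FORWARD);
    cudaDeviceSynchronize();
    cufftDestroy(plan3d);

Whatever plan is used, copying the grid host-to-device and back on every call is likely to dominate the saved FFT time, a concern the series itself raises later when it questions whether the FFT is heavy enough to be worth moving to the device.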
+ _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + cufft_plan_created = false; + return success; } @@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() { _tep.clear(); _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Prepare for umutual1: bspline_fill +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid) { + + _bsordermax = bsordermax; + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*_bsordermax*4); + _thetai2.resize(_max_thetai_size*_bsordermax*4); + _thetai3.resize(_max_thetai_size*_bsordermax*4); + } + } + +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { // --------------------------------------------------------------------------- template -void BaseAmoebaT::setup_fft(const int size, const int element_type) +void BaseAmoebaT::setup_fft(const int numel, const int element_type) { } @@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type) // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode) +void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + if (cufft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + cufft_plan_created = true; + } + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + data.update_host(false); + + // copy 
back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2bff362f29..3d0b3ab1a4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -31,6 +31,14 @@ #include "geryon/nvd_texture.h" #endif +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + namespace LAMMPS_AL { template @@ -142,6 +150,11 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + virtual void precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid); + /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -196,7 +209,7 @@ class BaseAmoeba { /// compute forward/backward FFT on the device - void compute_fft1d(void** in, void** out, const int mode); + void compute_fft1d(void* in, void* out, const int numel, const int mode); // -------------------------- DEVICE DATA ------------------------- @@ -230,6 +243,10 @@ class BaseAmoeba { UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; + int _bsordermax; + UCL_Vector _thetai1, _thetai2, _thetai3; + int _max_thetai_size; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -282,6 +299,8 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + cufftHandle plan; + bool cufft_plan_created; }; } diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index 9c8f728f99..4dde750c61 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair, fft1 = new FFT3d(lmp,world,nx,ny,nz, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,&tmp,0); + 1,0,&tmp,0); // 0,0,&tmp,0); fft2 = new FFT3d(lmp,world,nx,ny,nz, @@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / 
POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 00f2b8ed91..8e7f09218a 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); + double time_fft; + protected: int which; // caller name for convolution being performed int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index d5270af450..3b66ebc221 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag) time_direct_rspace = time_direct_kspace = 0.0; time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; + + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; @@ -542,6 +546,10 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + double time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -570,7 +578,9 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + } } diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index f514a50620..f9daa06e65 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,12 +21,13 @@ using namespace LAMMPS_NS; #define SCALE 0 -enum {FORWARD,BACKWARD}; +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT // External functions from GPU library - -int amoeba_setup_fft(const int size, const int element_type); -int amoeba_compute_fft1d(void* in, void* out, const int mode); +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif /* 
---------------------------------------------------------------------- partition an FFT grid across processors @@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() { int ix,iy,iz,n; + double time0,time1; // reverse comm for 4d brick grid + ghosts @@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_file(FFT,"pre.convo.post.remap"); #endif + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif - //amoeba_compute_fft1d(cfft,cfft,FORWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; if (SCALE) { double scale = 1.0/nfft_global; @@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); From b2d6df5bfbe44b7092ab4588539113b94cd34023 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 25 Aug 2022 23:18:13 -0500 Subject: [PATCH 098/181] Re-arranged some for loops in umutual1 to improve cache-friendly memory access; made placeholder for grid_uind on the GPU lib, maybe FFT is not that heavy to be put on the device. --- lib/gpu/lal_amoeba_ext.cpp | 5 +++ src/GPU/pair_amoeba_gpu.cpp | 87 ++++++++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 7d9d836b29..6989a5e6f6 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,11 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, double ***grid) { +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 29db1b4c1b..cd3c01cde3 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,6 +88,10 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); +void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, + double** host_thetai1, double** host_thetai2, + double** host_thetai3, double ***grid); + void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, @@ -869,7 +873,7 @@ void PairAmoebaGPU::udirect2b_cpu() void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { - int i,j; + //int i,j; double term; double time0,time1,time2; @@ -879,13 +883,18 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + +/* + for (int i = 0; i < nall; i++) { + for (int j = 0; j < 
3; j++) { field[i][j] = 0.0; fieldp[i][j] = 0.0; } } - +*/ + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -902,13 +911,24 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { field[i][j] += term*uind[i][j]; fieldp[i][j] += term*uinp[i][j]; } } - +*/ // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -947,7 +967,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) void PairAmoebaGPU::umutual1(double **field, double **fieldp) { - int i,j,k,m,n; + int m,n; int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; double term; double a[3][3]; // indices not flipped vs Fortran @@ -958,7 +978,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert Cartesian dipoles to fractional coordinates - for (j = 0; j < 3; j++) { + for (int j = 0; j < 3; j++) { a[0][j] = nfft1 * recip[0][j]; a[1][j] = nfft2 * recip[1][j]; a[2][j] = nfft3 * recip[2][j]; @@ -966,13 +986,25 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) int nlocal = atom->nlocal; + for (int i = 0; i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; } } - +*/ // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); @@ -1000,9 +1032,9 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // use qfac values stored in udirect1() m = n = 0; - for (k = nzlo; k <= nzhi; k++) { - for (j = nylo; j <= nyhi; j++) { - for (i = nxlo; i <= nxhi; i++) { + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { term = qfac[m++]; gridfft[n] *= term; gridfft[n+1] *= term; @@ -1023,8 +1055,8 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { fopt[i][optlevel][j] = fdip_phi1[i][j]; foptp[i][optlevel][j] = fdip_phi2[i][j]; } @@ -1033,13 +1065,37 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // convert the dipole fields from fractional to Cartesian - for (i = 0; i < 3; i++) { + for (int i = 0; i < 3; i++) { a[0][i] = nfft1 * recip[0][i]; a[1][i] = nfft2 * 
recip[1][i]; a[2][i] = nfft3 * recip[2][i]; } - for (i = 0; i < nlocal; i++) { + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +/* + for (int i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; @@ -1056,6 +1112,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fieldp[i][j] -= dipfield2[i][j]; } } +*/ } /* ---------------------------------------------------------------------- From b160460dccaa440a2475b0bceb164e9181bd80f1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 26 Aug 2022 12:55:46 -0500 Subject: [PATCH 099/181] Added preprocessors to comment out cufft entirely for now --- lib/gpu/Opencl.makefile | 2 +- lib/gpu/lal_base_amoeba.cpp | 14 +++++++++----- lib/gpu/lal_base_amoeba.h | 4 +++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 64a2161f85..d318da15dd 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -6,7 +6,7 @@ UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h # Headers for Host files -HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 2f3c04c7f1..d552a53e5a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -40,8 +40,10 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); - - //if (cufft_plan_created) cufftDestroy(plan); + + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created) cufftDestroy(plan); + #endif if (pair_program) delete pair_program; } @@ -148,7 +150,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - cufft_plan_created = false; + fft_plan_created = false; return success; } @@ -636,10 +638,11 @@ void BaseAmoebaT::setup_fft(const int numel, const int element_type) template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { - if (cufft_plan_created == false) { + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (fft_plan_created == false) { int m = numel/2; cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); - cufft_plan_created = true; + fft_plan_created = true; } // n = number of double complex @@ -676,6 +679,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int 
numel, const int } data.clear(); + #endif } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 3d0b3ab1a4..eb0eff1e8d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -299,8 +299,10 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; - bool cufft_plan_created; + #endif + bool fft_plan_created; }; } From 9e7bbad4d4a9b276005075088b4e405ba5ee37c7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 27 Aug 2022 13:19:52 -0500 Subject: [PATCH 100/181] Working on fphi_uind in the GPU lib --- lib/gpu/Nvidia.makefile | 18 +++++++++---- lib/gpu/lal_amoeba.cpp | 27 ++++++++++++++++++++ lib/gpu/lal_amoeba.h | 1 + lib/gpu/lal_base_amoeba.cpp | 50 +++++++++++++++++++++++++++++-------- lib/gpu/lal_base_amoeba.h | 21 +++++++++++----- lib/gpu/lal_hippo.cpp | 27 ++++++++++++++++++++ lib/gpu/lal_hippo.h | 1 + lib/gpu/lal_pppm.cu | 18 ++++++------- 8 files changed, 132 insertions(+), 31 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 56942d3f3c..768daff53a 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,9 +1,17 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ lal_pre_cuda_hip.h -ALL_H = $(NVD_H) $(wildcard ./lal_*.h) +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h lal_base_amoeba.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(NVD_H) + # Source files SRCS := $(wildcard ./lal_*.cpp) OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) @@ -54,13 +62,13 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(ALL_H) +$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT @@ -77,10 +85,10 @@ $(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp $(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini $(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu - $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu + $(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu -Icudpp_mini $(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu - $(CUDA) -o $@ -c cudpp_mini/scan_app.cu + $(CUDA) -o $@ -c cudpp_mini/scan_app.cu -Icudpp_mini #endif # build libgpu.a diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 924a175cfe..498c55ceba 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -259,6 +259,33 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int 
AmoebaT::fphi_uind() { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); +/* + this->time_pair.start(); + + this->k_fphi_uind.set_size(GX,BX); + this->k_fphi_uind.run(); + this->time_pair.stop(); +*/ + + return GX; +} + // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index d12b79719f..005ea14fb9 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -91,6 +91,7 @@ class Amoeba : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); + int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index d552a53e5a..88a2c87166 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -442,27 +442,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // --------------------------------------------------------------------------- template -void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall, - const int bsordermax, double **host_x, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, void* grid) { +void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** host_igrid) { - _bsordermax = bsordermax; + _bsorder = bsorder; if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*_bsordermax*4); - _thetai2.resize(_max_thetai_size*_bsordermax*4); - _thetai3.resize(_max_thetai_size*_bsordermax*4); + _thetai1.resize(_max_thetai_size*bsorder*4); + _thetai2.resize(_max_thetai_size*bsorder*4); + _thetai3.resize(_max_thetai_size*bsorder*4); + _igrid.resize(_max_thetai_size*4); } } + memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp)); + memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int)); + _thetai1.update_device(inum_full*bsorder*4,true); + _thetai2.update_device(inum_full*bsorder*4,true); 
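Layout note for the staging calls around this point, written as a sketch of the assumptions rather than a statement of the final design: the memcpy-based copies treat the host theta arrays as one contiguous block, which holds only when the library is built with numtyp as double and the source address is the contiguous data block; if host_thetai1 is a memory->create() style double**, that block starts at host_thetai1[0], not at the pointer array itself. On the device side, the natural flat indexing for a kernel such as k_fphi_uind is then

    // atom i, stencil point k (0 <= k < bsorder), one numtyp4 per entry:
    //   .x = B-spline value, .y/.z/.w = its first three derivatives
    int idx = i*bsorder + k;
    numtyp4 tx = thetai1[idx];   // x-direction weights
    numtyp4 ty = thetai2[idx];   // y-direction weights
    numtyp4 tz = thetai3[idx];   // z-direction weights
    // igrid holds, per atom, the anchor grid indices of its bsorder^3 stencil

so the interpolation loop walks iz, iy, ix over bsorder points each, multiplies grid values by products of tz, ty, tx components, and accumulates the induced-dipole potential and its derivatives into fdip_phi1, fdip_phi2 and fdip_sum_phi.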
+ _thetai3.update_device(inum_full*bsorder*4,true); + _igrid.update_device(inum_full*4,true); } // --------------------------------------------------------------------------- @@ -575,6 +584,25 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // _fieldp.update_host(_max_fieldp_size*8,false); } +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid, + double ****host_grid, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi) +{ + // once allocation and transfers + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + + const int red_bllocks = fphi_uind(); +} + + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index eb0eff1e8d..68c3470977 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -150,10 +150,9 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - virtual void precompute_umutual1(const int ago, const int inum_full, const int nall, - const int bsordermax, double **host_x, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, void* grid); + virtual void precompute_induce(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -177,6 +176,12 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); + virtual void compute_fphi_uind(const int inum_full, const int bsorder, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, int** igrid, + double ****host_grid, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi); + /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -243,8 +248,9 @@ class BaseAmoeba { UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; - int _bsordermax; - UCL_Vector _thetai1, _thetai2, _thetai3; + int _bsorder; + UCL_Vector _thetai1, _thetai2, _thetai3; + UCL_Vector _igrid; int _max_thetai_size; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -297,8 +303,11 @@ class BaseAmoeba { virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; + virtual int fphi_uind() = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; #endif diff --git a/lib/gpu/lal_hippo.cpp 
b/lib/gpu/lal_hippo.cpp index 79a8772c3e..d980ae0ed6 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -592,6 +592,33 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int HippoT::fphi_uind() { + int ainum=this->ans->inum(); + if (ainum == 0) + return 0; + + int _nall=this->atom->nall(); + int nbor_pitch=this->nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); +/* + this->time_pair.start(); + + this->k_fphi_uind.set_size(GX,BX); + this->k_fphi_uind.run(); + this->time_pair.stop(); +*/ + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 492712eb85..cece72caac 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -157,6 +157,7 @@ class Hippo : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); + int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu index e17df5b88c..a8e929efe4 100644 --- a/lib/gpu/lal_pppm.cu +++ b/lib/gpu/lal_pppm.cu @@ -273,19 +273,19 @@ __kernel void interp(const __global numtyp4 *restrict x_, int my=mz+fast_mul(ny,npts_x); for (int m=0; m Date: Mon, 29 Aug 2022 00:13:30 -0500 Subject: [PATCH 101/181] Adding fphi_uind kernel, working on the arrays allocation --- lib/gpu/Nvidia.makefile | 2 +- lib/gpu/lal_amoeba.cu | 268 ++++++++++++++++++++++++++++++++++++ lib/gpu/lal_base_amoeba.cpp | 13 +- lib/gpu/lal_base_amoeba.h | 2 +- 4 files changed, 279 insertions(+), 6 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 768daff53a..5f50486e28 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -62,7 +62,7 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h -$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) +$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu $(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@ diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1b2900f97f..1239764108 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1615,6 +1615,274 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_fphi_uind(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int4 
*restrict igrid, + const __global numtyp4 *restrict grid, + __global numtyp4 *restrict fdip_phi1, + __global numtyp4 *restrict fdip_phi2, + __global numtyp4 *restrict fdip_sum_phi, + const int bsorder, const int inum, + const int t_per_atom) +{ + int tid, ii, offset, i, n_stride; + atom_info(t_per_atom,ii,tid,offset); + + if (iiucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*4); + _igrid.resize(_max_thetai_size); } } @@ -471,7 +471,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.update_device(inum_full*bsorder*4,true); _thetai2.update_device(inum_full*bsorder*4,true); _thetai3.update_device(inum_full*bsorder*4,true); - _igrid.update_device(inum_full*4,true); + _igrid.update_device(inum_full,true); } // --------------------------------------------------------------------------- @@ -593,12 +593,17 @@ template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_grid, double **host_fdip_phi1, + double ****host_cgrid_brick, double **host_fdip_phi1, double **host_fdip_phi2, double **host_fdip_sum_phi) { // once allocation and transfers precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + // resize grid if needed, then copy from host to device + // cgrid_brick.alloc()/resize() + // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0]; + // + const int red_bllocks = fphi_uind(); } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 68c3470977..f333bdf9a6 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -250,7 +250,7 @@ class BaseAmoeba { int _bsorder; UCL_Vector _thetai1, _thetai2, _thetai3; - UCL_Vector _igrid; + UCL_Vector _igrid; int _max_thetai_size; // ------------------------ FORCE/ENERGY DATA ----------------------- From aac264f2e27b9c7db7748c627e143a65afda8db1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 30 Aug 2022 23:40:04 -0500 Subject: [PATCH 102/181] Working on the fphi_uind kernel and array allocations --- lib/gpu/Nvidia.makefile | 26 +++- lib/gpu/lal_amoeba.cpp | 13 +- lib/gpu/lal_amoeba.cu | 265 +++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.cpp | 106 +++++++++++---- lib/gpu/lal_base_amoeba.h | 21 ++- 5 files changed, 283 insertions(+), 148 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..c52246b06b 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: 
lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 498c55ceba..38058bab55 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() { const int BX=this->block_size(); int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); -/* - this->time_pair.start(); + this->time_pair.start(); + int ngridyz = this->_ngridy * this->_ngridz; this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(); + this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, + &this->_thetai2, &this->_thetai3, + &this->_igrid, &this->_cgrid_brick, + &this->_fdip_phi1, &this->_fdip_phi2, + &this->_fdip_sum_phi, &this->_bsorder, + &ainum, &ngridyz, &this->_ngridy, + &this->_threads_per_atom); this->time_pair.stop(); -*/ return GX; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 1239764108..984154f16e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict thetai1, - const __global numtyp4 *restrict thetai2, - const __global numtyp4 *restrict thetai3, - const __global int4 *restrict igrid, - const __global numtyp4 *restrict grid, - __global numtyp4 *restrict fdip_phi1, - __global numtyp4 *restrict fdip_phi2, - __global numtyp4 *restrict fdip_sum_phi, + const __global numtyp *restrict thetai1, + const __global numtyp *restrict thetai2, + const __global numtyp *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fdip_phi1, + __global numtyp *restrict fdip_phi2, + __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, + const int nyzgrid, const int nygrid, const int t_per_atom) { int tid, ii, offset, i, n_stride; @@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, // extract the permanent multipole field at each site - tuv100_1 = 0.0; - tuv010_1 = 0.0; - tuv001_1 = 0.0; - tuv200_1 = 0.0; - tuv020_1 = 0.0; - tuv002_1 = 0.0; - tuv110_1 = 0.0; - tuv101_1 = 0.0; - tuv011_1 = 0.0; - tuv100_2 = 0.0; - tuv010_2 = 0.0; - tuv001_2 = 0.0; - tuv200_2 = 0.0; - tuv020_2 = 0.0; - tuv002_2 = 0.0; - tuv110_2 = 0.0; - tuv101_2 = 0.0; - tuv011_2 = 0.0; - tuv000 = 0.0; - tuv001 = 0.0; - tuv010 = 0.0; - tuv100 = 0.0; - tuv200 = 0.0; - tuv020 = 0.0; - tuv002 = 0.0; - tuv110 = 0.0; - tuv101 = 0.0; - tuv011 = 0.0; - tuv300 = 0.0; - tuv030 = 0.0; - tuv003 = 0.0; - tuv210 = 0.0; - tuv201 = 0.0; - tuv120 = 0.0; - tuv021 = 0.0; - tuv102 = 0.0; - tuv012 = 0.0; - tuv111 = 0.0; + tuv100_1 = (numtyp)0.0; + tuv010_1 = 
(numtyp)0.0; + tuv001_1 = (numtyp)0.0; + tuv200_1 = (numtyp)0.0; + tuv020_1 = (numtyp)0.0; + tuv002_1 = (numtyp)0.0; + tuv110_1 = (numtyp)0.0; + tuv101_1 = (numtyp)0.0; + tuv011_1 = (numtyp)0.0; + tuv100_2 = (numtyp)0.0; + tuv010_2 = (numtyp)0.0; + tuv001_2 = (numtyp)0.0; + tuv200_2 = (numtyp)0.0; + tuv020_2 = (numtyp)0.0; + tuv002_2 = (numtyp)0.0; + tuv110_2 = (numtyp)0.0; + tuv101_2 = (numtyp)0.0; + tuv011_2 = (numtyp)0.0; + tuv000 = (numtyp)0.0; + tuv001 = (numtyp)0.0; + tuv010 = (numtyp)0.0; + tuv100 = (numtyp)0.0; + tuv200 = (numtyp)0.0; + tuv020 = (numtyp)0.0; + tuv002 = (numtyp)0.0; + tuv110 = (numtyp)0.0; + tuv101 = (numtyp)0.0; + tuv011 = (numtyp)0.0; + tuv300 = (numtyp)0.0; + tuv030 = (numtyp)0.0; + tuv003 = (numtyp)0.0; + tuv210 = (numtyp)0.0; + tuv201 = (numtyp)0.0; + tuv120 = (numtyp)0.0; + tuv021 = (numtyp)0.0; + tuv102 = (numtyp)0.0; + tuv012 = (numtyp)0.0; + tuv111 = (numtyp)0.0; - k = igrid[i].z - nlpts; + k = igrid[3*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - tu00_1 = 0.0; - tu01_1 = 0.0; - tu10_1 = 0.0; - tu20_1 = 0.0; - tu11_1 = 0.0; - tu02_1 = 0.0; - tu00_2 = 0.0; - tu01_2 = 0.0; - tu10_2 = 0.0; - tu20_2 = 0.0; - tu11_2 = 0.0; - tu02_2 = 0.0; - tu00 = 0.0; - tu10 = 0.0; - tu01 = 0.0; - tu20 = 0.0; - tu11 = 0.0; - tu02 = 0.0; - tu30 = 0.0; - tu21 = 0.0; - tu12 = 0.0; - tu03 = 0.0; + int i3 = m*4*bsorder + 4*kb; + v0 = thetai3[i3]; + v1 = thetai3[i3]+1; + v2 = thetai3[i3+2]; + v3 = thetai3[i3+3]; + tu00_1 = (numtyp)0.0; + tu01_1 = (numtyp)0.0; + tu10_1 = (numtyp)0.0; + tu20_1 = (numtyp)0.0; + tu11_1 = (numtyp)0.0; + tu02_1 = (numtyp)0.0; + tu00_2 = (numtyp)0.0; + tu01_2 = (numtyp)0.0; + tu10_2 = (numtyp)0.0; + tu20_2 = (numtyp)0.0; + tu11_2 = (numtyp)0.0; + tu02_2 = (numtyp)0.0; + tu00 = (numtyp)0.0; + tu10 = (numtyp)0.0; + tu01 = (numtyp)0.0; + tu20 = (numtyp)0.0; + tu11 = (numtyp)0.0; + tu02 = (numtyp)0.0; + tu30 = (numtyp)0.0; + tu21 = (numtyp)0.0; + tu12 = (numtyp)0.0; + tu03 = (numtyp)0.0; - j = igrid[i].y - nlpts; + j = igrid[3*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - t0_1 = 0.0; - t1_1 = 0.0; - t2_1 = 0.0; - t0_2 = 0.0; - t1_2 = 0.0; - t2_2 = 0.0; - t3 = 0.0; + int i2 = m*4*bsorder+4*jb; + u0 = thetai2[i2]; + u1 = thetai2[i2+1]; + u2 = thetai2[i2+2]; + u3 = thetai2[i2+3]; + t0_1 = (numtyp)0.0; + t1_1 = (numtyp)0.0; + t2_1 = (numtyp)0.0; + t0_2 = (numtyp)0.0; + t1_2 = (numtyp)0.0; + t2_2 = (numtyp)0.0; + t3 = (numtyp)0.0; - i = igrid[m].x - nlpts; + int ii = igrid[3*i] - nlpts; for (int ib = 0; ib < bsorder; ib++) { /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; + tq_1 = grid[k][j][ii][0]; + tq_2 = grid[k][j][ii][1]; t0_1 += tq_1*thetai1[m][ib][0]; t1_1 += tq_1*thetai1[m][ib][1]; t2_1 += tq_1*thetai1[m][ib][2]; @@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - i++; + int i1 = m*4*bsorder+4*ib; + numtyp w0 = thetai1[i1]; + numtyp w1 = thetai1[i1+1]; + numtyp w2 = thetai1[i1+2]; + numtyp w3 = thetai1[i1+3]; + int gidx = 2*(k*nyzgrid + j*nygrid + ii); + tq_1 = grid[gidx]; + tq_2 = grid[gidx+1]; + t0_1 += tq_1*w0; + t1_1 += tq_1*w1; + t2_1 += tq_1*w2; + t0_2 += tq_2*w0; + t1_2 += tq_2*w1; + t2_2 += 
tq_2*w2; + t3 += (tq_1+tq_2)*w3; + ii++; } tu00_1 += t0_1*u0; @@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv111 += tu11*v1; k++; } + /* fdip_phi1[m][0] = 0.0; fdip_phi1[m][1] = tuv100_1; @@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, fdip_phi1[m][7] = tuv110_1; fdip_phi1[m][8] = tuv101_1; fdip_phi1[m][9] = tuv011_1; - - fdip_phi2[m][0] = 0.0; - fdip_phi2[m][1] = tuv100_2; - fdip_phi2[m][2] = tuv010_2; - fdip_phi2[m][3] = tuv001_2; - fdip_phi2[m][4] = tuv200_2; - fdip_phi2[m][5] = tuv020_2; - fdip_phi2[m][6] = tuv002_2; - fdip_phi2[m][7] = tuv110_2; - fdip_phi2[m][8] = tuv101_2; - fdip_phi2[m][9] = tuv011_2; - - fdip_sum_phi[m][0] = tuv000; - fdip_sum_phi[m][1] = tuv100; - fdip_sum_phi[m][2] = tuv010; - fdip_sum_phi[m][3] = tuv001; - fdip_sum_phi[m][4] = tuv200; - fdip_sum_phi[m][5] = tuv020; - fdip_sum_phi[m][6] = tuv002; - fdip_sum_phi[m][7] = tuv110; - fdip_sum_phi[m][8] = tuv101; - fdip_sum_phi[m][9] = tuv011; - fdip_sum_phi[m][10] = tuv300; - fdip_sum_phi[m][11] = tuv030; - fdip_sum_phi[m][12] = tuv003; - fdip_sum_phi[m][13] = tuv210; - fdip_sum_phi[m][14] = tuv201; - fdip_sum_phi[m][15] = tuv120; - fdip_sum_phi[m][16] = tuv021; - fdip_sum_phi[m][17] = tuv102; - fdip_sum_phi[m][18] = tuv012; - fdip_sum_phi[m][19] = tuv111; */ + int idx = 10*m; + fdip_phi1[idx+0] = (numtyp)0.0; + fdip_phi1[idx+1] = tuv100_1; + fdip_phi1[idx+2] = tuv010_1; + fdip_phi1[idx+3] = tuv001_1; + fdip_phi1[idx+4] = tuv200_1; + fdip_phi1[idx+5] = tuv020_1; + fdip_phi1[idx+6] = tuv002_1; + fdip_phi1[idx+7] = tuv110_1; + fdip_phi1[idx+8] = tuv101_1; + fdip_phi1[idx+9] = tuv011_1; + + fdip_phi2[idx+0] = (numtyp)0.0; + fdip_phi2[idx+1] = tuv100_2; + fdip_phi2[idx+2] = tuv010_2; + fdip_phi2[idx+3] = tuv001_2; + fdip_phi2[idx+4] = tuv200_2; + fdip_phi2[idx+5] = tuv020_2; + fdip_phi2[idx+6] = tuv002_2; + fdip_phi2[idx+7] = tuv110_2; + fdip_phi2[idx+8] = tuv101_2; + fdip_phi2[idx+9] = tuv011_2; + + idx = 20*m; + fdip_sum_phi[idx+0] = tuv000; + fdip_sum_phi[idx+1] = tuv100; + fdip_sum_phi[idx+2] = tuv010; + fdip_sum_phi[idx+3] = tuv001; + fdip_sum_phi[idx+4] = tuv200; + fdip_sum_phi[idx+5] = tuv020; + fdip_sum_phi[idx+6] = tuv002; + fdip_sum_phi[idx+7] = tuv110; + fdip_sum_phi[idx+8] = tuv101; + fdip_sum_phi[idx+9] = tuv011; + fdip_sum_phi[idx+10] = tuv300; + fdip_sum_phi[idx+11] = tuv030; + fdip_sum_phi[idx+12] = tuv003; + fdip_sum_phi[idx+13] = tuv210; + fdip_sum_phi[idx+14] = tuv201; + fdip_sum_phi[idx+15] = tuv120; + fdip_sum_phi[idx+16] = tuv021; + fdip_sum_phi[idx+17] = tuv102; + fdip_sum_phi[idx+18] = tuv012; + fdip_sum_phi[idx+19] = tuv111; } } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c18b10675b..cd5a9abf81 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() { k_multipole.clear(); k_udirect2b.clear(); k_umutual2b.clear(); + k_fphi_uind.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() { _thetai1.clear(); _thetai2.clear(); _thetai3.clear(); + _igrid.clear(); + _fdip_phi1.clear(); + _fdip_phi2.clear(); + _cgrid_brick.clear(); + _fdip_sum_phi.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double 
**host_thetai3, int** host_igrid) { + double **host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { _bsorder = bsorder; + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size); + _igrid.resize(_max_thetai_size*3); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); } } - memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp)); - memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int)); - _thetai1.update_device(inum_full*bsorder*4,true); - _thetai2.update_device(inum_full*bsorder*4,true); - _thetai3.update_device(inum_full*bsorder*4,true); - _igrid.update_device(inum_full,true); + UCL_H_Vec dview; + + // copy from host to device + + dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai1,dview,false); + dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai2,dview,false); + dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + ucl_copy(_igrid,dview_int,false); + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + } // --------------------------------------------------------------------------- @@ -593,18 +635,35 @@ template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double 
****host_cgrid_brick, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi) + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { - // once allocation and transfers - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid); + // allocation/resize and transfers (do this right after udirect?) - // resize grid if needed, then copy from host to device - // cgrid_brick.alloc()/resize() - // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0]; - // + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, + nxlo_out, nxhi_out); - const int red_bllocks = fphi_uind(); + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx*_ngridy*_ngridz*2; + + UCL_H_Vec dview; + dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); + ucl_copy(_cgrid_brick,dview,false); + + const int red_blocks = fphi_uind(); } @@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); + k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f333bdf9a6..8503e6fba4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,10 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid); + double **host_thetai3, int** igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -179,8 +182,10 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double **host_thetai1, double **host_thetai2, double **host_thetai3, int** igrid, - double ****host_grid, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi); + double *host_grid_brick_start, double **host_fdip_phi1, + double **host_fdip_phi2, double **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -249,9 +254,12 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_Vector _thetai1, _thetai2, _thetai3; - UCL_Vector _igrid; + UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _igrid; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; + int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, 
_nxhi_out; + int _ngridx, _ngridy, _ngridz, _num_grid_points; // ------------------------ FORCE/ENERGY DATA ----------------------- @@ -272,7 +280,7 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,7 +313,6 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind() = 0; virtual int polar_real(const int eflag, const int vflag) = 0; - #if !defined(USE_OPENCL) && !defined(USE_HIP) From cad7e1b364c6b6e2a376b26b31af6386038580e3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 10:18:59 -0500 Subject: [PATCH 103/181] Moved fphi_uind up to BaseAmoeba --- lib/gpu/lal_amoeba.cpp | 32 -------------------------------- lib/gpu/lal_amoeba.cu | 6 +++--- lib/gpu/lal_amoeba.h | 1 - lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++++++++++++---- lib/gpu/lal_base_amoeba.h | 2 +- 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 38058bab55..924a175cfe 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -259,38 +259,6 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { return GX; } -// --------------------------------------------------------------------------- -// Interpolate the potential from the PME grid -// --------------------------------------------------------------------------- -template -int AmoebaT::fphi_uind() { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - int ngridyz = this->_ngridy * this->_ngridz; - this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(&this->atom->x, &this->_thetai1, - &this->_thetai2, &this->_thetai3, - &this->_igrid, &this->_cgrid_brick, - &this->_fdip_phi1, &this->_fdip_phi2, - &this->_fdip_sum_phi, &this->_bsorder, - &ainum, &ngridyz, &this->_ngridy, - &this->_threads_per_atom); - this->time_pair.stop(); - - return GX; -} - // --------------------------------------------------------------------------- // Calculate the polar real-space term, returning tep // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 984154f16e..200191cea2 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1706,7 +1706,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tuv012 = (numtyp)0.0; tuv111 = (numtyp)0.0; - k = igrid[3*i+2] - nlpts; + k = igrid[4*i+2] - nlpts; for (int kb = 0; kb < bsorder; kb++) { /* v0 = thetai3[m][kb][0]; @@ -1742,7 +1742,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, tu12 = (numtyp)0.0; tu03 = (numtyp)0.0; - j = igrid[3*i+1] - nlpts; + j = igrid[4*i+1] - nlpts; for (int jb = 0; jb < bsorder; jb++) { /* u0 = thetai2[m][jb][0]; @@ -1763,7 +1763,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 = (numtyp)0.0; t3 = (numtyp)0.0; - int ii = igrid[3*i] - nlpts; + int ii = igrid[4*i] - 
nlpts; for (int ib = 0; ib < bsorder; ib++) { /* tq_1 = grid[k][j][ii][0]; diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h index 005ea14fb9..d12b79719f 100644 --- a/lib/gpu/lal_amoeba.h +++ b/lib/gpu/lal_amoeba.h @@ -91,7 +91,6 @@ class Amoeba : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); - int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index cd5a9abf81..1269a798db 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = 0; + _max_thetai_size = _max_tep_size; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -466,7 +466,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); @@ -478,7 +478,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.resize(_max_thetai_size*bsorder*4); _thetai2.resize(_max_thetai_size*bsorder*4); _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*3); + _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); @@ -498,7 +498,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device)); + dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); ucl_copy(_igrid,dview_int,false); _nzlo_out = nzlo_out; @@ -666,6 +666,34 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, const int red_blocks = fphi_uind(); } +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_uind() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(ans->inum())/ + (BX/_threads_per_atom))); + + time_pair.start(); + int ngridyz = _ngridy * _ngridz; + k_fphi_uind.set_size(GX,BX); + k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, + &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, + &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, + &_threads_per_atom); + time_pair.stop(); + + return GX; +} // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space diff --git a/lib/gpu/lal_base_amoeba.h 
b/lib/gpu/lal_base_amoeba.h index 8503e6fba4..d3ae3a750b 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -311,7 +311,7 @@ class BaseAmoeba { virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; - virtual int fphi_uind() = 0; + virtual int fphi_uind(); virtual int polar_real(const int eflag, const int vflag) = 0; From 21b7fb2fcfb842b1f332eb737ae83fa5f89d48d2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 2 Sep 2022 14:55:20 -0500 Subject: [PATCH 104/181] Exposing fphi_uind to the gpu pair style, still keeping the part not ready though --- lib/gpu/lal_amoeba_ext.cpp | 12 ++- lib/gpu/lal_base_amoeba.cpp | 200 ++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 14 +-- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 60 ++++++++++- src/GPU/pair_amoeba_gpu.h | 4 + 6 files changed, 193 insertions(+), 99 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 6989a5e6f6..151c38c9c4 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,9 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, double ***grid) { + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) { + AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 1269a798db..bdd43aa59e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); - _max_thetai_size = _max_tep_size; + _max_thetai_size = 0; _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); @@ -441,81 +441,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } -// --------------------------------------------------------------------------- -// Prepare for umutual1: bspline_fill -// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed -// - transfer extra data from host to device -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { - - _bsorder = bsorder; - - // allocate or resize per-atom arrays - // _max_thetai_size, _max_tep_size and 
_max_fieldp_size are essentially _nmax - // will be consolidated once all terms are ready - - if (_max_thetai_size == 0) { - _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); - - } else { - if (inum_full>_max_thetai_size) { - _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); - _igrid.resize(_max_thetai_size*4); - - _fdip_phi1.resize(_max_thetai_size*10); - _fdip_phi2.resize(_max_thetai_size*10); - _fdip_sum_phi.resize(_max_thetai_size*20); - } - } - - UCL_H_Vec dview; - - // copy from host to device - - dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai1,dview,false); - dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai2,dview,false); - dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device)); - ucl_copy(_thetai3,dview,false); - - UCL_H_Vec dview_int; - dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device)); - ucl_copy(_igrid,dview_int,false); - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -626,6 +551,98 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // _fieldp.update_host(_max_fieldp_size*8,false); } +// --------------------------------------------------------------------------- +// Prepare for umutual1() after bspline_fill() is done on host +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed +// host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 +// host_igrid is allocated with nmax by by 4 +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double* grid_brick_start, int nzlo_out, + int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out) { + + _bsorder = bsorder; + + // allocate or resize per-atom arrays + // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax + // will be consolidated once all terms are ready + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + 
_thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); + + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*bsorder*4); + _thetai2.resize(_max_thetai_size*bsorder*4); + _thetai3.resize(_max_thetai_size*bsorder*4); + _igrid.resize(_max_thetai_size*4); + + _fdip_phi1.resize(_max_thetai_size*10); + _fdip_phi2.resize(_max_thetai_size*10); + _fdip_sum_phi.resize(_max_thetai_size*20); + } + } + + UCL_H_Vec dview; + dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + + // pack host data to device + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai1[i][j][0]; + dview[idx+1] = host_thetai1[i][j][1]; + dview[idx+2] = host_thetai1[i][j][2]; + dview[idx+3] = host_thetai1[i][j][3]; + } + ucl_copy(_thetai1,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai2[i][j][0]; + dview[idx+1] = host_thetai2[i][j][1]; + dview[idx+2] = host_thetai2[i][j][2]; + dview[idx+3] = host_thetai2[i][j][3]; + } + ucl_copy(_thetai2,dview,false); + + for (int i = 0; i < inum_full; i++) + for (int j = 0; j < bsorder; j++) { + int idx = i*4*bsorder + 4*j; + dview[idx+0] = host_thetai3[i][j][0]; + dview[idx+1] = host_thetai3[i][j][1]; + dview[idx+2] = host_thetai3[i][j][2]; + dview[idx+3] = host_thetai3[i][j][3]; + } + ucl_copy(_thetai3,dview,false); + + UCL_H_Vec dview_int; + for (int i = 0; i < inum_full; i++) { + int idx = i*4; + dview_int[idx+0] = host_igrid[i][0]; + dview_int[idx+1] = host_igrid[i][1]; + dview_int[idx+2] = host_igrid[i][2]; + } + ucl_copy(_igrid,dview_int,false); +} + // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid @@ -633,19 +650,22 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void** host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) + int nxlo_out, int nxhi_out, bool& first_iteration) { - // allocation/resize and transfers (do this right after udirect?) 
+ // allocation/resize and transfers before the first iteration - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out, - nxlo_out, nxhi_out); - + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -664,6 +684,14 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, ucl_copy(_cgrid_brick,dview,false); const int red_blocks = fphi_uind(); + + _fdip_phi1.update_host(_max_thetai_size*10); + _fdip_phi2.update_host(_max_thetai_size*10); + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fdip_phi1 = _fdip_phi1.host.begin(); + *host_fdip_phi2 = _fdip_phi2.host.begin(); + *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index d3ae3a750b..a001423812 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,8 +151,8 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); virtual void precompute_induce(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, double* grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); @@ -180,12 +180,12 @@ class BaseAmoeba { const double aewald, const double off2_polar, void **fieldp_ptr); virtual void compute_fphi_uind(const int inum_full, const int bsorder, - double **host_thetai1, double **host_thetai2, - double **host_thetai3, int** igrid, - double *host_grid_brick_start, double **host_fdip_phi1, - double **host_fdip_phi2, double **host_fdip_sum_phi, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + int nxlo_out, int nxhi_out, bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 93978ab1f2..17b2d4a1e8 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -407,7 +407,7 @@ class PairAmoeba : public Pair { void grid_mpole(double **, double ***); void fphi_mpole(double ***, double **); void grid_uind(double **, double **, double ****); - void fphi_uind(double ****, double **, double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); void grid_disp(double ***); void kewald(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index cd3c01cde3..bf6db3472d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,9 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp, - double** host_thetai1, double** host_thetai2, - double** host_thetai3, 
double ***grid); +void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -117,6 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; gpu_umutual1_ready = true; + gpu_fphi_uind_ready = false; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -481,6 +486,8 @@ void PairAmoebaGPU::induce() // conjugate gradient iteration of the mutual induced dipoles + first_induce_iteration = true; + while (!done) { iter++; @@ -1115,6 +1122,53 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) */ } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, + igrid, ic_kspace->grid_brick_start, + &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); + + int nlocal = atom->nlocal; + double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + + double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 10 * i; + for (int m = 0; m < 10; m++) + fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + } + + double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 20 * i; + for (int m = 0; m < 20; m++) + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + } +} + /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index e0563cd8b5..fe6ed3368f 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void multipole_real(); virtual void udirect2b(double **, double **); virtual void umutual1(double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -56,9 +57,12 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_multipole_real_ready; bool 
gpu_udirect2b_ready; bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; + bool first_induce_iteration; + void udirect2b_cpu(); template From a0af9627e5e9d2d3849ad74f1fe4d2ef7291123c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 6 Sep 2022 16:19:17 -0500 Subject: [PATCH 105/181] Fixed memory bugs with device array allocations --- lib/gpu/lal_amoeba.cu | 6 ++--- lib/gpu/lal_base_amoeba.cpp | 49 ++++++++++++++++++++----------------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 200191cea2..4a26f7f98d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1714,7 +1714,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = thetai3[m][kb][3]; */ - int i3 = m*4*bsorder + 4*kb; + int i3 = i*4*bsorder + 4*kb; v0 = thetai3[i3]; v1 = thetai3[i3]+1; v2 = thetai3[i3+2]; @@ -1750,7 +1750,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = m*4*bsorder+4*jb; + int i2 = i*4*bsorder+4*jb; u0 = thetai2[i2]; u1 = thetai2[i2+1]; u2 = thetai2[i2+2]; @@ -1776,7 +1776,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = m*4*bsorder+4*ib; + int i1 = i*4*bsorder+4*ib; numtyp w0 = thetai1[i1]; numtyp w1 = thetai1[i1+1]; numtyp w2 = thetai1[i1+2]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index bdd43aa59e..af8d5ca481 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -563,7 +563,7 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* grid_brick_start, int nzlo_out, + double* host_grid_brick_start, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { @@ -580,9 +580,9 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); - _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY); - _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY); + _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); + _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { if (inum_full>_max_thetai_size) { @@ -634,13 +634,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; + dview_int.alloc(inum_full*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid,dview_int,false); + ucl_copy(_igrid, dview_int, false); + + // update the cgrid_brick with data host + + _nzlo_out = nzlo_out; + _nzhi_out = nzhi_out; + _nylo_out = nylo_out; + _nyhi_out = nyhi_out; + _nxlo_out = nxlo_out; + _nxhi_out = nxhi_out; + _ngridz = nzhi_out - nzlo_out + 1; + _ngridy = nyhi_out - nylo_out + 1; + _ngridx = nxhi_out - nxlo_out + 1; + _num_grid_points = _ngridx 
* _ngridy * _ngridz; + + UCL_H_Vec dview_cgrid; + dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); + ucl_copy(_cgrid_brick,dview_cgrid,false); + } // --------------------------------------------------------------------------- @@ -666,23 +686,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, if (first_iteration) first_iteration = false; } - // update the cgrid_brick with data host - - _nzlo_out = nzlo_out; - _nzhi_out = nzhi_out; - _nylo_out = nylo_out; - _nyhi_out = nyhi_out; - _nxlo_out = nxlo_out; - _nxhi_out = nxhi_out; - _ngridz = nzhi_out - nzlo_out + 1; - _ngridy = nyhi_out - nylo_out + 1; - _ngridx = nxhi_out - nxlo_out + 1; - _num_grid_points = _ngridx*_ngridy*_ngridz*2; - - UCL_H_Vec dview; - dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device)); - ucl_copy(_cgrid_brick,dview,false); - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -929,7 +932,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_udirect2b.set_function(*pair_program,kname_udirect2b); k_umutual2b.set_function(*pair_program,kname_umutual2b); k_polar.set_function(*pair_program,kname_polar); - k_fphi_uind.set_function(*pair_program,"kname_fphi_uind"); + k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); k_short_nbor.set_function(*pair_program,kname_short_nbor); k_special15.set_function(*pair_program,kname_special15); pos_tex.get_texture(*pair_program,"pos_tex"); From 4b8caac727c793674abc7714d4f436a4b70d71f6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 12:14:36 -0500 Subject: [PATCH 106/181] Made some progress with fphi_uind in the gpu pair style --- lib/gpu/lal_amoeba.cu | 61 ++++++++++++++++-------- lib/gpu/lal_amoeba_ext.cpp | 6 +-- lib/gpu/lal_base_amoeba.cpp | 93 ++++++++++++++++++++++--------------- lib/gpu/lal_base_amoeba.h | 9 ++-- src/GPU/pair_amoeba_gpu.cpp | 14 ++++-- 5 files changed, 114 insertions(+), 69 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 4a26f7f98d..b0013f0b9b 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -//#include +#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -1630,14 +1630,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, - const int nyzgrid, const int nygrid, - const int t_per_atom) + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + const int ngridxy, const int ngridx) { - int tid, ii, offset, i, n_stride; - atom_info(t_per_atom,ii,tid,offset); + //int tid, ii, offset, i, n_stride; + //atom_info(t_per_atom,ii,tid,offset); + + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (ii void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double* host_grid_brick_start, double**** host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out) { _bsorder = bsorder; @@ -599,7 
+600,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } UCL_H_Vec dview; - dview.alloc(inum_full*bsorder*4,*(this->ucl_device)); + dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); // pack host data to device @@ -634,7 +635,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, ucl_copy(_thetai3,dview,false); UCL_H_Vec dview_int; - dview_int.alloc(inum_full*4, *(this->ucl_device)); + dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; dview_int[idx+0] = host_igrid[i][0]; @@ -643,6 +644,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, } ucl_copy(_igrid, dview_int, false); + + +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + double *host_grid_brick_start, double ****host_grid_brick, + void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, + int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, + int nxlo_out, int nxhi_out, bool& first_iteration) +{ + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, + igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + if (first_iteration) first_iteration = false; + } + // update the cgrid_brick with data host _nzlo_out = nzlo_out; @@ -656,36 +684,27 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec dview_cgrid; - dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); + UCL_H_Vec hview_cgrid; + hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int n = 0; + for (int iz = nzlo_out; iz <= nzhi_out; iz++) + for (int iy = nylo_out; iy <= nyhi_out; iy++) + for (int ix = nxlo_out; ix <= nxhi_out; ix++) { +/* + if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) { + printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]); + } + if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n); +*/ + hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; + hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; + n += 2; + } + //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,dview_cgrid,false); + ucl_copy(_cgrid_brick,hview_cgrid,false); -} -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double 
***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void** host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - if (first_iteration) first_iteration = false; - } - const int red_blocks = fphi_uind(); _fdip_phi1.update_host(_max_thetai_size*10); @@ -711,16 +730,16 @@ int BaseAmoebaT::fphi_uind() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); - int GX=static_cast(ceil(static_cast(ans->inum())/ - (BX/_threads_per_atom))); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); - int ngridyz = _ngridy * _ngridz; + int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, - &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy, - &_threads_per_atom); + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, + &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx); time_pair.stop(); return GX; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a001423812..c2c2a2d93d 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,8 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double* grid_brick_start, int nzlo_out, - int nzhi_out, int nylo_out, int nyhi_out, + double *host_grid_brick_start, double ****host_grid_brick, + int nzlo_out, int nzhi_out, + int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out); /// Compute multipole real-space with device neighboring @@ -182,8 +183,8 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, + double *host_grid_brick_start, double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index bf6db3472d..936cf8afbc 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -91,7 +91,7 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, void **host_fdip_phi1, + double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, int nxlo_out, int nxhi_out, bool& first_iteration); @@ -121,7 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; 
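
A side note on the launch change in lal_base_amoeba.cpp above: with the threads-per-atom subdivision removed, each work-item handles exactly one atom and the grid size reduces to a single ceiling division. A minimal standalone sketch of that host-side sizing (the inum and BX values are made up for illustration; this is not LAMMPS code):

#include <cmath>
#include <cstdio>

int main() {
  const int inum = 1000;   // number of local atoms (made-up value)
  const int BX   = 128;    // kernel block size (made-up value)

  // one thread per atom: the grid size is a plain ceiling division,
  // with no (BX/_threads_per_atom) subdivision any more
  const int GX = static_cast<int>(std::ceil(static_cast<double>(inum) / BX));

  // the kernel-side counterpart is the bounds check on
  //   ii = THREAD_ID_X + BLOCK_ID_X*BLOCK_SIZE_X
  std::printf("blocks = %d, threads/block = %d, total threads = %d\n",
              GX, BX, GX * BX);
  return 0;
}

Inside k_fphi_uind the matching guard is the if (ii < inum) check shown in the kernel above.
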
gpu_umutual1_ready = true; - gpu_fphi_uind_ready = false; + gpu_fphi_uind_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host @@ -1139,7 +1139,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, - igrid, ic_kspace->grid_brick_start, + igrid, ic_kspace->grid_brick_start, grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1150,8 +1150,10 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int idx = 10 * i; - for (int m = 0; m < 10; m++) - fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + } + if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]); } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; @@ -1159,6 +1161,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 10 * i; for (int m = 0; m < 10; m++) fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]); } double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; @@ -1166,6 +1169,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 20 * i; for (int m = 0; m < 20; m++) fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]); } } From b72b71837ebc6de746c694c3c132f5fde5c36c80 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 13:34:57 -0500 Subject: [PATCH 107/181] Moved first_induce_iteration in induce() to the right place --- lib/gpu/lal_base_amoeba.cpp | 6 +++--- src/GPU/pair_amoeba_gpu.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 21a97a0852..ceb9b97cbc 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -656,7 +656,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, template void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, + double ***host_thetai3, int** host_igrid, double *host_grid_brick_start, double ****host_grid_brick, void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, @@ -666,9 +666,9 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, if (first_iteration) { precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, + host_igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, nylo_out, nyhi_out, nxlo_out, nxhi_out); - if (first_iteration) first_iteration = false; + first_iteration = false; } // update the cgrid_brick with data host diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 936cf8afbc..8d799a82eb 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -290,6 +290,8 @@ void PairAmoebaGPU::induce() 
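
The hunk below moves the per-step reset of first_induce_iteration from just before the conjugate-gradient loop to the top of induce(). The pattern it supports, raise the flag once per solve and let the first compute call perform the allocations and transfers before lowering it, looks roughly like this standalone sketch (the Solver type and its names are illustrative only, not LAMMPS code):

#include <cstdio>

struct Solver {
  bool first_iteration = true;

  // stand-in for the one-time device allocation and host-to-device transfers
  void setup_once() { std::printf("allocate buffers, copy static data\n"); }

  void compute(int iter) {
    if (first_iteration) {       // mirrors the check in compute_fphi_uind()
      setup_once();
      first_iteration = false;
    }
    std::printf("iteration %d\n", iter);
  }

  void solve(int niter) {
    first_iteration = true;      // raised once per solve, before the loop
    for (int i = 0; i < niter; ++i) compute(i);
  }
};

int main() { Solver s; s.solve(3); return 0; }
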
int debug = 1; + first_induce_iteration = true; + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -486,8 +488,6 @@ void PairAmoebaGPU::induce() // conjugate gradient iteration of the mutual induced dipoles - first_induce_iteration = true; - while (!done) { iter++; From c58343b2e29c0c514563169018f322cccf73715d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 9 Sep 2022 13:50:41 -0500 Subject: [PATCH 108/181] Cleaned up debugging stuffs, need more refactoring and add to hippo --- lib/gpu/lal_amoeba.cu | 7 +------ lib/gpu/lal_base_amoeba.cpp | 11 +---------- src/GPU/pair_amoeba_gpu.cpp | 3 --- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index b0013f0b9b..591a896bc8 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include +//#include #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -1789,11 +1789,6 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int gidx = 2*(k*ngridxy + j*ngridx + i); tq_1 = grid[gidx]; tq_2 = grid[gidx+1]; -/* - if (ii == 0 && jb == 0 && kb == 0) - printf("ii = 0: igrid %d %d %d; grid %f %f; k = %d j = %d; i = %d; origin = %f %f; gidx = %d\n", - igrid[4*ii+0], igrid[4*ii+1], igrid[4*ii+2], tq_1, tq_2, k, j, i, grid[0], grid[1], gidx); -*/ t0_1 += tq_1*w0; t1_1 += tq_1*w1; t2_1 += tq_1*w2; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index ceb9b97cbc..05b830d773 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -642,10 +642,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); - - - + ucl_copy(_igrid, dview_int, false); } // --------------------------------------------------------------------------- @@ -690,12 +687,6 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, for (int iz = nzlo_out; iz <= nzhi_out; iz++) for (int iy = nylo_out; iy <= nyhi_out; iy++) for (int ix = nxlo_out; ix <= nxhi_out; ix++) { -/* - if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) { - printf("origin = %d %d %d: grid = %f %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]); - } - if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n); -*/ hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 8d799a82eb..8618317704 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1153,7 +1153,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, for (int m = 0; m < 10; m++) { fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; } - if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]); } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; @@ -1161,7 +1160,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 10 * i; for (int m = 0; m < 10; m++) fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; - if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]); } double 
*_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; @@ -1169,7 +1167,6 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, int idx = 20 * i; for (int m = 0; m < 20; m++) fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; - if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]); } } From 363b6c51d0355ce9e6e470ac8462263c811de33d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 10 Sep 2022 02:31:39 -0500 Subject: [PATCH 109/181] Used local arrays and re-arranged for coalesced global memory writes --- lib/gpu/lal_amoeba.cu | 125 +++++++++++++++++------------------- src/GPU/pair_amoeba_gpu.cpp | 21 +++--- 2 files changed, 72 insertions(+), 74 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 591a896bc8..fb66158d06 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1637,12 +1637,14 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, { //int tid, ii, offset, i, n_stride; //atom_info(t_per_atom,ii,tid,offset); + int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (iinlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 10 * i; + int n = i; for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m]; + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; } } double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 10 * i; - for (int m = 0; m < 10; m++) - fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m]; + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; for (int i = 0; i < nlocal; i++) { - int idx = 20 * i; - for (int m = 0; m < 20; m++) - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m]; + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } From 5e59c95be403b26e59e8b914e2b43fe31441dd9f Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 10 Sep 2022 02:45:06 -0500 Subject: [PATCH 110/181] Moved temp variables inside loops --- lib/gpu/lal_amoeba.cu | 187 ++++++++++++++++++------------------------ 1 file changed, 82 insertions(+), 105 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index fb66158d06..105f18cfa8 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1643,75 +1643,52 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (ii Date: Sun, 11 Sep 2022 18:58:34 -0500 Subject: [PATCH 111/181] Re-arranged memory allocation for cgrid_brick, some issues need to be fixed --- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_amoeba_ext.cpp | 15 ++++--- lib/gpu/lal_base_amoeba.cpp | 87 +++++++++++++++++++++---------------- lib/gpu/lal_base_amoeba.h | 19 ++++---- 4 files changed, 69 insertions(+), 54 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 105f18cfa8..d67fa4f869 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1646,7 +1646,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, //numtyp4 ix; fetch4(ix,ii,pos_tex); //x_[i]; acctyp fdip_buf[32]; - int j,k,m; + int j,k; int nlpts = (bsorder-1) / 2; // extract the permanent multipole field at each site diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 95b7237e46..f91b76f688 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -164,15 +164,16 @@ void 
amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ***host_thetai3, int** igrid, double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) { AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick_start, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); + host_thetai3, igrid, host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05b830d773..dfd5565f1e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -186,8 +186,10 @@ void BaseAmoebaT::clear_atomic() { _igrid.clear(); _fdip_phi1.clear(); _fdip_phi2.clear(); - _cgrid_brick.clear(); _fdip_sum_phi.clear(); + _cgrid_brick.clear(); + hview_cgrid.clear(); + dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -563,10 +565,9 @@ template void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, - double* host_grid_brick_start, double**** host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out) { + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { _bsorder = bsorder; @@ -642,34 +643,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, dview_int[idx+1] = host_igrid[i][1]; dview_int[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); -} + ucl_copy(_igrid, dview_int, false); -// --------------------------------------------------------------------------- -// fphi_uind = induced potential from grid -// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid -// --------------------------------------------------------------------------- - -template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double *host_grid_brick_start, double ****host_grid_brick, - void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration) -{ - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, - host_igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - first_iteration = false; - } - - // update the cgrid_brick with data host - _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; _nylo_out 
= nylo_out; @@ -681,8 +656,47 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - UCL_H_Vec hview_cgrid; - hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE); + int numel = _num_grid_points*2; + if (_cgrid_brick.cols() == 0) { + hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE); + _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY); + } else if (numel > _cgrid_brick.cols()) { + hview_cgrid.resize(numel); + _cgrid_brick.resize(numel); + } +} + +// --------------------------------------------------------------------------- +// fphi_uind = induced potential from grid +// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** host_igrid, + double ****host_grid_brick, + void** host_fdip_phi1, + void **host_fdip_phi2, + void **host_fdip_sum_phi, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration) +{ + // TODO: find out why this alloc helps makes the cgrid_brick ucl_copy work + UCL_H_Vec hview; + hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + // allocation/resize and transfers before the first iteration + + if (first_iteration) { + precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, host_igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); + first_iteration = false; + } + int n = 0; for (int iz = nzlo_out; iz <= nzhi_out; iz++) for (int iy = nylo_out; iy <= nyhi_out; iy++) @@ -691,10 +705,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device)); - _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY); - ucl_copy(_cgrid_brick,hview_cgrid,false); - + ucl_copy(_cgrid_brick, hview_cgrid, false); const int red_blocks = fphi_uind(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index c2c2a2d93d..a4a7a8d1a7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -153,10 +153,9 @@ class BaseAmoeba { virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, - int nzlo_out, int nzhi_out, - int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, @@ -183,10 +182,12 @@ class BaseAmoeba { virtual void compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, + double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int 
nxhi_out, bool& first_iteration); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -255,7 +256,9 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3, _cgrid_brick; + UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_H_Vec hview_cgrid; + UCL_D_Vec _cgrid_brick; UCL_D_Vec _igrid; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; From 17e54c939019466f8b78c4cfb0372499925bb3c3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 11 Sep 2022 19:00:40 -0500 Subject: [PATCH 112/181] Updated the GPU API in the gpu pair style --- src/GPU/pair_amoeba_gpu.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4c77417ff0..3790ca4231 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -91,10 +91,12 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1, + double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out, - int nxlo_out, int nxhi_out, bool& first_iteration); + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out, + bool& first_iteration); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -1138,13 +1140,14 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3, - igrid, ic_kspace->grid_brick_start, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, grid, + &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; From 31047b4a316413b15d856a3a32256aefa77195e9 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 13 Sep 2022 12:53:48 -0500 Subject: [PATCH 113/181] Removed mem alloc in precompute_induce, used buffer for packing, and switched to using ucl_vector --- lib/gpu/lal_base_amoeba.cpp | 79 +++++++++++++++++++------------------ lib/gpu/lal_base_amoeba.h | 8 ++-- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index dfd5565f1e..5989ba889d 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -188,7 +188,8 @@ void BaseAmoebaT::clear_atomic() { _fdip_phi2.clear(); 
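
For orientation, the brick-shaped convolution grid that the hunks below copy to the device is flattened with z as the outermost index and the two values stored per grid point interleaved. A standalone sketch of that packing, and of recovering a cell offset from (iz,iy,ix); all bounds and values here are made up for illustration and this is not LAMMPS code:

#include <cstdio>
#include <vector>

int main() {
  // made-up ghost-extended bounds standing in for n{x,y,z}{lo,hi}_out
  const int nzlo = -2, nzhi = 5, nylo = -2, nyhi = 5, nxlo = -2, nxhi = 5;
  const int ngz = nzhi - nzlo + 1;
  const int ngy = nyhi - nylo + 1;
  const int ngx = nxhi - nxlo + 1;

  std::vector<double> flat(2 * ngz * ngy * ngx, 0.0);

  // pack z-outer / x-inner, two values per grid point, in the same order
  // the code below copies host_grid_brick[iz][iy][ix][0..1]
  int n = 0;
  for (int iz = nzlo; iz <= nzhi; iz++)
    for (int iy = nylo; iy <= nyhi; iy++)
      for (int ix = nxlo; ix <= nxhi; ix++) {
        flat[n]     = 1.0;   // stand-in for host_grid_brick[iz][iy][ix][0]
        flat[n + 1] = 0.0;   // stand-in for host_grid_brick[iz][iy][ix][1]
        n += 2;
      }

  // recovering the offset of a cell: shift by the lower ghost bounds first,
  // roughly how the kernel turns igrid values into brick-relative indices
  auto offset = [&](int iz, int iy, int ix) {
    return 2 * (((iz - nzlo) * ngy + (iy - nylo)) * ngx + (ix - nxlo));
  };

  std::printf("flat size = %zu, cell (0,0,0) at offset %d\n",
              flat.size(), offset(0, 0, 0));
  return 0;
}
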
_fdip_sum_phi.clear(); _cgrid_brick.clear(); - hview_cgrid.clear(); + + hview.clear(); dev_nspecial15.clear(); dev_special15.clear(); @@ -586,6 +587,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); + hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); + } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); @@ -597,53 +600,53 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); _fdip_sum_phi.resize(_max_thetai_size*20); + + hview.resize(_max_thetai_size*bsorder*4); } } - UCL_H_Vec dview; - dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); - // pack host data to device for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai1[i][j][0]; - dview[idx+1] = host_thetai1[i][j][1]; - dview[idx+2] = host_thetai1[i][j][2]; - dview[idx+3] = host_thetai1[i][j][3]; + hview[idx+0] = host_thetai1[i][j][0]; + hview[idx+1] = host_thetai1[i][j][1]; + hview[idx+2] = host_thetai1[i][j][2]; + hview[idx+3] = host_thetai1[i][j][3]; } - ucl_copy(_thetai1,dview,false); + ucl_copy(_thetai1,hview,false); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai2[i][j][0]; - dview[idx+1] = host_thetai2[i][j][1]; - dview[idx+2] = host_thetai2[i][j][2]; - dview[idx+3] = host_thetai2[i][j][3]; + hview[idx+0] = host_thetai2[i][j][0]; + hview[idx+1] = host_thetai2[i][j][1]; + hview[idx+2] = host_thetai2[i][j][2]; + hview[idx+3] = host_thetai2[i][j][3]; } - ucl_copy(_thetai2,dview,false); + ucl_copy(_thetai2,hview,false); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { int idx = i*4*bsorder + 4*j; - dview[idx+0] = host_thetai3[i][j][0]; - dview[idx+1] = host_thetai3[i][j][1]; - dview[idx+2] = host_thetai3[i][j][2]; - dview[idx+3] = host_thetai3[i][j][3]; + hview[idx+0] = host_thetai3[i][j][0]; + hview[idx+1] = host_thetai3[i][j][1]; + hview[idx+2] = host_thetai3[i][j][2]; + hview[idx+3] = host_thetai3[i][j][3]; } - ucl_copy(_thetai3,dview,false); + ucl_copy(_thetai3,hview,false); - UCL_H_Vec dview_int; - dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); + //UCL_H_Vec dview_int; + //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; - dview_int[idx+0] = host_igrid[i][0]; - dview_int[idx+1] = host_igrid[i][1]; - dview_int[idx+2] = host_igrid[i][2]; + _igrid[idx+0] = host_igrid[i][0]; + _igrid[idx+1] = host_igrid[i][1]; + _igrid[idx+2] = host_igrid[i][2]; } - ucl_copy(_igrid, dview_int, false); + //ucl_copy(_igrid, dview_int, false); + _igrid.update_device(false); _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; @@ -658,10 +661,8 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, int numel = _num_grid_points*2; if (_cgrid_brick.cols() == 0) { - hview_cgrid.alloc(numel, *(this->ucl_device), UCL_READ_WRITE); - _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_ONLY); + _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > _cgrid_brick.cols()) { - hview_cgrid.resize(numel); _cgrid_brick.resize(numel); } } @@ -684,10 +685,6 @@ void BaseAmoebaT::compute_fphi_uind(const int 
inum_full, const int bsorder, const int nxlo_out, const int nxhi_out, bool& first_iteration) { - // TODO: find out why this alloc helps makes the cgrid_brick ucl_copy work - UCL_H_Vec hview; - hview.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - // allocation/resize and transfers before the first iteration if (first_iteration) { @@ -697,15 +694,19 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, first_iteration = false; } + // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly + UCL_H_Vec hdummy; + hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + int n = 0; - for (int iz = nzlo_out; iz <= nzhi_out; iz++) - for (int iy = nylo_out; iy <= nyhi_out; iy++) - for (int ix = nxlo_out; ix <= nxhi_out; ix++) { - hview_cgrid[n] = host_grid_brick[iz][iy][ix][0]; - hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1]; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { + _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; + _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - ucl_copy(_cgrid_brick, hview_cgrid, false); + _cgrid_brick.update_device(false); const int red_blocks = fphi_uind(); @@ -762,7 +763,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, int** firstneigh = nullptr; cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); *tep_ptr=_tep.host.begin(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a4a7a8d1a7..760d0e3005 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -257,14 +257,16 @@ class BaseAmoeba { int _bsorder; UCL_D_Vec _thetai1, _thetai2, _thetai3; - UCL_H_Vec hview_cgrid; - UCL_D_Vec _cgrid_brick; - UCL_D_Vec _igrid; + UCL_Vector _igrid; + UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; + /// buffer + UCL_H_Vec hview; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; From 9c4d3db5584635066410d13ce89d9c3edd4bdb3d Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 13 Sep 2022 16:48:39 -0500 Subject: [PATCH 114/181] Cleaned up and converted arrays to ucl_vector of numtyp4 --- lib/gpu/lal_amoeba.cu | 39 +++++++++++---------- lib/gpu/lal_base_amoeba.cpp | 68 +++++++++++++++++-------------------- lib/gpu/lal_base_amoeba.h | 5 +-- 3 files changed, 54 insertions(+), 58 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d67fa4f869..53a9f6aa3e 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1621,9 +1621,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp *restrict thetai1, - const __global numtyp *restrict thetai2, - const __global numtyp *restrict thetai3, + const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp *restrict grid, __global numtyp *restrict fdip_phi1, @@ -1698,11 +1698,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, v2 = thetai3[m][kb][2]; v3 = 
thetai3[m][kb][3]; */ - int i3 = ii*4*bsorder + 4*kb; - numtyp v0 = thetai3[i3]; - numtyp v1 = thetai3[i3+1]; - numtyp v2 = thetai3[i3+2]; - numtyp v3 = thetai3[i3+3]; + int i3 = ii*bsorder + kb; + numtyp4 tha3 = thetai3[i3]; + numtyp v0 = tha3.x; + numtyp v1 = tha3.y; + numtyp v2 = tha3.z; + numtyp v3 = tha3.w; numtyp tu00_1 = (numtyp)0.0; numtyp tu01_1 = (numtyp)0.0; numtyp tu10_1 = (numtyp)0.0; @@ -1734,11 +1735,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, u2 = thetai2[m][jb][2]; u3 = thetai2[m][jb][3]; */ - int i2 = ii*4*bsorder+4*jb; - numtyp u0 = thetai2[i2]; - numtyp u1 = thetai2[i2+1]; - numtyp u2 = thetai2[i2+2]; - numtyp u3 = thetai2[i2+3]; + int i2 = ii*bsorder+jb; + numtyp4 tha2 = thetai2[i2]; + numtyp u0 = tha2.x; + numtyp u1 = tha2.y; + numtyp u2 = tha2.z; + numtyp u3 = tha2.w; numtyp t0_1 = (numtyp)0.0; numtyp t1_1 = (numtyp)0.0; numtyp t2_1 = (numtyp)0.0; @@ -1760,11 +1762,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, t2_2 += tq_2*thetai1[m][ib][2]; t3 += (tq_1+tq_2)*thetai1[m][ib][3]; */ - int i1 = ii*4*bsorder+4*ib; - numtyp w0 = thetai1[i1]; - numtyp w1 = thetai1[i1+1]; - numtyp w2 = thetai1[i1+2]; - numtyp w3 = thetai1[i1+3]; + int i1 = ii*bsorder+ib; + numtyp4 tha1 = thetai1[i1]; + numtyp w0 = tha1.x; + numtyp w1 = tha1.y; + numtyp w2 = tha1.z; + numtyp w3 = tha1.w; int gidx = 2*(k*ngridxy + j*ngridx + i); numtyp tq_1 = grid[gidx]; numtyp tq_2 = grid[gidx+1]; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5989ba889d..3e14159d5a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -189,8 +189,6 @@ void BaseAmoebaT::clear_atomic() { _fdip_sum_phi.clear(); _cgrid_brick.clear(); - hview.clear(); - dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -578,30 +576,25 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); - - hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device)); - } else { if (inum_full>_max_thetai_size) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); - _thetai1.resize(_max_thetai_size*bsorder*4); - _thetai2.resize(_max_thetai_size*bsorder*4); - _thetai3.resize(_max_thetai_size*bsorder*4); + _thetai1.resize(_max_thetai_size*bsorder); + _thetai2.resize(_max_thetai_size*bsorder); + _thetai3.resize(_max_thetai_size*bsorder); _igrid.resize(_max_thetai_size*4); _fdip_phi1.resize(_max_thetai_size*10); _fdip_phi2.resize(_max_thetai_size*10); _fdip_sum_phi.resize(_max_thetai_size*20); - - hview.resize(_max_thetai_size*bsorder*4); } } @@ -609,44 +602,47 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, for (int i = 0; i < inum_full; 
i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai1[i][j][0]; - hview[idx+1] = host_thetai1[i][j][1]; - hview[idx+2] = host_thetai1[i][j][2]; - hview[idx+3] = host_thetai1[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai1[i][j][0]; + v.y = host_thetai1[i][j][1]; + v.z = host_thetai1[i][j][2]; + v.w = host_thetai1[i][j][3]; + _thetai1[idx] = v; } - ucl_copy(_thetai1,hview,false); + _thetai1.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai2[i][j][0]; - hview[idx+1] = host_thetai2[i][j][1]; - hview[idx+2] = host_thetai2[i][j][2]; - hview[idx+3] = host_thetai2[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai2[i][j][0]; + v.y = host_thetai2[i][j][1]; + v.z = host_thetai2[i][j][2]; + v.w = host_thetai2[i][j][3]; + _thetai2[idx] = v; } - ucl_copy(_thetai2,hview,false); + _thetai2.update_device(true); for (int i = 0; i < inum_full; i++) for (int j = 0; j < bsorder; j++) { - int idx = i*4*bsorder + 4*j; - hview[idx+0] = host_thetai3[i][j][0]; - hview[idx+1] = host_thetai3[i][j][1]; - hview[idx+2] = host_thetai3[i][j][2]; - hview[idx+3] = host_thetai3[i][j][3]; + int idx = i*bsorder + j; + numtyp4 v; + v.x = host_thetai3[i][j][0]; + v.y = host_thetai3[i][j][1]; + v.z = host_thetai3[i][j][2]; + v.w = host_thetai3[i][j][3]; + _thetai3[idx] = v; } - ucl_copy(_thetai3,hview,false); + _thetai3.update_device(true); - //UCL_H_Vec dview_int; - //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device)); for (int i = 0; i < inum_full; i++) { int idx = i*4; _igrid[idx+0] = host_igrid[i][0]; _igrid[idx+1] = host_igrid[i][1]; _igrid[idx+2] = host_igrid[i][2]; } - //ucl_copy(_igrid, dview_int, false); - _igrid.update_device(false); + _igrid.update_device(true); _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; @@ -694,7 +690,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, first_iteration = false; } - // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly + // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 760d0e3005..802b6962b7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -256,7 +256,7 @@ class BaseAmoeba { int _nmax, _max_tep_size, _max_fieldp_size; int _bsorder; - UCL_D_Vec _thetai1, _thetai2, _thetai3; + UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; @@ -264,9 +264,6 @@ class BaseAmoeba { int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; - /// buffer - UCL_H_Vec hview; - // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; From cd3a00c2c44086c7c2531e5f61c2985789e5658c Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 14 Sep 2022 15:28:44 -0500 Subject: [PATCH 115/181] Added timing breakdown for fphi_uind --- lib/gpu/lal_hippo.cpp | 27 --------------------------- lib/gpu/lal_hippo.h | 1 - src/AMOEBA/pair_amoeba.cpp | 7 ++++++- src/AMOEBA/pair_amoeba.h | 1 + src/GPU/pair_amoeba_gpu.cpp | 7 +++++++ 5 files changed, 14 insertions(+), 29 deletions(-) diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d980ae0ed6..79a8772c3e 100644 --- 
a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -592,33 +592,6 @@ int HippoT::umutual2b(const int eflag, const int vflag) { return GX; } -// --------------------------------------------------------------------------- -// Interpolate the potential from the PME grid -// --------------------------------------------------------------------------- -template -int HippoT::fphi_uind() { - int ainum=this->ans->inum(); - if (ainum == 0) - return 0; - - int _nall=this->atom->nall(); - int nbor_pitch=this->nbor->nbor_pitch(); - - // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); -/* - this->time_pair.start(); - - this->k_fphi_uind.set_size(GX,BX); - this->k_fphi_uind.run(); - this->time_pair.stop(); -*/ - - return GX; -} - // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index cece72caac..492712eb85 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -157,7 +157,6 @@ class Hippo : public BaseAmoeba { int multipole_real(const int eflag, const int vflag); int udirect2b(const int eflag, const int vflag); int umutual2b(const int eflag, const int vflag); - int fphi_uind(); int polar_real(const int eflag, const int vflag); }; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 3b66ebc221..9890904e42 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -348,6 +348,7 @@ void PairAmoeba::compute(int eflag, int vflag) time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; + time_fphi_uind = 0.0; if (ic_kspace) { ic_kspace->time_fft = 0.0; } @@ -546,6 +547,9 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_fphi_uind = ave/comm->nprocs; + double time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; @@ -578,7 +582,8 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); - utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); + utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 17b2d4a1e8..a95065d851 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -92,6 +92,7 @@ class PairAmoeba : public Pair { double time_direct_rspace,time_direct_kspace; double time_mutual_rspace,time_mutual_kspace; double time_polar_rspace,time_polar_kspace; + double time_fphi_uind; // energy/virial components diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 
3790ca4231..b85db8ea47 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1058,9 +1058,16 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { From 0359d405802f295b933a93da9515a73eb9c17897 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 14 Sep 2022 16:11:43 -0500 Subject: [PATCH 116/181] Added interpolation timing for the cpu version --- src/AMOEBA/amoeba_induce.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 01491a8708..90a52ca402 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -940,9 +940,16 @@ void PairAmoeba::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { From 880f20c2858c0fb4c855c0fcf84a3aaaa86af533 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 15 Sep 2022 15:29:14 -0500 Subject: [PATCH 117/181] Cleaned up kernels --- lib/gpu/lal_amoeba.cpp | 5 +- lib/gpu/lal_amoeba.cu | 47 +- lib/gpu/lal_base_amoeba.cpp | 8 +- src/AMOEBA/pair_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 951 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_amoeba_gpu.h | 1 + 6 files changed, 989 insertions(+), 25 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 924a175cfe..48316e9b6e 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -117,7 +117,10 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, _allocated=true; this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes() - + sp_amoeba.row_bytes() + this->_tep.row_bytes(); + + sp_amoeba.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); return 0; } diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 53a9f6aa3e..d391279f5d 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -849,7 +849,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, if (damp != (numtyp)0.0) { numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] if (pgamma != (numtyp)0.0) { - damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + //damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * ucl_sqrt(tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp) ; scale3 = (numtyp)1.0 - expdamp ; @@ -858,7 +860,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); scale3 = (numtyp)1.0 - expdamp; @@ 
-1314,7 +1318,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] if (damp != (numtyp)0.0) { numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); + numtyp tmp = r*ucl_recip(damp); + damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { numtyp expdamp = ucl_exp(-damp); sc3 = (numtyp)1.0 - expdamp; @@ -1620,8 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -__kernel void k_fphi_uind(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict thetai1, +__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, @@ -1630,10 +1635,9 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, const int bsorder, const int inum, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - const int ngridxy, const int ngridx) + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) { //int tid, ii, offset, i, n_stride; //atom_info(t_per_atom,ii,tid,offset); @@ -1643,11 +1647,16 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_, int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; if (iix, &_thetai1, &_thetai2, &_thetai3, - &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2, - &_fdip_sum_phi, &_bsorder, &ainum, - &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, - &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx); + k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); return GX; diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index a95065d851..24ce6fcfbc 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -374,7 +374,7 @@ class PairAmoeba : public Pair { void polar(); void polar_energy(); virtual void polar_real(); - void polar_kspace(); + virtual void polar_kspace(); void damppole(double, int, double, double, double *, double *, double *); virtual void induce(); diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index b85db8ea47..e62c8185be 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1278,6 +1278,957 @@ void PairAmoebaGPU::polar_real() } } +/* ---------------------------------------------------------------------- + polar_kspace = KSpace portion of induced dipole polarization + adapted from Tinker eprecip1() routine + same as PairAmoeba, except that fphi_uind() is reimplemented here + ------------------------------------------------------------------------- */ + +void PairAmoebaGPU::polar_kspace() +{ + int i,j,k,m,n; + int nhalf1,nhalf2,nhalf3; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + int j1,j2,j3; + int ix,iy,iz; + double eterm,felec; + double r1,r2,r3; + double h1,h2,h3; + double f1,f2,f3; + double xix,yix,zix; + double xiy,yiy,ziy; + double xiz,yiz,ziz; + double vxx,vyy,vzz; + double vxy,vxz,vyz; + double volterm,denom; + double hsq,expterm; + double term,pterm; + double 
vterm,struc2; + double tep[3]; + double fix[3],fiy[3],fiz[3]; + double cphid[4],cphip[4]; + double a[3][3]; // indices not flipped vs Fortran + + // indices into the electrostatic field array + // decremented by 1 versus Fortran + + int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; + int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; + int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // owned atoms + + double **x = atom->x; + double **f = atom->f; + int nlocal = atom->nlocal; + + double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; + pterm = pow((MY_PI/aewald),2.0); + volterm = MY_PI * volbox; + + // initialize variables required for the scalar summation + + felec = electric / am_dielectric; + + // remove scalar sum virial from prior multipole FFT + // can only do this if multipoles were computed with same aeewald = apewald + // else need to re-compute it via new long-range solve + + nfft1 = p_kspace->nx; + nfft2 = p_kspace->ny; + nfft3 = p_kspace->nz; + bsorder = p_kspace->order; + + nhalf1 = (nfft1+1) / 2; + nhalf2 = (nfft2+1) / 2; + nhalf3 = (nfft3+1) / 2; + + nxlo = p_kspace->nxlo_fft; + nxhi = p_kspace->nxhi_fft; + nylo = p_kspace->nylo_fft; + nyhi = p_kspace->nyhi_fft; + nzlo = p_kspace->nzlo_fft; + nzhi = p_kspace->nzhi_fft; + + // use previous results or compute new qfac and convolution + + if (aewald == aeewald) { + vxx = -vmsave[0]; + vyy = -vmsave[1]; + vzz = -vmsave[2]; + vxy = -vmsave[3]; + vxz = -vmsave[4]; + vyz = -vmsave[5]; + + } else { + + // setup stencil size and B-spline coefficients + + moduli(); + bspline_fill(); + + // convert Cartesian multipoles to fractional coordinates + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // zero virial accumulation variables + + vxx = vyy = vzz = vxy = vxz = vyz = 0.0; + + // perform convolution on K-space points I own + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? 
k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + if (hsq) expterm = exp(term) / denom; + struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx -= h1*h1*vterm - eterm; + vyy -= h2*h2*vterm - eterm; + vzz -= h3*h3*vterm - eterm; + vxy -= h1*h2*vterm; + vxz -= h1*h3*vterm; + vyz -= h2*h3*vterm; + } + + expterm = qfac[m++]; + gridfft[n] *= expterm; + gridfft[n+1] *= expterm; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 3d grid in brick decomp w/ ghost values + + double ***gridpost = (double ***) p_kspace->post_convolution(); + + // get potential + + fphi_mpole(gridpost,fphi); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + + // convert field from fractional to Cartesian + + fphi_to_cphi(fphi,cphi); + } + + // convert Cartesian induced dipoles to fractional coordinates + + for (i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } + + // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre2 = (double ****) pc_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre2); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = pc_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + // use qfac values from above or from induce() + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) pc_kspace->post_convolution(); + + // get potential + + fphi_uind(gridpost,fphid,fphip,fphidp); + + // TODO: port the remaining loops to the GPU + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fphid[i][j]; + fphip[i][j] = felec * fphip[i][j]; + } + for (j = 0; j < 20; j++) + fphidp[i][j] = felec * fphidp[i][j]; + } + + // increment the dipole polarization gradient contributions + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; + f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; + f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; + if (poltyp == MUTUAL) { + f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3] + fuinp[i][k]*fphid[i][j3]; + } + } + for (k = 0; k < 10; k++) { + f1 += fmp[i][k]*fphidp[i][deriv1[k]]; + f2 += 
fmp[i][k]*fphidp[i][deriv2[k]]; + f3 += fmp[i][k]*fphidp[i][deriv3[k]]; + } + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + } + + // set the potential to be the induced dipole average + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + fphidp[i][j] *= 0.5; + } + + fphi_to_cphi(fphidp,cphidp); + + // get the fractional to Cartesian transformation matrix + + //frac_to_cart(); + + // increment the dipole polarization virial contributions + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= cmp[i][1]*cphidp[i][1] + + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); + vyy -= cmp[i][2]*cphidp[i][2] + + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); + vzz -= cmp[i][3]*cphidp[i][3] + + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); + vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][2]); + vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + + (uind[i][1]+uinp[i][1])*cphi[i][3]); + vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + + (uind[i][0]+uinp[i][0])*cphi[i][3]); + + vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + + cmp[i][8]*cphidp[i][8]; + vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + + cmp[i][9]*cphidp[i][9]; + vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + + cmp[i][9]*cphidp[i][9]; + vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] + + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); + vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); + vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); + + if (poltyp == MUTUAL) { + vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); + vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); + vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); + vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); + vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); + vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); + } + } + + + // resolve site torques then increment forces and virial + + for (i = 0; i < nlocal; i++) { + tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; + tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; + tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; + + torque2force(i,tep,fix,fiy,fiz,f); + + iz = 
zaxis2local[i]; + ix = xaxis2local[i]; + iy = yaxis2local[i]; + + xiz = x[iz][0] - x[i][0]; + yiz = x[iz][1] - x[i][1]; + ziz = x[iz][2] - x[i][2]; + xix = x[ix][0] - x[i][0]; + yix = x[ix][1] - x[i][1]; + zix = x[ix][2] - x[i][2]; + xiy = x[iy][0] - x[i][0]; + yiy = x[iy][1] - x[i][1]; + ziy = x[iy][2] - x[i][2]; + + vxx += xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; + vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; + vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; + vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); + vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); + vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); + } + + // account for dipole response terms in the OPT method + + if (poltyp == OPT) { + for (i = 0; i < nlocal; i++) { + for (k = 0; k < optorder; k++) { + for (j = 1; j < 10; j++) { + fphid[i][j] = felec * fopt[i][k][j]; + fphip[i][j] = felec * foptp[i][k][j]; + } + + for (m = 0; m < optorder-k; m++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + + a[2][j]*uopt[i][m][2]; + fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + + a[2][j]*uoptp[i][m][2]; + } + + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + + for (j = 0; j < 3; j++) { + j1 = deriv1[j+1]; + j2 = deriv2[j+1]; + j3 = deriv3[j+1]; + f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; + f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; + f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + + f[i][0] -= copm[k+m+1]*h1; + f[i][1] -= copm[k+m+1]*h2; + f[i][2] -= copm[k+m+1]*h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (j1 = 1; j1 < 4; j1++) { + cphid[j] += ftc[j][j1]*fphid[i][j1]; + cphip[j] += ftc[j][j1]*fphip[i][j1]; + } + } + + vxx -= 0.5*copm[k+m+1] * + (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); + vyy -= 0.5*copm[k+m+1] * + (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); + vzz -= 0.5*copm[k+m+1] * + (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); + vxy -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ + cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); + vyz -= 0.25*copm[k+m+1] * + (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); + vxz -= 0.25*copm[k+m+1] * + (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ + cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); + } + } + } + } + + // account for dipole response terms in the TCG method + + /* + if (poltyp == TCG) { + + for (m = 0; m < tcgnab; m++) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*uad[i][m][0] + a[1][j]*uad[i][m][1] + + a[2][j]*uad[i][m][2]; + fuinp[i][j] = a[0][j]*ubp[i][m][0] + a[1][j]*ubp[i][m][1] + + a[2][j]*ubp[i][m][2]; + } + } + + grid_uind(fuind,fuinp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + term = qfac[k][j][i]; + qgrid[k][j][i][0] *= term; + qgrid[k][j][i][1] *= term; + } + } + } + + efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); + fphi_uind(fphid,fphip,fphidp); + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] *= felec; + 
fphip[i][j] *= felec; + } + } + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= 0.5*(cphid[1]*ubp[i][m][0] + cphip[1]*uad[i][m][0]); + vyy -= 0.5*(cphid[2]*ubp[i][m][1] + cphip[2]*uad[i][m][1]); + vzz -= 0.5*(cphid[3]*ubp[i][m][2] + cphip[3]*uad[i][m][2]); + + vxy -= 0.25*(cphid[1]*ubp[i][m][1] + cphip[1]*uad[i][m][1] + + cphid[2]*ubp[i][m][0] + cphip[2]*uad[i][m][0]); + vyz -= 0.25*(cphid[1]*ubp[i][m][2] + cphip[1]*uad[i][m][2] + + cphid[3]*ubp[i][m][0] + cphip[3]*uad[i][m][0]); + vxz -= 0.25*(cphid[2]*ubp[i][m][2] + cphip[2]*uad[i][m][2] + + cphid[3]*ubp[i][m][1] + cphip[3]*uad[i][m][1]); + } + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[0][j]*ubd[i][m][0] + a[1][j]*ubd[i][m][1] + + a[2][j]*ubd[i][m][2]; + fuinp[i][j] = a[0][j]*uap[i][m][0] + a[1][j]*uap[i][m][1] + + a[2][j]*uap[i][m][2]; + } + } + + grid_uind(fuind,fuinp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + term = qfac[k][j][i]; + qgrid[k][j][i][0] *= term; + qgrid[k][j][i][1] *= term; + } + } + } + + efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); + fphi_uind(fphid,fphip,fphidp); + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 10; j++) { + fphid[i][j] *= felec; + fphip[i][j] *= felec; + } + } + + for (i = 0; i < nlocal; i++) { + f1 = 0.0; + f2 = 0.0; + f3 = 0.0; + for (k = 0; k < 3; k++) { + j1 = deriv1[k+1]; + j2 = deriv2[k+1]; + j3 = deriv3[k+1]; + f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; + f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; + f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; + } + + f1 *= 0.5 * nfft1; + f2 *= 0.5 * nfft2; + f3 *= 0.5 * nfft3; + h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; // matvec + h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; + h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; + f[i][0] -= h1; + f[i][1] -= h2; + f[i][2] -= h3; + + for (j = 1; j < 4; j++) { + cphid[j] = 0.0; + cphip[j] = 0.0; + for (k = 1; k < 4; k++) { + cphid[j] += ftc[j][k]*fphid[i][k]; + cphip[j] += ftc[j][k]*fphip[i][k]; + } + } + + vxx -= 0.5*(cphid[1]*uap[i][m][0] + cphip[1]*ubd[i][m][0]); + vyy -= 0.5*(cphid[2]*uap[i][m][1] + cphip[2]*ubd[i][m][1]); + vzz -= 0.5*(cphid[3]*uap[i][m][2] + cphip[3]*ubd[i][m][2]); + vxy -= 0.25*(cphid[1]*uap[i][m][1] + cphip[1]*ubd[i][m][1] + + cphid[2]*uap[i][m][0] + cphip[2]*ubd[i][m][0]); + vxz -= 0.25*(cphid[1]*uap[i][m][2] + cphip[1]*ubd[i][m][2] + + cphid[3]*uap[i][m][0] + cphip[3]*ubd[i][m][0]); + vyz -= 0.25*(cphid[2]*uap[i][m][2] + cphip[2]*ubd[i][m][2] + + cphid[3]*uap[i][m][1] + cphip[3]*ubd[i][m][1]); + } + } + } + */ + + // assign permanent and induced multipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; 
j < 4; j++) + cmp[i][j] += uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); + + // assign induced dipoles to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors + + double *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + + // assign only the induced dipoles to the PME grid + // and perform the 3-D FFT forward transformation + // NOTE: why is there no inverse FFT in this section? + + if (poltyp == DIRECT || poltyp == TCG) { + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uinp[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + // zeroed by zero() + + double ***gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft = p_kspace->pre_convolution(); + + // gridfft1 = copy of first FFT + + int nfft_owned = p_kspace->nfft_owned; + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + + // assign ??? 
to the PME grid + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uind[i][j-1]; + } + + // convert Cartesian multipoles to fractional multipoles + + cmp_to_fmp(cmp,fmp); + + // gridpre = my portion of 3d grid in brick decomp w/ ghost values + + gridpre = (double ***) p_kspace->zero(); + + // map atoms to grid + + grid_mpole(fmp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector + + double *gridfft2 = p_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + m = n = 0; + for (k = nzlo; k <= nzhi; k++) { + for (j = nylo; j <= nyhi; j++) { + for (i = nxlo; i <= nxhi; i++) { + r1 = (i >= nhalf1) ? i-nfft1 : i; + r2 = (j >= nhalf2) ? j-nfft2 : j; + r3 = (k >= nhalf3) ? k-nfft3 : k; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; + expterm = exp(term) / denom; + struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + vxx += h1*h1*vterm - eterm; + vyy += h2*h2*vterm - eterm; + vzz += h3*h3*vterm - eterm; + vxy += h1*h2*vterm; + vyz += h2*h3*vterm; + vxz += h1*h3*vterm; + } + n += 2; + } + } + } + } + + // add back missing terms for the TCG polarization method; + // first do the term for "UAD" dotted with "UBP" + + /* + if (poltyp == TCG) { + + for (m = 0; m < tcgnab; m++) { + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = ubp[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + qgrip[k][j][i][0] = qgrid[k][j][i][0]; + qgrip[k][j][i][1] = qgrid[k][j][i][1]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = uad[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + // make the scalar summation over reciprocal lattice + // NOTE: this loop has to be distributed for parallel + // NOTE: why does this one include m = 0 ? + + for (m = 1; m < ntot; m++) { + k1 = m % nfft1; + k2 = (m % nff) / nfft1; + k3 = m/nff; + r1 = (k1 >= nf1) ? k1-nfft1 : k1; + r2 = (k2 >= nf2) ? k2-nfft2 : k2; + r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; + expterm = exp(term) / denom; + struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + + qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + virpolar[0] -= h1*h1*vterm - eterm; + virpolar[1] -= h2*h2*vterm - eterm; + virpolar[2] -= h3*h3*vterm - eterm; + virpolar[3] -= h1*h2*vterm; + virpolar[4] -= h1*h3*vterm; + virpolar[5] -= h2*h3*vterm; + } + } + + // now do the TCG terms with "UBD" dotted with "UAP" + + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 10; j++) + cmp[i][j] = 0.0; + for (j = 1; j < 4; j++) + cmp[i][j] = uap[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + for (k = 0; k < nfft3; k++) { + for (j = 0; j < nfft2; j++) { + for (i = 0; i < nfft1; i++) { + qgrip[k][j][i][0] = qgrid[k][j][i][0]; + qgrip[k][j][i][1] = qgrid[k][j][i][1]; + } + } + } + + for (i = 0; i < nlocal; i++) { + for (j = 1; j < 4; j++) + cmp[i][j] = ubd[i][m][j-1]; + } + + cmp_to_fmp(cmp,fmp); + grid_mpole(fmp); + efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); + + // make the scalar summation over reciprocal lattice + // NOTE: this loop has to be distributed for parallel + // NOTE: why does this one include m = 0 ? + + for (m = 1; m < ntot; m++) { + k1 = m % nfft1; + k2 = (m % nff) / nfft1; + k3 = m/nff; + r1 = (k1 >= nf1) ? k1-nfft1 : k1; + r2 = (k2 >= nf2) ? k2-nfft2 : k2; + r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; + h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; + h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; + h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; + hsq = h1*h1 + h2*h2 + h3*h3; + term = -pterm * hsq; + expterm = 0.0; + if (term > -50.0 && hsq != 0.0) { + denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; + expterm = exp(term) / denom; + struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + + qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; + eterm = 0.5 * felec * expterm * struc2; + vterm = (2.0/hsq) * (1.0-term) * eterm; + virpolar[0] -= h1*h1*vterm - eterm; + virpolar[1] -= h2*h2*vterm - eterm; + virpolar[2] -= h3*h3*vterm - eterm; + virpolar[3] -= h1*h2*vterm; + virpolar[4] -= h1*h3*vterm; + virpolar[5] -= h2*h3*vterm; + } + } + } + } + */ + + // increment the total internal virial tensor components + + if (vflag_global) { + virpolar[0] -= vxx; + virpolar[1] -= vyy; + virpolar[2] -= vzz; + virpolar[3] -= vxy; + virpolar[4] -= vxz; + virpolar[5] -= vyz; + } +} + /* ---------------------------------------------------------------------- compute atom forces from torques ------------------------------------------------------------------------- */ diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index fe6ed3368f..77b594177b 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -43,6 +43,7 @@ class PairAmoebaGPU : public PairAmoeba { virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); + virtual void polar_kspace(); private: int gpu_mode; From 62ecf98cda4d1bd970b7bf1b5e8f1a09c388d009 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 16 Sep 2022 14:47:16 -0500 Subject: [PATCH 118/181] Enabled fphi_uind in hippo/gpu, really need to refactor hippo and amoeba in the GPU lib to remove kernel duplicates --- lib/gpu/Nvidia.makefile | 26 +-- lib/gpu/lal_amoeba.cpp | 3 +- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_base_amoeba.cpp | 24 +-- lib/gpu/lal_base_amoeba.h | 10 +- lib/gpu/lal_hippo.cpp | 3 +- lib/gpu/lal_hippo.cu | 301 ++++++++++++++++++++++++++++++++++ lib/gpu/lal_hippo_ext.cpp | 14 ++ src/GPU/pair_amoeba_gpu.cpp | 2 +- src/GPU/pair_hippo_gpu.cpp | 311 ++++++++++++++++++++++++++++++++---- src/GPU/pair_hippo_gpu.h | 6 + 11 files changed, 626 insertions(+), 76 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index c52246b06b..5f50486e28 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) - $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) - $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) - $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) - $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) - $(CUDR) -o $@ -c $< -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) - $(CUDR) -o $@ -c $< -I$(OBJ_DIR) - -$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) 
+$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 48316e9b6e..02870ea861 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,amoeba, "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_short_nbor", "k_amoeba_special15"); + "k_amoeba_fphi_uind", "k_amoeba_short_nbor", + "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index d391279f5d..66926721cb 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1, +__kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3ee0517dfb..eac704fbfc 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, const char *k_name_udirect2b, const char *k_name_umutual2b, const char *k_name_polar, + const char *k_name_fphi_uind, const char *k_name_short_nbor, const char* k_name_special15) { screen=_screen; @@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, k_name_udirect2b, k_name_umutual2b,k_name_polar, - k_name_short_nbor, k_name_special15); + k_name_fphi_uind, k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_short_nbor, const char* kname_special15) { if (_compiled) @@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); std::string oclstring = device->compile_string()+" -DEVFLAG=1"; - pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen); - k_multipole.set_function(*pair_program,kname_multipole); - k_udirect2b.set_function(*pair_program,kname_udirect2b); - k_umutual2b.set_function(*pair_program,kname_umutual2b); - k_polar.set_function(*pair_program,kname_polar); - k_fphi_uind.set_function(*pair_program,"k_fphi_uind"); - k_short_nbor.set_function(*pair_program,kname_short_nbor); - k_special15.set_function(*pair_program,kname_special15); - pos_tex.get_texture(*pair_program,"pos_tex"); - q_tex.get_texture(*pair_program,"q_tex"); + k_multipole.set_function(*pair_program, kname_multipole); + k_udirect2b.set_function(*pair_program, kname_udirect2b); + k_umutual2b.set_function(*pair_program, kname_umutual2b); + k_polar.set_function(*pair_program, kname_polar); + 
k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_short_nbor.set_function(*pair_program, kname_short_nbor); + k_special15.set_function(*pair_program, kname_special15); + pos_tex.get_texture(*pair_program, "pos_tex"); + q_tex.get_texture(*pair_program, "q_tex"); _compiled=true; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 802b6962b7..5aeb729993 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -62,9 +62,10 @@ class BaseAmoeba { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, - const char *kname_multipole, - const char *kname_udirect2b, const char *kname_umutual2b, - const char *kname_polar, const char *kname_short_nbor, const char* kname_special15); + const char *kname_multipole, const char *kname_udirect2b, + const char *kname_umutual2b, const char *kname_polar, + const char *kname_fphi_uind, const char *kname_short_nbor, + const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -309,7 +310,8 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_short_nbor, const char* kname_special15); + const char *kname_fphi_uind, const char *kname_short_nbor, + const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 79a8772c3e..9917ab91a2 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,hippo, "k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_umutual2b", "k_hippo_polar", - "k_hippo_short_nbor", "k_hippo_special15"); + "k_hippo_fphi_uind", "k_hippo_short_nbor", + "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index be8d2c0701..dde8f9bfd5 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fdip_phi1, + __global numtyp *restrict fdip_phi2, + __global numtyp *restrict fdip_sum_phi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + //int tid, ii, offset, i, n_stride; + //atom_info(t_per_atom,ii,tid,offset); + + + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (iimodify, lmp->error); } @@ -198,6 +213,16 @@ void PairHippoGPU::init_style() tq_single = false; else tq_single = true; + + 
// replace with the gpu counterpart + + if (gpu_umutual1_ready) { + if (use_ewald && ic_kspace) { + delete ic_kspace; + ic_kspace = + new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); + } + } } /* ---------------------------------------------------------------------- @@ -392,6 +417,8 @@ void PairHippoGPU::induce() int debug = 1; + first_induce_iteration = true; + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -403,8 +430,6 @@ void PairHippoGPU::induce() // owned atoms - double **x = atom->x; - double **f = atom->f; int nlocal = atom->nlocal; // zero out the induced dipoles at each site @@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) int i,j; double term; + double time0,time1,time2; + // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; - for (i = 0; i < nall; i++) { - for (j = 0; j < 3; j++) { + memset(&field[0][0], 0, 3*nall *sizeof(double)); + memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); + +/* + for (int i = 0; i < nall; i++) { + for (int j = 0; j < 3; j++) { field[i][j] = 0.0; fieldp[i][j] = 0.0; } } - +*/ + // get the real space portion of the mutual field first + MPI_Barrier(world); + time0 = MPI_Wtime(); + if (polar_rspace_flag) umutual2b(field,fieldp); + time1 = MPI_Wtime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); + time2 = MPI_Wtime(); // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; + for (int i = 0; i < nlocal; i++) { + field[i][0] += term*uind[i][0]; + field[i][1] += term*uind[i][1]; + field[i][2] += term*uind[i][2]; + } + for (int i = 0; i < nlocal; i++) { + fieldp[i][0] += term*uinp[i][0]; + fieldp[i][1] += term*uinp[i][1]; + fieldp[i][2] += term*uinp[i][2]; + } +/* for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { field[i][j] += term*uind[i][j]; fieldp[i][j] += term*uinp[i][j]; } } - - // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU +*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); @@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } + + // accumulate timing information + + time_mutual_rspace += time1 - time0; + time_mutual_kspace += time2 - time1; +} + +/* ---------------------------------------------------------------------- + umutual1 = Ewald recip mutual induced field + umutual1 computes the reciprocal space contribution of the + induced atomic dipole moments to the field +------------------------------------------------------------------------- */ + +void PairHippoGPU::umutual1(double **field, double **fieldp) +{ + int m,n; + int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; + double term; + double a[3][3]; // indices not flipped vs Fortran + + // return if the Ewald coefficient is zero + + if (aewald < 1.0e-6) return; + + // convert Cartesian dipoles to fractional coordinates + + for (int j = 0; j < 3; j++) { + a[0][j] = nfft1 * recip[0][j]; + a[1][j] = nfft2 * recip[1][j]; + a[2][j] = nfft3 * recip[2][j]; + } + + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; + fuind[i][1] = 
a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; + fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; + } + + for (int i = 0; i < nlocal; i++) { + fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; + fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; + fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; + } +/* + for (i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; + fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; + } + } +*/ + // gridpre = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpre = (double ****) ic_kspace->zero(); + + // map 2 values to grid + + grid_uind(fuind,fuinp,gridpre); + + // pre-convolution operations including forward FFT + // gridfft = my portion of complex 3d grid in FFT decomposition + + double *gridfft = ic_kspace->pre_convolution(); + + // --------------------- + // convolution operation + // --------------------- + + nxlo = ic_kspace->nxlo_fft; + nxhi = ic_kspace->nxhi_fft; + nylo = ic_kspace->nylo_fft; + nyhi = ic_kspace->nyhi_fft; + nzlo = ic_kspace->nzlo_fft; + nzhi = ic_kspace->nzhi_fft; + + // use qfac values stored in udirect1() + + m = n = 0; + for (int k = nzlo; k <= nzhi; k++) { + for (int j = nylo; j <= nyhi; j++) { + for (int i = nxlo; i <= nxhi; i++) { + term = qfac[m++]; + gridfft[n] *= term; + gridfft[n+1] *= term; + n += 2; + } + } + } + + // post-convolution operations including backward FFT + // gridppost = my portion of 4d grid in brick decomp w/ ghost values + + double ****gridpost = (double ****) ic_kspace->post_convolution(); + + // get potential + double time0, time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + + fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); + + time1 = MPI_Wtime(); + time_fphi_uind += (time1 - time0); + + // store fractional reciprocal potentials for OPT method + + if (poltyp == OPT) { + for (int i = 0; i < nlocal; i++) { + for (int j = 0; j < 10; j++) { + fopt[i][optlevel][j] = fdip_phi1[i][j]; + foptp[i][optlevel][j] = fdip_phi2[i][j]; + } + } + } + + // convert the dipole fields from fractional to Cartesian + + for (int i = 0; i < 3; i++) { + a[0][i] = nfft1 * recip[0][i]; + a[1][i] = nfft2 * recip[1][i]; + a[2][i] = nfft3 * recip[2][i]; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi1[i][1] + + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; + double dfy = a[1][0]*fdip_phi1[i][1] + + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; + double dfz = a[2][0]*fdip_phi1[i][1] + + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; + field[i][0] -= dfx; + field[i][1] -= dfy; + field[i][2] -= dfz; + } + + for (int i = 0; i < nlocal; i++) { + double dfx = a[0][0]*fdip_phi2[i][1] + + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; + double dfy = a[1][0]*fdip_phi2[i][1] + + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; + double dfz = a[2][0]*fdip_phi2[i][1] + + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; + fieldp[i][0] -= dfx; + fieldp[i][1] -= dfy; + fieldp[i][2] -= dfz; + } +/* + for (int i = 0; i < nlocal; i++) { + for (j = 0; j < 3; j++) { + dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + + a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; + dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + + a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; + } + } + + // increment the field at each multipole site + + for (i = 0; i < nlocal; i++) 
{ + for (j = 0; j < 3; j++) { + field[i][j] -= dipfield1[i][j]; + fieldp[i][j] -= dipfield2[i][j]; + } + } +*/ +} + +/* ---------------------------------------------------------------------- + fphi_uind = induced potential from grid + fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, + double **fdip_phi2, double **fdip_sum_phi) +{ + if (!gpu_fphi_uind_ready) { + PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); + return; + } + + void* fdip_phi1_pinned = nullptr; + void* fdip_phi2_pinned = nullptr; + void* fdip_sum_phi_pinned = nullptr; + hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, grid, + &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out, + first_induce_iteration); + + int nlocal = atom->nlocal; + double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } } /* ---------------------------------------------------------------------- @@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) double *pval = atom->dvector[index_pval]; hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval, aewald, off2, &fieldp_pinned); -/* - // accumulate the field and fieldp values from the GPU lib - // field and fieldp may already have some nonzero values from kspace (umutual1) - - int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } - - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } -*/ } /* ---------------------------------------------------------------------- diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 1ed1c3299d..742fbfb119 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba { virtual void dispersion_real(); virtual void multipole_real(); virtual void udirect2b(double **, double **); + virtual void umutual1(double **, double **); + virtual void fphi_uind(double ****, double **, double **, double **); virtual void umutual2b(double **, double **); virtual void ufield0c(double **, double **); virtual void polar_real(); @@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba { bool gpu_dispersion_real_ready; bool gpu_multipole_real_ready; bool gpu_udirect2b_ready; + bool gpu_umutual1_ready; + bool gpu_fphi_uind_ready; bool gpu_umutual2b_ready; bool gpu_polar_real_ready; + bool 
first_induce_iteration; + void udirect2b_cpu(); template From f9f777b099902e40a7880ab13f44e609fd1bb975 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 15:09:26 -0500 Subject: [PATCH 119/181] Refactored precompute_induce to overlap data transfers with kernel launches --- lib/gpu/lal_amoeba_ext.cpp | 11 +++++++++++ lib/gpu/lal_base_amoeba.cpp | 15 +++------------ lib/gpu/lal_hippo_ext.cpp | 11 +++++++++++ src/GPU/pair_amoeba_gpu.cpp | 13 +++++++++++++ src/GPU/pair_hippo_gpu.cpp | 13 +++++++++++++ 5 files changed, 51 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index f91b76f688..425caaabbb 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,6 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + AMOEBAMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, double ****host_grid_brick, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index eac704fbfc..304e23274f 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -580,7 +580,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY); + _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); @@ -674,7 +674,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, double ****host_grid_brick, - void** host_fdip_phi1, + void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, const int nzlo_out, const int nzhi_out, @@ -682,16 +682,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, const int nxlo_out, const int nxhi_out, bool& first_iteration) { - // allocation/resize and transfers before the first iteration - - if (first_iteration) { - precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, host_igrid, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); - first_iteration = false; - } - - // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly + // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 6b189defe9..2cc17c6ced 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ 
b/lib/gpu/lal_hippo_ext.cpp @@ -193,6 +193,17 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double ** eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } +void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out) { + HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); +} + void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, double ****host_grid_brick, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 267dc666d6..5770d9542d 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,6 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); +void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, @@ -294,6 +301,12 @@ void PairAmoebaGPU::induce() first_induce_iteration = true; + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 8c1b380f65..9317b11794 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -105,6 +105,13 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void hippo_gpu_update_fieldp(void **fieldp_ptr); +void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); + void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, @@ -419,6 +426,12 @@ void PairHippoGPU::induce() first_induce_iteration = true; + hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, + thetai2, thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() From caa66d904ecd6aa7fd0c0b4f04c517cb27e8b319 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 15:54:12 -0500 Subject: [PATCH 120/181] Cleaned up GPU lib functions --- lib/gpu/lal_amoeba_ext.cpp | 16 ++++----------- lib/gpu/lal_base_amoeba.cpp | 11 ++--------- lib/gpu/lal_base_amoeba.h | 19 +++++++----------- lib/gpu/lal_hippo_ext.cpp | 15 +++----------- src/GPU/pair_amoeba_gpu.cpp | 37 
+++++++++++++---------------------- src/GPU/pair_amoeba_gpu.h | 2 -- src/GPU/pair_hippo_gpu.cpp | 39 +++++++++++++------------------------ src/GPU/pair_hippo_gpu.h | 2 -- 8 files changed, 43 insertions(+), 98 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 425caaabbb..42384cf7de 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -173,18 +173,10 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + AMOEBAMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, + host_fdip_phi2, host_fdip_sum_phi); } void amoeba_setup_fft(const int numel, const int element_type) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 304e23274f..e3da81762e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -670,17 +670,10 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** host_igrid, - double ****host_grid_brick, +void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, - void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) + void **host_fdip_sum_phi) { // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly UCL_H_Vec hdummy; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 5aeb729993..a88a63e870 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -151,13 +151,6 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); - virtual void precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); - /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -180,15 +173,17 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - virtual void compute_fphi_uind(const int inum_full, const int bsorder, + /// Allocate/resize 
per-atom arrays before induce() + virtual void precompute_induce(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, - double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); + const int nxlo_out, const int nxhi_out); + + virtual void compute_fphi_uind(double ****host_grid_brick, + void **host_fdip_phi1, void **host_fdip_phi2, + void **host_fdip_sum_phi); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 2cc17c6ced..1bd6bade3a 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -204,18 +204,9 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, nylo_out, nyhi_out, nxlo_out, nxhi_out); } -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, double ****host_grid_brick, - void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration) { - HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, host_grid_brick, host_fdip_phi1, - host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi) { + HIPPOMF.compute_fphi_uind(host_grid_brick, host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } double hippo_gpu_bytes() { diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 5770d9542d..e5cdc281b9 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -95,15 +95,8 @@ void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -299,13 +292,6 @@ void PairAmoebaGPU::induce() int debug = 1; - first_induce_iteration = true; - - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -351,6 +337,15 @@ void PairAmoebaGPU::induce() } } + // allocate memory and make early host-device transfers + // must be done 
before the first ufield0c + + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? @@ -1160,14 +1155,8 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 77b594177b..420874df21 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -62,8 +62,6 @@ class PairAmoebaGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 9317b11794..1151027993 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -112,15 +112,8 @@ void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); -void hippo_gpu_fphi_uind(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - double ****host_grid_brick, void **host_fdip_phi1, - void **host_fdip_phi2, void **host_fdip_sum_phi, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out, - bool& first_iteration); +void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, + void **host_fdip_phi2, void **host_fdip_sum_phi); void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -424,14 +417,6 @@ void PairHippoGPU::induce() int debug = 1; - first_induce_iteration = true; - - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -486,6 +471,16 @@ void PairHippoGPU::induce() udirp[i][0], udirp[i][1], udirp[i][2]); } */ + + // allocate memory and make early host-device transfers + // must be done before the first ufield0c + + hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); + // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? 
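For context, the ordering contract introduced above — one precompute step that allocates the device buffers and uploads the B-spline data (thetai1/2/3, igrid) before the first ufield0c(), followed by many per-iteration fphi_uind() calls that reuse those buffers — can be sketched as follows. This is only an illustration: the class, member, and function names below are hypothetical stand-ins, not the GPU-lib API from these patches.

#include <cstddef>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for the precompute/compute split used by the AMOEBA/HIPPO
// GPU path: upload per-call constants once, then reuse them every dipole iteration.
struct InduceSketch {
  std::vector<double> theta_cache;   // models thetai/igrid data kept on the device

  // one-time step per induce() call: allocate and copy spline data host -> device
  void precompute(const std::vector<double> &theta) {
    theta_cache = theta;             // models the one-time host->device transfer
  }

  // called every dipole iteration; only the grid values change between calls
  double compute_fphi(const std::vector<double> &grid) const {
    double phi = 0.0;
    for (std::size_t i = 0; i < grid.size() && i < theta_cache.size(); ++i)
      phi += theta_cache[i] * grid[i];   // placeholder for the B-spline gather
    return phi;
  }
};

int main() {
  InduceSketch solver;
  solver.precompute({0.25, 0.5, 0.25});      // once per induce(), before ufield0c()
  std::vector<double> grid = {1.0, 2.0, 3.0};
  for (int iter = 0; iter < 3; ++iter) {     // dipole iterations reuse the cached data
    std::printf("iter %d: phi = %g\n", iter, solver.compute_fphi(grid));
    grid[0] += 0.1;                          // the grid is refreshed each pass
  }
  return 0;
}

The entry points trimmed in this series (amoeba_gpu_precompute_induce, hippo_gpu_precompute_induce, and the reduced *_gpu_fphi_uind signatures) follow the same shape: the theta/igrid upload happens once per induce() call, so the repeated fphi_uind() calls inside the dipole iterations no longer resend unchanged spline data.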
@@ -1296,14 +1291,8 @@ void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; - hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, - thetai2, thetai3, igrid, grid, - &fdip_phi1_pinned, &fdip_phi2_pinned, - &fdip_sum_phi_pinned, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out, - first_induce_iteration); + hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, + &fdip_sum_phi_pinned); int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 742fbfb119..b1b908411d 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -62,8 +62,6 @@ class PairHippoGPU : public PairAmoeba { bool gpu_umutual2b_ready; bool gpu_polar_real_ready; - bool first_induce_iteration; - void udirect2b_cpu(); template From 356c46c9139e10e7a14864efbd2f4a007b0289c1 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 18 Sep 2022 16:28:30 -0500 Subject: [PATCH 121/181] Replaced mem allocation/deallocation inside moduli() with using member variables and mem resize if needed --- lib/gpu/lal_amoeba.cu | 179 +++++++++++++++++++++++++++++++++++ src/AMOEBA/amoeba_kspace.cpp | 28 +++--- src/AMOEBA/pair_amoeba.cpp | 95 ++++++++++--------- src/AMOEBA/pair_amoeba.h | 14 ++- 4 files changed, 259 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 66926721cb..da5c6f0c3c 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1922,6 +1922,185 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } } + +/* ---------------------------------------------------------------------- + fphi_mpole = multipole potential from grid + fphi_mpole extracts the permanent multipole potential from + the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fphi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (ii _nfft_max) { + memory->destroy(_moduli_bsarray); + _nfft_max = maxfft; + memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); + } + // compute and load the moduli values double x = 0.0; - bspline(x,bsorder,array); + //bspline(x,bsorder,array); + bspline(x,bsorder,_moduli_array); - for (i = 0; i < maxfft; i++) bsarray[i] = 0.0; - for (i = 0; i < bsorder; i++) bsarray[i+1] = array[i]; + for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; + for (i = 0; i < bsorder; i++) _moduli_bsarray[i+1] = _moduli_array[i]; - dftmod(bsmod1,bsarray,nfft1,bsorder); - dftmod(bsmod2,bsarray,nfft2,bsorder); - dftmod(bsmod3,bsarray,nfft3,bsorder); + dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); + dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); + dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); // perform deallocation of local arrays - delete[] array; - delete[] bsarray; + //delete[] array; + //delete[] bsarray; } /* ---------------------------------------------------------------------- 
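// ---------------------------------------------------------------------------
// Note: the moduli() hunk above replaces the per-call new/delete of the two
// work arrays with persistent members (_moduli_array, _moduli_bsarray,
// _nfft_max) that are grown only when the FFT size increases.  A condensed
// sketch of the idiom; the free helper below is illustrative, the patch does
// this inline inside PairAmoeba::moduli().
// ---------------------------------------------------------------------------
#include "memory.h"
using namespace LAMMPS_NS;

static void grow_moduli_bsarray(Memory *memory, double *&buf,
                                int &nfft_max, int maxfft)
{
  if (maxfft > nfft_max) {
    memory->destroy(buf);              // LAMMPS Memory: safe on a null buffer
    nfft_max = maxfft;
    memory->create(buf, nfft_max, "amoeba:_moduli_bsarray");
  }
  // later calls with the same (or a smaller) maxfft reallocate nothing;
  // _moduli_array itself is sized by bsordermax and handled in grow_local()
}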
diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 9890904e42..d301a86cdb 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -68,67 +68,71 @@ PairAmoeba::PairAmoeba(LAMMPS *lmp) : Pair(lmp) // force field settings nmax = 0; - xaxis2local = yaxis2local = zaxis2local = NULL; - rpole = NULL; - tq = NULL; + xaxis2local = yaxis2local = zaxis2local = nullptr; + rpole = nullptr; + tq = nullptr; - red2local = NULL; - xred = NULL; + red2local = nullptr; + xred = nullptr; - uind = uinp = udirp = NULL; - uopt = uoptp = NULL; - fopt = foptp = NULL; - field = fieldp = NULL; - ufld = dufld = NULL; - rsd = rsdp = NULL; - zrsd = zrsdp = NULL; + uind = uinp = udirp = nullptr; + uopt = uoptp = nullptr; + fopt = foptp = nullptr; + field = fieldp = nullptr; + ufld = dufld = nullptr; + rsd = rsdp = nullptr; + zrsd = zrsdp = nullptr; - cmp = fmp = NULL; - cphi = fphi = NULL; + cmp = fmp = nullptr; + cphi = fphi = nullptr; - poli = NULL; - conj = conjp = NULL; - vec = vecp = NULL; - udir = usum = usump = NULL; + _moduli_array = nullptr; + _moduli_bsarray = nullptr; + _nfft_max = 0; - fuind = fuinp = NULL; - fdip_phi1 = fdip_phi2 = fdip_sum_phi = NULL; - dipfield1 = dipfield2 = NULL; + poli = nullptr; + conj = conjp = nullptr; + vec = vecp = nullptr; + udir = usum = usump = nullptr; - fphid = fphip = NULL; - fphidp = cphidp = NULL; + fuind = fuinp = nullptr; + fdip_phi1 = fdip_phi2 = fdip_sum_phi = nullptr; + dipfield1 = dipfield2 = nullptr; + + fphid = fphip = nullptr; + fphidp = cphidp = nullptr; bsordermax = 0; - thetai1 = thetai2 = thetai3 = NULL; - bsmod1 = bsmod2 = bsmod3 = NULL; - bsbuild = NULL; - igrid = NULL; - m_kspace = p_kspace = pc_kspace = d_kspace = NULL; - i_kspace = ic_kspace = NULL; + thetai1 = thetai2 = thetai3 = nullptr; + bsmod1 = bsmod2 = bsmod3 = nullptr; + bsbuild = nullptr; + igrid = nullptr; + m_kspace = p_kspace = pc_kspace = d_kspace = nullptr; + i_kspace = ic_kspace = nullptr; - numneigh_dipole = NULL; - firstneigh_dipole = NULL; - firstneigh_dipdip = NULL; - ipage_dipole = NULL; - dpage_dipdip = NULL; + numneigh_dipole = nullptr; + firstneigh_dipole = nullptr; + firstneigh_dipdip = nullptr; + ipage_dipole = nullptr; + dpage_dipdip = nullptr; - numneigh_precond = NULL; - firstneigh_precond = NULL; - ipage_precond = NULL; + numneigh_precond = nullptr; + firstneigh_precond = nullptr; + ipage_precond = nullptr; - firstneigh_pcpc = NULL; - dpage_pcpc = NULL; + firstneigh_pcpc = nullptr; + dpage_pcpc = nullptr; - qfac = NULL; - gridfft1 = NULL; + qfac = nullptr; + gridfft1 = nullptr; initialize_type_class(); initialize_vdwl(); initialize_smallsize(); - forcefield = NULL; + forcefield = nullptr; - id_pole = id_udalt = id_upalt = NULL; + id_pole = id_udalt = id_upalt = nullptr; nualt = 0; first_flag = 1; @@ -220,6 +224,9 @@ PairAmoeba::~PairAmoeba() memory->destroy(fphidp); memory->destroy(cphidp); + memory->destroy(_moduli_array); + memory->destroy(_moduli_bsarray); + memory->destroy(thetai1); memory->destroy(thetai2); memory->destroy(thetai3); @@ -2312,6 +2319,8 @@ void PairAmoeba::grow_local() firstneigh_pcpc = (double **) memory->smalloc(nmax*sizeof(double *),"induce:firstneigh_pcpc"); } + + memory->create(_moduli_array,bsordermax,"amoeba:_moduli_array"); } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 24ce6fcfbc..91ec8faf0c 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -337,7 +337,11 @@ class PairAmoeba : public 
Pair { double *gridfft1; // copy of p_kspace FFT grid double **cmp,**fmp; // Cartesian and fractional multipoles - double **cphi,**fphi; + double **cphi,**fphi; + + double *_moduli_array; // buffers for moduli + double *_moduli_bsarray; + int _nfft_max; // params for current KSpace solve and FFT being worked on @@ -347,8 +351,12 @@ class PairAmoeba : public Pair { double ctf[10][10]; // indices NOT flipped vs Fortran double ftc[10][10]; // indices NOT flipped vs Fortran - class AmoebaConvolution *m_kspace,*p_kspace,*pc_kspace,*d_kspace; - class AmoebaConvolution *i_kspace,*ic_kspace; + class AmoebaConvolution *m_kspace; // multipole KSpace + class AmoebaConvolution *p_kspace; // polar KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *i_kspace; // induce KSpace + class AmoebaConvolution *ic_kspace; // FFT grid size factors From 785131932c87e0575d336f5b296cbce5731f13b6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 20 Sep 2022 13:58:17 -0500 Subject: [PATCH 122/181] Added fphi_mpole in amoeba/gpu, fixed a bug in the kernel when indexing grid --- lib/gpu/lal_amoeba.cpp | 4 +- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_amoeba_ext.cpp | 4 + lib/gpu/lal_base_amoeba.cpp | 74 ++++++++++++++- lib/gpu/lal_base_amoeba.h | 14 ++- lib/gpu/lal_hippo.cpp | 4 +- lib/gpu/lal_hippo.cu | 178 ++++++++++++++++++++++++++++++++++++ src/GPU/pair_amoeba_gpu.cpp | 37 +++++++- 8 files changed, 300 insertions(+), 17 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 02870ea861..7be4a6f59c 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -64,8 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,amoeba, "k_amoeba_multipole", "k_amoeba_udirect2b", "k_amoeba_umutual2b", "k_amoeba_polar", - "k_amoeba_fphi_uind", "k_amoeba_short_nbor", - "k_amoeba_special15"); + "k_amoeba_fphi_uind", "k_amoeba_fphi_mpole", + "k_amoeba_short_nbor", "k_amoeba_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index da5c6f0c3c..6f77fb932f 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -2026,7 +2026,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, for (int ib = 0; ib < bsorder; ib++) { int i1 = istart + ib; numtyp4 tha1 = thetai1[i1]; - int gidx = 2*(k*ngridxy + j*ngridx + i); + int gidx = k*ngridxy + j*ngridx + i; numtyp tq = grid[gidx]; t0 += tq*tha1.x; t1 += tq*tha1.y; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 42384cf7de..1f56fa86f8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -179,6 +179,10 @@ void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, host_fdip_phi2, host_fdip_sum_phi); } +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fphi) { + AMOEBAMF.compute_fphi_mpole(host_grid_brick, host_fphi); +} + void amoeba_setup_fft(const int numel, const int element_type) { AMOEBAMF.setup_fft(numel, element_type); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e3da81762e..08dcd8123e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -38,6 +38,7 @@ BaseAmoebaT::~BaseAmoeba() { k_udirect2b.clear(); k_umutual2b.clear(); k_fphi_uind.clear(); + k_fphi_mpole.clear(); k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); @@ -66,6 +67,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, 
const int nall, const char *k_name_umutual2b, const char *k_name_polar, const char *k_name_fphi_uind, + const char *k_name_fphi_mpole, const char *k_name_short_nbor, const char* k_name_special15) { screen=_screen; @@ -100,8 +102,9 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name_multipole, - k_name_udirect2b, k_name_umutual2b,k_name_polar, - k_name_fphi_uind, k_name_short_nbor, k_name_special15); + k_name_udirect2b, k_name_umutual2b,k_name_polar, + k_name_fphi_uind, k_name_fphi_mpole, + k_name_short_nbor, k_name_special15); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -559,6 +562,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 // host_igrid is allocated with nmax by 4 // - transfer extra data from host to device +// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points) // --------------------------------------------------------------------------- template @@ -568,7 +572,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - + // update bsorder with that of the kspace solver _bsorder = bsorder; // allocate or resize per-atom arrays @@ -586,7 +590,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { - if (inum_full>_max_thetai_size) { + if (_thetai1.cols()<_max_thetai_size*bsorder) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder); _thetai2.resize(_max_thetai_size*bsorder); @@ -667,6 +671,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, // --------------------------------------------------------------------------- // fphi_uind = induced potential from grid // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid +// NOTE: host_grid_brick is from ic_kspace post_convolution() // --------------------------------------------------------------------------- template @@ -687,7 +692,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; n += 2; } - _cgrid_brick.update_device(false); + _cgrid_brick.update_device(_num_grid_points*2, false); const int red_blocks = fphi_uind(); @@ -727,6 +732,63 @@ int BaseAmoebaT::fphi_uind() { return GX; } +// --------------------------------------------------------------------------- +// fphi_mpole = multipole potential from grid (limited to polar_kspace for now) +// fphi_mpole extracts the permanent multipole potential from +// the particle mesh Ewald grid +// NOTE: host_grid_brick is from p_kspace post_convolution() +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +{ + // TODO: grid brick[k][j][i] is a scalar + UCL_H_Vec hdummy; + hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); + + int n = 0; + for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) + for (int iy = _nylo_out; iy <= _nyhi_out; iy++) + for (int ix = _nxlo_out; ix <= 
_nxhi_out; ix++) { + _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + n++; + } + _cgrid_brick.update_device(_num_grid_points, false); + + const int red_blocks = fphi_mpole(); + + _fdip_sum_phi.update_host(_max_thetai_size*20); + + *host_fphi = _fdip_sum_phi.host.begin(); +} + +// --------------------------------------------------------------------------- +// Interpolate the potential from the PME grid +// --------------------------------------------------------------------------- +template +int BaseAmoebaT::fphi_mpole() { + int ainum=ans->inum(); + if (ainum == 0) + return 0; + + int _nall=atom->nall(); + int nbor_pitch=nbor->nbor_pitch(); + + // Compute the block size and grid size to keep all cores busy + const int BX=block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + + time_pair.start(); + int ngridxy = _ngridx * _ngridy; + k_fphi_mpole.set_size(GX,BX); + k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, + &_fdip_sum_phi, &_bsorder, &ainum, + &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); + time_pair.stop(); + + return GX; +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- @@ -920,6 +982,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *kname_umutual2b, const char *kname_polar, const char *kname_fphi_uind, + const char *kname_fphi_mpole, const char *kname_short_nbor, const char* kname_special15) { if (_compiled) @@ -935,6 +998,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, k_umutual2b.set_function(*pair_program, kname_umutual2b); k_polar.set_function(*pair_program, kname_polar); k_fphi_uind.set_function(*pair_program, kname_fphi_uind); + k_fphi_mpole.set_function(*pair_program, kname_fphi_mpole); k_short_nbor.set_function(*pair_program, kname_short_nbor); k_special15.set_function(*pair_program, kname_special15); pos_tex.get_texture(*pair_program, "pos_tex"); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a88a63e870..a5ee245623 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -64,8 +64,8 @@ class BaseAmoeba { const double gpu_split, FILE *screen, const void *pair_program, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -185,6 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, @@ -279,7 +281,8 @@ class BaseAmoeba { // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; - UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind; + UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar; + UCL_Kernel k_fphi_uind, k_fphi_mpole; UCL_Kernel k_special15, 
k_short_nbor; inline int block_size() { return _block_size; } inline void set_kernel(const int eflag, const int vflag) {} @@ -305,13 +308,14 @@ class BaseAmoeba { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *kname_multipole, const char *kname_udirect2b, const char *kname_umutual2b, const char *kname_polar, - const char *kname_fphi_uind, const char *kname_short_nbor, - const char* kname_special15); + const char *kname_fphi_uind, const char *kname_fphi_mpole, + const char *kname_short_nbor, const char* kname_special15); virtual int multipole_real(const int eflag, const int vflag) = 0; virtual int udirect2b(const int eflag, const int vflag) = 0; virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int fphi_uind(); + virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 9917ab91a2..3de6dc544c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -67,8 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, cell_size,gpu_split,_screen,hippo, "k_hippo_multipole", "k_hippo_udirect2b", "k_hippo_umutual2b", "k_hippo_polar", - "k_hippo_fphi_uind", "k_hippo_short_nbor", - "k_hippo_special15"); + "k_hippo_fphi_uind", "k_hippo_fphi_mpole", + "k_hippo_short_nbor", "k_hippo_special15"); if (success!=0) return success; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index dde8f9bfd5..91793747ef 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -2346,6 +2346,184 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, } } +/* ---------------------------------------------------------------------- + fphi_mpole = multipole potential from grid + fphi_mpole extracts the permanent multipole potential from + the particle mesh Ewald grid +------------------------------------------------------------------------- */ + +__kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, + const __global numtyp4 *restrict thetai2, + const __global numtyp4 *restrict thetai3, + const __global int *restrict igrid, + const __global numtyp *restrict grid, + __global numtyp *restrict fphi, + const int bsorder, const int inum, + const int nzlo_out, const int nylo_out, + const int nxlo_out, const int ngridxy, + const int ngridx) +{ + int tid=THREAD_ID_X; + int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; + + if (iinlocal, bsorder, thetai1, thetai2, thetai3, igrid, @@ -1311,6 +1314,8 @@ void PairAmoebaGPU::polar_kspace() double cphid[4],cphip[4]; double a[3][3]; // indices not flipped vs Fortran + bool gpu_fphi_mpole_ready = true; + // indices into the electrostatic field array // decremented by 1 versus Fortran @@ -1373,6 +1378,18 @@ void PairAmoebaGPU::polar_kspace() moduli(); bspline_fill(); + // allocate memory and make early host-device transfers + + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill + if (gpu_fphi_mpole_ready) { + amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, p_kspace->nzlo_out, + p_kspace->nzhi_out, p_kspace->nylo_out, + p_kspace->nyhi_out, p_kspace->nxlo_out, + p_kspace->nxhi_out); + } + + // convert Cartesian multipoles to fractional coordinates cmp_to_fmp(cmp,fmp); @@ -1441,8 +1458,24 @@ void PairAmoebaGPU::polar_kspace() double ***gridpost = (double ***) p_kspace->post_convolution(); // get potential - - fphi_mpole(gridpost,fphi); + + if (!gpu_fphi_mpole_ready) { + fphi_mpole(gridpost,fphi); + 
//printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } else { + void* fphi_pinned = nullptr; + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + + double *_fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + } for (i = 0; i < nlocal; i++) { for (k = 0; k < 20; k++) From 166701f13a585da635f96d83ad96ab14a90a024c Mon Sep 17 00:00:00 2001 From: ndtrung Date: Fri, 23 Sep 2022 11:53:09 -0500 Subject: [PATCH 123/181] Fixed missing commas in the argument list of the macros in amoeba and hippo cu files, added amoeba_convolution_gpu.cpp and .h to the source file list in GPU.cmake --- cmake/Modules/Packages/GPU.cmake | 4 +++- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_hippo.cu | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 9524324409..7bb9723485 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -3,7 +3,9 @@ set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h ${GPU_SOURCES_DIR}/fix_gpu.cpp ${GPU_SOURCES_DIR}/fix_nh_gpu.h - ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6f77fb932f..84a8495dfb 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -158,7 +158,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 91793747ef..a5fca5cc80 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -158,7 +158,7 @@ _texture( q_tex,int2); fieldp[ii+inum] = fp; \ } -#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom \ +#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ From e6d2582642867d12f3906567e580f0e35feaafce Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 28 Sep 2022 15:08:18 -0500 Subject: [PATCH 124/181] Updated fphi_mpole, renamed precompute_induce to precompute_kspace --- lib/gpu/Nvidia.makefile | 29 +++++++++++- lib/gpu/lal_amoeba.cu | 91 +++++++++++++++++-------------------- lib/gpu/lal_amoeba_ext.cpp | 11 ++--- lib/gpu/lal_base_amoeba.cpp | 25 ++++++---- lib/gpu/lal_base_amoeba.h | 9 ++-- lib/gpu/lal_hippo_ext.cpp | 8 ++-- src/GPU/pair_amoeba_gpu.cpp | 43 +++++++++--------- src/GPU/pair_hippo_gpu.cpp | 14 +++--- 8 files changed, 129 insertions(+), 101 deletions(-) diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 5f50486e28..298d404117 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -68,7 +68,34 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H) # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H) +$(OBJ_DIR)/lal_answer.o: 
lal_answer.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_pppm.o: lal_pppm.cpp pppm_f_cubin.h pppm_d_cubin.h $(HOST_H) + $(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(CUDR) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H) $(CUDR) -o $@ -c $< -I$(OBJ_DIR) #ifdef CUDPP_OPT diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 84a8495dfb..ab750aaadc 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1630,7 +1630,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, - const __global numtyp *restrict grid, + const __global numtyp2 *restrict grid, __global numtyp *restrict fdip_phi1, __global numtyp *restrict fdip_phi2, __global numtyp *restrict fdip_sum_phi, @@ -1648,12 +1648,12 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, if (ii -void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, +void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** host_igrid, const int nzlo_out, const int nzhi_out, @@ -660,7 +660,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder, _ngridx = nxhi_out - nxlo_out + 1; _num_grid_points = _ngridx * _ngridy * _ngridz; - int numel = _num_grid_points*2; + int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > _cgrid_brick.cols()) { @@ -688,11 +688,13 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix][0]; - _cgrid_brick[n+1] = host_grid_brick[iz][iy][ix][1]; - n += 2; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix][0]; + v.y = host_grid_brick[iz][iy][ix][1]; + _cgrid_brick[n] = v; + n++; } - _cgrid_brick.update_device(_num_grid_points*2, false); + _cgrid_brick.update_device(_num_grid_points, false); const int red_blocks = fphi_uind(); @@ -740,7 +742,7 @@ int BaseAmoebaT::fphi_uind() { // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi) +void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { // TODO: grid brick[k][j][i] is a scalar UCL_H_Vec hdummy; @@ -750,11 +752,15 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void 
**host_fphi for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) for (int ix = _nxlo_out; ix <= _nxhi_out; ix++) { - _cgrid_brick[n] = host_grid_brick[iz][iy][ix]; + numtyp2 v; + v.x = host_grid_brick[iz][iy][ix]; + v.y = (numtyp)0; + _cgrid_brick[n] = v; n++; } _cgrid_brick.update_device(_num_grid_points, false); + _felec = felec; const int red_blocks = fphi_mpole(); _fdip_sum_phi.update_host(_max_thetai_size*20); @@ -776,13 +782,14 @@ int BaseAmoebaT::fphi_mpole() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); + //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); k_fphi_mpole.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, - &_fdip_sum_phi, &_bsorder, &ainum, + &_fdip_sum_phi, &_bsorder, &ainum, &_felec, &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a5ee245623..f9a715808e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -173,8 +173,8 @@ class BaseAmoeba { double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void **fieldp_ptr); - /// Allocate/resize per-atom arrays before induce() - virtual void precompute_induce(const int inum_full, const int bsorder, + /// Allocate/resize per-atom arrays before the kspace parts in induce() and polar + virtual void precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, @@ -185,7 +185,8 @@ class BaseAmoeba { void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); - virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi); + virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, + const double felec); /// Compute polar real-space with device neighboring virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -256,7 +257,7 @@ class BaseAmoeba { int _bsorder; UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; - UCL_Vector _cgrid_brick; + UCL_Vector _cgrid_brick; UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 1bd6bade3a..a75080bfca 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -193,15 +193,15 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double ** eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void hippo_gpu_precompute_induce(const int inum_full, const int bsorder, +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out) { - HIPPOMF.precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, - host_thetai3, igrid, nzlo_out, nzhi_out, - nylo_out, nyhi_out, nxlo_out, nxhi_out); + HIPPOMF.precompute_kspace(inum_full, bsorder, host_thetai1, host_thetai2, + host_thetai3, igrid, nzlo_out, nzhi_out, + nylo_out, nyhi_out, nxlo_out, nxhi_out); } 
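// ---------------------------------------------------------------------------
// Note: after this change both compute_fphi_uind() and compute_fphi_mpole()
// flatten the (iz,iy,ix) grid brick into one numtyp2 entry per grid point --
// a real/imaginary pair for the induced grid, and the real multipole value
// with a zero-padded .y for the permanent grid -- and the kernels address a
// stencil cell with the matching k*ngridxy + j*ngridx + i rule.  Sketch of
// the index arithmetic; the function and variable names are illustrative.
// ---------------------------------------------------------------------------
inline int brick_index(int ix, int iy, int iz,
                       int nxlo_out, int nylo_out, int nzlo_out,
                       int ngridx, int ngridxy)
{
  const int i = ix - nxlo_out;         // 0-based offsets into the local brick
  const int j = iy - nylo_out;
  const int k = iz - nzlo_out;
  return k*ngridxy + j*ngridx + i;     // one entry (scalar or pair) per point
}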
void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 396ff0b592..d0018bf588 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -88,17 +88,18 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void amoeba_gpu_update_fieldp(void **fieldp_ptr); -void amoeba_gpu_precompute_induce(const int inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); -void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi); +void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, + const double felec); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, @@ -343,7 +344,7 @@ void PairAmoebaGPU::induce() // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, @@ -1382,11 +1383,11 @@ void PairAmoebaGPU::polar_kspace() // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { - amoeba_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, p_kspace->nzlo_out, - p_kspace->nzhi_out, p_kspace->nylo_out, - p_kspace->nyhi_out, p_kspace->nxlo_out, - p_kspace->nxhi_out); + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, + thetai1, thetai2, thetai3, igrid, + p_kspace->nzlo_out, p_kspace->nzhi_out, + p_kspace->nylo_out, p_kspace->nyhi_out, + p_kspace->nxlo_out, p_kspace->nxhi_out); } @@ -1461,10 +1462,15 @@ void PairAmoebaGPU::polar_kspace() if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); - //printf("cpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); + + for (i = 0; i < nlocal; i++) { + for (k = 0; k < 20; k++) + fphi[i][k] *= felec; + } + } else { void* fphi_pinned = nullptr; - amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned); + amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); double *_fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { @@ -1474,13 +1480,8 @@ void PairAmoebaGPU::polar_kspace() idx += nlocal; } } - //printf("gpu phi = %f %f %f\n", fphi[0][0],fphi[0][1],fphi[0][2]); - } - for (i = 0; i < nlocal; i++) { - for (k = 0; k < 20; k++) - fphi[i][k] *= felec; - } + } // convert field from fractional to Cartesian diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1151027993..4dbc998ee3 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -105,12 +105,12 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, void hippo_gpu_update_fieldp(void **fieldp_ptr); -void hippo_gpu_precompute_induce(const int 
inum_full, const int bsorder, - double ***host_thetai1, double ***host_thetai2, - double ***host_thetai3, int** igrid, - const int nzlo_out, const int nzhi_out, - const int nylo_out, const int nyhi_out, - const int nxlo_out, const int nxhi_out); +void hippo_gpu_precompute_kspace(const int inum_full, const int bsorder, + double ***host_thetai1, double ***host_thetai2, + double ***host_thetai3, int** igrid, + const int nzlo_out, const int nzhi_out, + const int nylo_out, const int nyhi_out, + const int nxlo_out, const int nxhi_out); void hippo_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); @@ -475,7 +475,7 @@ void PairHippoGPU::induce() // allocate memory and make early host-device transfers // must be done before the first ufield0c - hippo_gpu_precompute_induce(atom->nlocal, bsorder, thetai1, thetai2, + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, From 1d75ca3b209dbe8fc2bb14c38d6b12410134231e Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 30 Sep 2022 16:31:13 -0500 Subject: [PATCH 125/181] Moved precompute() out of the terms in amoeba and hippo, to be involed in the first term in a time step: multipole for amoeba and repulsion for hippo --- lib/gpu/lal_amoeba.cpp | 2 +- lib/gpu/lal_amoeba_ext.cpp | 25 +++++++++++++++++-- lib/gpu/lal_base_amoeba.cpp | 21 ++++++++-------- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_hippo.cpp | 18 +++++++------- lib/gpu/lal_hippo.h | 6 ++--- lib/gpu/lal_hippo_ext.cpp | 28 ++++++++++++++++++---- src/GPU/pair_amoeba_gpu.cpp | 47 ++++++++++++++++++++++++++---------- src/GPU/pair_hippo_gpu.cpp | 48 ++++++++++++++++++++++++++----------- 9 files changed, 140 insertions(+), 57 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 7be4a6f59c..e3bb4c5ef5 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -162,7 +162,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step + // at this point mpole is the first kernel in a time step for AMOEBA this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 47591e75f6..5e4d48a2da 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -117,7 +117,28 @@ void amoeba_gpu_clear() { AMOEBAMF.clear(); } -int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return AMOEBAMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + 
success, host_q, boxlo, prd); +} + + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -127,7 +148,7 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 5496236632..16335fa17e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -226,12 +226,12 @@ int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist, // --------------------------------------------------------------------------- template inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - bool &success) { + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + bool &success) { success=true; resize_atom(inum,nall,success); resize_local(inum,host_inum,nbor->max_nbors(),success); @@ -450,7 +450,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template -int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, +void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, @@ -469,7 +469,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // NOTE: // Once all the kernels are ready, precompute() is needed only once // in the first kernel in a time step. 
- +/* int** firstneigh = nullptr; firstneigh = precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, @@ -478,7 +478,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - +*/ // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -503,7 +503,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _tep.update_host(_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; +// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -782,7 +782,6 @@ int BaseAmoebaT::fphi_mpole() { // Compute the block size and grid size to keep all cores busy const int BX=block_size(); - //printf("BX = %d; pppm block size = %d\n", BX, PPPM_BLOCK_1D); int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); time_pair.start(); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index f9a715808e..d00833cae7 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -152,7 +152,7 @@ class BaseAmoeba { double *charge, double *boxlo, double *prd); /// Compute multipole real-space with device neighboring - virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 3de6dc544c..dc2b6f2c7a 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -172,7 +172,7 @@ double HippoT::host_memory_usage() const { // Reneighbor on GPU if necessary, and then compute repulsion // --------------------------------------------------------------------------- template -int** HippoT::compute_repulsion(const int ago, const int inum_full, +void HippoT::compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -213,7 +213,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, // We only need to cast the necessary from host to device here // if the neighbor lists are rebuilt and other per-atom arrays // (x, type, amtype, amgroup, rpole) are ready on the device. 
- +/* int** firstneigh = nullptr; firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, @@ -222,7 +222,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, eflag_in, vflag_in, eatom, vatom, host_start, ilist, jnum, cpu_time, success, host_q, boxlo, prd); - +*/ // ------------------- Resize _tep array ------------------------ if (inum_full>this->_max_tep_size) { @@ -253,7 +253,7 @@ int** HippoT::compute_repulsion(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return firstneigh; // nbor->host_jlist.begin()-host_start; +// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -275,7 +275,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_disp, - // at this point mpole is the first kernel in a time step + // at this point repuslion is the first kernel in a time step for HIPPO this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -302,7 +302,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute dispersion real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, +void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2_disp) { @@ -324,7 +324,7 @@ int** HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, this->hd_balancer.stop_timer(); - return nullptr; // nbor->host_jlist.begin()-host_start; + // return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -372,7 +372,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- template -int** HippoT::compute_multipole_real(const int ago, const int inum_full, +void HippoT::compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -417,7 +417,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full, this->_tep.update_host(this->_max_tep_size*4,false); - return nullptr; // nbor->host_jlist.begin()-host_start; + //return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 492712eb85..671c9964ff 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -55,7 +55,7 @@ class Hippo : public BaseAmoeba { const double polar_dscale, const double polar_uscale); /// Compute repulsion with device neighboring - int** compute_repulsion(const int ago, const int inum_full, + virtual void compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, @@ -72,12 +72,12 @@ class Hippo : public BaseAmoeba { double c3, double c4, double c5,void** tep_ptr); /// Compute dispersion real-space with device neighboring - int** compute_dispersion_real(int *host_amtype, 
int *host_amgroup, + virtual void compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2_disp); /// Compute multipole real-space with device neighboring - virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, + virtual void compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index a75080bfca..9644f5aca4 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -120,7 +120,27 @@ void hippo_gpu_clear() { HIPPOMF.clear(); } -int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { + return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, + host_amtype, host_amgroup, host_rpole, + nullptr, nullptr, nullptr, sublo, subhi, tag, + nspecial, special, nspecial15, special15, + eflag_in, vflag_in, eatom, vatom, + host_start, ilist, jnum, cpu_time, + success, host_q, boxlo, prd); +} + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -132,7 +152,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, double *host_q, double *boxlo, double *prd, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr) { - return HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, + HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum, @@ -147,7 +167,7 @@ void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, aewald, off2); } -int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, +void hippo_gpu_compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -157,7 +177,7 @@ int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, + HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, host_pval, sublo, subhi, tag, nspecial, special, nspecial15, special15, eflag, vflag, eatom, 
vatom, host_start, ilist, jnum, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d0018bf588..8e021f5ce8 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -69,7 +69,19 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const double polar_dscale, const double polar_uscale, int& tq_size); void amoeba_gpu_clear(); -int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, +int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, @@ -240,6 +252,18 @@ void PairAmoebaGPU::multipole_real() } inum = atom->nlocal; + firstneigh = amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + // select the correct cutoff for the term if (use_ewald) choose(MPOLE_LONG); @@ -249,18 +273,17 @@ void PairAmoebaGPU::multipole_real() double felec = electric / am_dielectric; - firstneigh = amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, felec, off2, atom->q, - domain->boxlo, domain->prd, &tq_pinned); + amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, felec, off2, atom->q, + domain->boxlo, domain->prd, &tq_pinned); - if (!success) - error->one(FLERR,"Insufficient memory on accelerator"); + // reference to the tep array from GPU lib diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 4dbc998ee3..7658ddb011 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -70,7 +70,19 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const double polar_dscale, const double polar_uscale, int& tq_size); void hippo_gpu_clear(); -int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, +int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double 
**host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd); + +void hippo_gpu_compute_repulsion(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, @@ -86,7 +98,7 @@ int** hippo_gpu_compute_repulsion(const int ago, const int inum_full, void hippo_gpu_compute_dispersion_real(int *host_amtype, int *host_amgroup, double **host_rpole, const double aewald, const double off2); -int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, +void hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, @@ -258,22 +270,30 @@ void PairHippoGPU::repulsion() } inum = atom->nlocal; + firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + // select the correct cutoff for the term choose(REPULSE); - // set the energy unit conversion factor for multipolar real-space calculation - - firstneigh = hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, aewald, off2, atom->q, - domain->boxlo, domain->prd, cut2, - c0, c1, c2, c3, c4, c5, &tq_pinned); + hippo_gpu_compute_repulsion(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, aewald, off2, atom->q, + domain->boxlo, domain->prd, cut2, + c0, c1, c2, c3, c4, c5, &tq_pinned); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); From 9a1f23a0793ce30e1fa5a835b57c1724e830ef36 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 30 Sep 2022 17:32:25 -0500 Subject: [PATCH 126/181] Cosmetic changes and cleanup --- lib/gpu/lal_amoeba.cpp | 10 +++-- lib/gpu/lal_base_amoeba.cpp | 28 ++++--------- lib/gpu/lal_hippo.cpp | 83 ++++++++++++++----------------------- 3 files changed, 44 insertions(+), 77 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index e3bb4c5ef5..dfe092c52b 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -185,7 +185,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // 
--------------------------------------------------------------------------- template int AmoebaT::udirect2b(const int eflag, const int vflag) { @@ -202,7 +202,9 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -225,7 +227,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel, returning field and fieldp // --------------------------------------------------------------------------- template int AmoebaT::umutual2b(const int eflag, const int vflag) { @@ -264,7 +266,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel, returning tep // --------------------------------------------------------------------------- template int AmoebaT::polar_real(const int eflag, const int vflag) { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 16335fa17e..17e05b4a16 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -447,7 +447,9 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute multipole real-space part +// precompute() should be already invoked before mem (re)allocation +// this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -464,21 +466,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const double aewald, const double felec, const double off2_mpole, double *host_q, double *boxlo, double *prd, void **tep_ptr) { - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. 
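// Sketch (editorial, not part of this patch; argument lists elided): the
// per-timestep ordering these changes assume for the AMOEBA GPU path:
//
//   precompute(...);              // once per step: realloc per-atom arrays, cast and
//                                 // transfer host data, rebuild the device nbor list if ago==0
//   compute_multipole_real(...);  // reuses device-resident data; copies tep back
//   compute_udirect2b(...);       // permanent real-space field
//   compute_umutual2b(...);       // induced real-space field, iterated inside induce()
//   compute_polar_real(...);      // last kernel: forces/energies/virial copied back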
-/* - int** firstneigh = nullptr; - firstneigh = precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -502,8 +489,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host _tep.update_host(_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- @@ -842,22 +827,23 @@ double BaseAmoebaT::host_memory_usage_atomic() const { } // --------------------------------------------------------------------------- -// Setup the FFT plan +// Setup the FFT plan: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::setup_fft(const int numel, const int element_type) { - + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) } // --------------------------------------------------------------------------- -// Compute FFT on the device +// Compute FFT on the device: only placeholder for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) #if !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created == false) { int m = numel/2; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index dc2b6f2c7a..221fe16f3c 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -143,8 +143,12 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass, _polar_uscale = polar_uscale; _allocated=true; - this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + coeff_amclass.row_bytes() + - + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes(); + this->_max_bytes=coeff_amtype.row_bytes() + coeff_rep.row_bytes() + + coeff_amclass.row_bytes() + sp_polar.row_bytes() + + sp_nonpolar.row_bytes() + this->_tep.row_bytes() + + this->_fieldp.row_bytes() + this->_thetai1.row_bytes() + + this->_thetai2.row_bytes() + this->_thetai3.row_bytes() + + this->_igrid.row_bytes() + this->_cgrid_brick.row_bytes(); return 0; } @@ -169,7 +173,7 @@ double HippoT::host_memory_usage() const { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute repulsion +// Compute the repulsion term, returning tep // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, @@ -203,26 +207,6 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, this->set_kernel(eflag,vflag); - // reallocate per-atom arrays, transfer data from the host - // and build the neighbor lists if needed - // NOTE: - // For now we invoke precompute() again here, - // to be able to turn on/off the udirect2b kernel (which comes before this) - // Once all the kernels are ready, precompute() is needed only once - // in the first kernel in a time step. 
- // We only need to cast the necessary from host to device here - // if the neighbor lists are rebuilt and other per-atom arrays - // (x, type, amtype, amgroup, rpole) are ready on the device. -/* - int** firstneigh = nullptr; - firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type, - host_amtype, host_amgroup, host_rpole, - nullptr, nullptr, nullptr, sublo, subhi, tag, - nspecial, special, nspecial15, special15, - eflag_in, vflag_in, eatom, vatom, - host_start, ilist, jnum, cpu_time, - success, host_q, boxlo, prd); -*/ // ------------------- Resize _tep array ------------------------ if (inum_full>this->_max_tep_size) { @@ -252,12 +236,10 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - -// return firstneigh; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the repulsion term, returning tep +// Launch the repulsion kernel // --------------------------------------------------------------------------- template int HippoT::repulsion(const int eflag, const int vflag) { @@ -299,7 +281,7 @@ int HippoT::repulsion(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute dispersion real-space +// Compute dispersion real-space // --------------------------------------------------------------------------- template void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, @@ -323,12 +305,10 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, //this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); - - // return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the dispersion real-space term, returning tep +// Launch the dispersion real-space kernel // --------------------------------------------------------------------------- template int HippoT::dispersion_real(const int eflag, const int vflag) { @@ -346,7 +326,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { this->time_pair.start(); // Build the short neighbor list for the cutoff off2_disp, - // at this point mpole is the first kernel in a time step + // at this point dispersion is the first kernel in a time step this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -356,20 +336,20 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { k_dispersion.set_size(GX,BX); k_dispersion.run(&this->atom->x, &this->atom->extra, - &coeff_amtype, &coeff_amclass, &sp_nonpolar, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &_nall, &nbor_pitch, - &this->_threads_per_atom, &this->_aewald, - &this->_off2_disp); + &coeff_amtype, &coeff_amclass, &sp_nonpolar, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &_nall, &nbor_pitch, + &this->_threads_per_atom, &this->_aewald, + &this->_off2_disp); this->time_pair.stop(); return GX; } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute multipole real-space +// Compute the multipole real-space term, returning tep 
// --------------------------------------------------------------------------- template void HippoT::compute_multipole_real(const int ago, const int inum_full, @@ -416,12 +396,10 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); - - //return nullptr; // nbor->host_jlist.begin()-host_start; } // --------------------------------------------------------------------------- -// Calculate the multipole real-space term, returning tep +// Launch the multipole real-space kernel // --------------------------------------------------------------------------- template int HippoT::multipole_real(const int eflag, const int vflag) { @@ -438,8 +416,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list for the cutoff off2_mpole, - // at this point mpole is the first kernel in a time step + // Build the short neighbor list for the cutoff off2_mpole this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -462,8 +439,8 @@ int HippoT::multipole_real(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the permanent field +// Compute the direct real space part of the permanent field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -488,7 +465,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space permanent field, returning field and fieldp +// Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template int HippoT::udirect2b(const int eflag, const int vflag) { @@ -505,7 +482,9 @@ int HippoT::udirect2b(const int eflag, const int vflag) { (BX/this->_threads_per_atom))); this->time_pair.start(); - // Build the short neighbor list if not done yet + // Build the short neighbor list for the cutoff _off2_polar, if not done yet + // this is the first kernel in a time step where _off2_polar is used + if (!this->short_nbor_polar_avail) { this->k_short_nbor.set_size(GX,BX); this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, @@ -529,8 +508,8 @@ int HippoT::udirect2b(const int eflag, const int vflag) { } // --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary, and then compute the direct real space part -// of the induced field +// Compute the direct real space term of the induced field +// returning field and fieldp // --------------------------------------------------------------------------- template void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, @@ -554,7 +533,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos } // --------------------------------------------------------------------------- -// Calculate the real-space induced field, returning field and fieldp +// Launch the real-space induced field kernel // --------------------------------------------------------------------------- template int 
HippoT::umutual2b(const int eflag, const int vflag) { @@ -628,7 +607,7 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho } // --------------------------------------------------------------------------- -// Calculate the polar real-space term, returning tep +// Launch the polar real-space kernel // --------------------------------------------------------------------------- template int HippoT::polar_real(const int eflag, const int vflag) { From 009ed3630124740593603c0752b6741c92a7c8c6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 1 Oct 2022 11:16:30 -0500 Subject: [PATCH 127/181] Updated src/GPU Install.sh to include amoeba_convolution_gpu.* --- src/GPU/Install.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index d91b744c4e..48c47ae96d 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -28,6 +28,8 @@ action () { # list of files with optional dependcies +action amoeba_convolution_gpu.cpp amoeba_convolution.cpp +action amoeba_convolution_gpu.h amoeba_convolution.cpp action fix_gpu.cpp action fix_gpu.h action fix_nve_gpu.h @@ -117,6 +119,10 @@ action pair_lj_cut_coul_msm_gpu.cpp pair_lj_cut_coul_msm.cpp action pair_lj_cut_coul_msm_gpu.h pair_lj_cut_coul_msm.h action pair_lj_cut_gpu.cpp action pair_lj_cut_gpu.h +action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp +action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp +action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp +action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp action pair_lj_smooth_gpu.cpp pair_lj_smooth.cpp action pair_lj_smooth_gpu.h pair_lj_smooth.cpp action pair_lj_expand_gpu.cpp @@ -159,10 +165,6 @@ action pppm_gpu.cpp pppm.cpp action pppm_gpu.h pppm.cpp action pair_ufm_gpu.cpp pair_ufm.cpp action pair_ufm_gpu.h pair_ufm.h -action pair_lj_cut_dipole_long_gpu.cpp pair_lj_cut_dipole_long.cpp -action pair_lj_cut_dipole_long_gpu.h pair_lj_cut_dipole_long.cpp -action pair_lj_cut_tip4p_long_gpu.h pair_lj_cut_tip4p_long.cpp -action pair_lj_cut_tip4p_long_gpu.cpp pair_lj_cut_tip4p_long.cpp # edit 2 Makefile.package files to include/exclude package info From 6b9e83fe2093fafc2167fde727c77d6b4ed2e735 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 6 Oct 2022 15:03:58 -0500 Subject: [PATCH 128/181] Added timing for the induced dipole spreading part, computed the block size to ensure all the CUs are occupied by the fphi_uind and fphi_mpole kernels --- lib/gpu/lal_amoeba.cpp | 11 ++++-- lib/gpu/lal_base_amoeba.cpp | 64 +++++++++++++++++++++++++--------- lib/gpu/lal_base_amoeba.h | 4 +++ lib/gpu/lal_device.cpp | 1 + lib/gpu/lal_device.h | 4 ++- lib/gpu/lal_hippo.cpp | 11 ++++-- src/AMOEBA/amoeba_induce.cpp | 9 ++++- src/AMOEBA/pair_amoeba.cpp | 13 +++++-- src/AMOEBA/pair_amoeba.h | 10 +++--- src/GPU/pair_amoeba_gpu.cpp | 66 ++++++------------------------------ 10 files changed, 106 insertions(+), 87 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index dfe092c52b..b61d7595af 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,9 +278,14 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); + const int max_cus = this->device->max_cus(); + int BX=this->block_size(); + int 
GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index c79804dd95..3b2381f211 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -155,7 +155,14 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) fft_plan_created = false; + #endif + + #ifdef ASYNC_DEVICE_COPY + _end_command_queue=ucl_device->num_queues(); + ucl_device->push_command_queue(); + #endif return success; } @@ -507,6 +514,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double *fieldp_ptr=_fieldp.host.begin(); + // specify the correct cutoff and alpha values _off2_polar = off2_polar; _aewald = aewald; const int red_blocks=udirect2b(_eflag,_vflag); @@ -525,18 +533,20 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double double **host_uind, double **host_uinp, double *host_pval, const double aewald, const double off2_polar, void** fieldp_ptr) { - // all the necessary data arrays are already copied from host to device - - //cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); + // only copy the necessary data arrays that are updated over the iterations + // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); atom->add_extra_data(); + // set the correct cutoff and alpha _off2_polar = off2_polar; _aewald = aewald; + // launch the kernel const int red_blocks=umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer + // after umutual1 and self are done on the GPU // *fieldp_ptr=_fieldp.host.begin(); // _fieldp.update_host(_max_fieldp_size*8,false); } @@ -547,7 +557,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4 // host_igrid is allocated with nmax by 4 // - transfer extra data from host to device -// NOTE: can be re-used for fphi_mpole() (already allocate 2x grid points) +// NOTE: can be re-used for fphi_mpole() but with a different bsorder value // --------------------------------------------------------------------------- template @@ -588,6 +598,12 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, } } + #ifdef ASYNC_DEVICE_COPY + _thetai1.cq(ucl_device->cq(_end_command_queue)); + _thetai2.cq(ucl_device->cq(_end_command_queue)); + _thetai3.cq(ucl_device->cq(_end_command_queue)); + #endif + // pack host data to device for (int i = 0; i < inum_full; i++) @@ -634,6 +650,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, } _igrid.update_device(true); + // _cgrid_brick holds the grid-based potential + _nzlo_out = nzlo_out; _nzhi_out = nzhi_out; _nylo_out = nylo_out; @@ -679,14 +697,21 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n] = v; n++; } - 
_cgrid_brick.update_device(_num_grid_points, false); + _cgrid_brick.update_device(_num_grid_points, true); + #ifdef ASYNC_DEVICE_COPY + ucl_device->sync(); + #endif + + // launch the kernel with its execution configuration (see below) const int red_blocks = fphi_uind(); - _fdip_phi1.update_host(_max_thetai_size*10); - _fdip_phi2.update_host(_max_thetai_size*10); - _fdip_sum_phi.update_host(_max_thetai_size*20); + // copy data from device to host asynchronously + _fdip_phi1.update_host(_max_thetai_size*10, true); + _fdip_phi2.update_host(_max_thetai_size*10, true); + _fdip_sum_phi.update_host(_max_thetai_size*20, true); + // return the pointers to the host-side arrays *host_fdip_phi1 = _fdip_phi1.host.begin(); *host_fdip_phi2 = _fdip_phi2.host.begin(); *host_fdip_sum_phi = _fdip_sum_phi.host.begin(); @@ -701,13 +726,15 @@ int BaseAmoebaT::fphi_uind() { if (ainum == 0) return 0; - int _nall=atom->nall(); - int nbor_pitch=nbor->nbor_pitch(); - // Compute the block size and grid size to keep all cores busy - const int BX=block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - + const int max_cus = device->max_cus(); + int BX=block_size(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/BX)); + } + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -766,8 +793,13 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + const int max_cus = device->max_cus(); + int BX=block_size(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/BX)); + } time_pair.start(); int ngridxy = _ngridx * _ngridy; diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index d00833cae7..2e992a33d9 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -31,6 +31,8 @@ #include "geryon/nvd_texture.h" #endif +//#define ASYNC_DEVICE_COPY + #if !defined(USE_OPENCL) && !defined(USE_HIP) // temporary workaround for int2 also defined in cufft #ifdef int2 @@ -263,6 +265,8 @@ class BaseAmoeba { int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; + int _end_command_queue; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 039970a0d3..89ae503a97 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -214,6 +214,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, } } _first_device = _last_device = best_device; + _max_cus = best_cus; type = gpu->device_type(_first_device); if (ndevices > 0) { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 74f802a096..7def4b7f82 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -241,6 +241,8 @@ class Device { inline int shuffle_avail() const { return _shuffle_avail; } /// For OpenCL, 0 if fast-math options disabled, 1 enabled inline int fast_math() const { return _fast_math; } + /// return the max number of CUs among the devices + inline int max_cus() const { return _max_cus; } /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } @@ -324,7 +326,7 @@ class Device { private: std::queue *> 
ans_queue; - int _init_count; + int _init_count, _max_cus; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 221fe16f3c..d8ef3e9a44 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,9 +619,14 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); + const int max_cus = this->device->max_cus(); + int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + while (GX < max_cus) { + BX /= 2; + GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + } + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index f10535a36a..69125854f9 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -901,14 +901,22 @@ void PairAmoeba::umutual1(double **field, double **fieldp) } } + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -945,7 +953,6 @@ void PairAmoeba::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; MPI_Barrier(world); time0 = MPI_Wtime(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index a164fc4d9c..75c749e61f 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -367,7 +367,7 @@ void PairAmoeba::compute(int eflag, int vflag) time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; - time_fphi_uind = 0.0; + time_grid_uind = time_fphi_uind = 0.0; if (ic_kspace) { ic_kspace->time_fft = 0.0; } @@ -566,6 +566,9 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + MPI_Allreduce(&time_grid_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_grid_uind = ave/comm->nprocs; + MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_fphi_uind = ave/comm->nprocs; @@ -592,15 +595,19 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); - utils::logmesg(lmp," Real-space timing breakdown:\n"); + double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; + double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; + + utils::logmesg(lmp," Real-space timing breakdown: {:.3g}%\n", rspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} 
{:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); - utils::logmesg(lmp," K-space timing breakdown:\n"); + utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - Grid : {:.6g} {:.3g}%\n", time_grid_uind, time_grid_uind/time_total); utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index af40f4a6ad..781d8a1e2f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -80,11 +80,11 @@ class PairAmoeba : public Pair { double time_init, time_hal, time_repulse, time_disp; double time_mpole, time_induce, time_polar, time_qxfer; - double time_mpole_rspace,time_mpole_kspace; - double time_direct_rspace,time_direct_kspace; - double time_mutual_rspace,time_mutual_kspace; - double time_polar_rspace,time_polar_kspace; - double time_fphi_uind; + double time_mpole_rspace, time_mpole_kspace; + double time_direct_rspace, time_direct_kspace; + double time_mutual_rspace, time_mutual_kspace; + double time_polar_rspace, time_polar_kspace; + double time_grid_uind, time_fphi_uind; // energy/virial components diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 49698f1825..6b977cb638 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -930,15 +930,6 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); -/* - for (int i = 0; i < nall; i++) { - for (int j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } -*/ - // get the real space portion of the mutual field first MPI_Barrier(world); @@ -960,19 +951,13 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][1] += term*uind[i][1]; field[i][2] += term*uind[i][2]; } + for (int i = 0; i < nlocal; i++) { fieldp[i][0] += term*uinp[i][0]; fieldp[i][1] += term*uinp[i][1]; fieldp[i][2] += term*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*uind[i][j]; - fieldp[i][j] += term*uinp[i][j]; - } - } -*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -1029,7 +1014,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) } int nlocal = atom->nlocal; - for (int i = 0; i < nlocal; i++) { fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; @@ -1041,22 +1025,23 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } -/* - for (i = 0; i < 
nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; - fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; - } - } -*/ + + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1093,9 +1078,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; - - MPI_Barrier(world); time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); @@ -1114,14 +1096,6 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) } } - // convert the dipole fields from fractional to Cartesian - - for (int i = 0; i < 3; i++) { - a[0][i] = nfft1 * recip[0][i]; - a[1][i] = nfft2 * recip[1][i]; - a[2][i] = nfft3 * recip[2][i]; - } - for (int i = 0; i < nlocal; i++) { double dfx = a[0][0]*fdip_phi1[i][1] + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; @@ -1145,25 +1119,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fieldp[i][1] -= dfy; fieldp[i][2] -= dfz; } -/* - for (int i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + - a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; - dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + - a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; - } - } - // increment the field at each multipole site - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] -= dipfield1[i][j]; - fieldp[i][j] -= dipfield2[i][j]; - } - } -*/ } /* ---------------------------------------------------------------------- From 00f46120c79f841dcecf78d75e7498bf7a3fc708 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 7 Oct 2022 15:50:30 -0500 Subject: [PATCH 129/181] Removed max_cus() from Device, used device->gpu->cus() instead --- lib/gpu/lal_amoeba.cpp | 4 ++-- lib/gpu/lal_base_amoeba.cpp | 8 ++++---- lib/gpu/lal_device.cpp | 1 - lib/gpu/lal_device.h | 4 +--- lib/gpu/lal_hippo.cpp | 4 ++-- 5 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index b61d7595af..1c0aa77706 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,10 +278,10 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = this->device->max_cus(); + const int cus = this->device->gpu->cus(); int BX=this->block_size(); int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 3b2381f211..8e4e8faf83 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -727,10 +727,10 @@ int BaseAmoebaT::fphi_uind() { return 0; // Compute the block size and grid size to keep all cores busy - const int max_cus = device->max_cus(); + const int cus = device->gpu->cus(); int BX=block_size(); int 
GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } @@ -793,10 +793,10 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = device->max_cus(); + const int cus = device->gpu->cus(); int BX=block_size(); int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 89ae503a97..039970a0d3 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -214,7 +214,6 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, } } _first_device = _last_device = best_device; - _max_cus = best_cus; type = gpu->device_type(_first_device); if (ndevices > 0) { diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 7def4b7f82..74f802a096 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -241,8 +241,6 @@ class Device { inline int shuffle_avail() const { return _shuffle_avail; } /// For OpenCL, 0 if fast-math options disabled, 1 enabled inline int fast_math() const { return _fast_math; } - /// return the max number of CUs among the devices - inline int max_cus() const { return _max_cus; } /// Return the number of threads per atom for pair styles inline int threads_per_atom() const { return _threads_per_atom; } @@ -326,7 +324,7 @@ class Device { private: std::queue *> ans_queue; - int _init_count, _max_cus; + int _init_count; bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d8ef3e9a44..f20a0cfd62 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,10 +619,10 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int max_cus = this->device->max_cus(); + const int cus = this->device->gpu->cus(); int BX=this->block_size(); int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < max_cus) { + while (GX < cus) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } From 2f1f7ee0fa49d79a970adad810ec290d509933f4 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 3 Nov 2022 23:45:40 -0500 Subject: [PATCH 130/181] Cleaned up code --- lib/gpu/lal_amoeba.cu | 24 +++++++----------------- src/GPU/pair_amoeba_gpu.cpp | 1 + src/GPU/pair_hippo_gpu.cpp | 20 ++++++++++---------- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index ab750aaadc..cc593e4263 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1639,10 +1639,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const int nxlo_out, const int ngridxy, const int ngridx) { - //int tid, ii, offset, i, n_stride; - //atom_info(t_per_atom,ii,tid,offset); - - int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; @@ -1763,23 +1759,17 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, */ const int i1 = istart + ib; const numtyp4 tha1 = thetai1[i1]; - /* - const numtyp w0 = tha1.x; - const numtyp w1 = tha1.y; - const numtyp w2 = tha1.z; - const numtyp w3 = tha1.w; - */ 
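// Note (editorial): tha1.x, tha1.y, tha1.z, tha1.w hold the four per-point theta
// weights (formerly aliased as w0..w3 in the block removed above).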
const int gidx = my + i; // k*ngridxy + j*ngridx + i; const numtyp2 tq = grid[gidx]; const numtyp tq_1 = tq.x; //grid[gidx]; const numtyp tq_2 = tq.y; //grid[gidx+1]; - t0_1 += tq_1*tha1.x; // w0 - t1_1 += tq_1*tha1.y; // w1 - t2_1 += tq_1*tha1.z; // w2 - t0_2 += tq_2*tha1.x; // w0 - t1_2 += tq_2*tha1.y; // w1 - t2_2 += tq_2*tha1.z; // w2 - t3 += (tq_1+tq_2)*tha1.w; // w3 + t0_1 += tq_1*tha1.x; + t1_1 += tq_1*tha1.y; + t2_1 += tq_1*tha1.z; + t0_2 += tq_2*tha1.x; + t1_2 += tq_2*tha1.y; + t2_2 += tq_2*tha1.z; + t3 += (tq_1+tq_2)*tha1.w; i++; } diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6b977cb638..fa0670a757 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1078,6 +1078,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential + time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index b874c656c3..49a83e75be 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1170,22 +1170,24 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; - fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; - } - } -*/ + + double time0, time1; + // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + + MPI_Barrier(world); + time0 = MPI_Wtime(); + grid_uind(fuind,fuinp,gridpre); + time1 = MPI_Wtime(); + time_grid_uind += (time1 - time0); + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1222,9 +1224,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) double ****gridpost = (double ****) ic_kspace->post_convolution(); // get potential - double time0, time1; - MPI_Barrier(world); time0 = MPI_Wtime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); From a3cc0e8432495d70cb1bb4ea8dc8a51c43841f20 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Fri, 4 Nov 2022 13:45:59 -0500 Subject: [PATCH 131/181] Reverted the block size tuning, which caused bugs for low atom counts (will revisit later) --- lib/gpu/lal_amoeba.cpp | 10 ++++++---- lib/gpu/lal_base_amoeba.cpp | 22 +++++++++++++--------- lib/gpu/lal_hippo.cpp | 12 +++++++----- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 1c0aa77706..38aa2bde27 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -278,14 +278,16 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the 
short neighbor list if not done yet diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 8e4e8faf83..e6ffcd764a 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -727,14 +727,16 @@ int BaseAmoebaT::fphi_uind() { return 0; // Compute the block size and grid size to keep all cores busy - const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -793,14 +795,16 @@ int BaseAmoebaT::fphi_mpole() { int nbor_pitch=nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy + + const int BX=block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/BX)); + /* const int cus = device->gpu->cus(); - int BX=block_size(); - int GX=static_cast(ceil(static_cast(ainum)/BX)); - while (GX < cus) { + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/BX)); } - + */ time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f20a0cfd62..d4366cac85 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -619,14 +619,16 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - const int cus = this->device->gpu->cus(); - int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - while (GX < cus) { + + const int BX=this->block_size(); + const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); + /* + const int cus = this->device->gpu->cus(); + while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); } - + */ this->time_pair.start(); // Build the short neighbor list if not done yet From 959b9c220fabc63f8e87ce45aacb6acb0a14ca7b Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 7 Nov 2022 15:49:37 -0600 Subject: [PATCH 132/181] Cleaned up unused member functions and hd_balancer calls --- lib/gpu/lal_base_amoeba.cpp | 102 ++---------------------------------- lib/gpu/lal_base_amoeba.h | 14 +---- lib/gpu/lal_hippo.cpp | 52 ++++++------------ lib/gpu/lal_hippo_ext.cpp | 78 +++++++++++++-------------- 4 files changed, 59 insertions(+), 187 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e6ffcd764a..a9c76d578e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -270,99 +270,6 @@ inline int BaseAmoebaT::build_nbor_list(const int inum, const int host_inum, return mn; } -// --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials -// for the polar real-space term -// --------------------------------------------------------------------------- -template -void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, - int *ilist, int *numj, int **firstneigh, - const bool eflag_in, 
const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, const double aewald, const double felec, - const double off2_polar, double *host_q, const int nlocal, - double *boxlo, double *prd, void **tep_ptr) { - acc_timers(); - int eflag, vflag; - if (eatom) eflag=2; - else if (eflag_in) eflag=1; - else eflag=0; - if (vatom) vflag=2; - else if (vflag_in) vflag=1; - else vflag=0; - - #ifdef LAL_NO_BLOCK_REDUCE - if (eflag) eflag=2; - if (vflag) vflag=2; - #endif - - set_kernel(eflag,vflag); - - // ------------------- Resize _tep array ------------------------ - - if (nall>_max_tep_size) { - _max_tep_size=static_cast(static_cast(nall)*1.10); - _tep.resize(_max_tep_size*4); - - dev_nspecial15.clear(); - dev_special15.clear(); - dev_special15_t.clear(); - dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); - dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); - } - - *tep_ptr=_tep.host.begin(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - resize_atom(0,nall,success); - zero_timers(); - return; - } - - int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(ago,inum_full,cpu_time); - ans->inum(inum); - host_start=inum; - - if (ago==0) { - reset_nbors(nall, inum, ilist, numj, firstneigh, success); - if (!success) - return; - } - - // packing host arrays into host_extra - - atom->cast_x_data(host_x,host_type); - atom->cast_q_data(host_q); - cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp); - hd_balancer.start_timer(); - atom->add_x_data(host_x,host_type); - atom->add_q_data(); - atom->add_extra_data(); - - device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, - boxlo, prd); - - _off2_polar = off2_polar; - _felec = felec; - const int red_blocks=polar_real(eflag,vflag); - - ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); - device->add_ans_object(ans); - hd_balancer.stop_timer(); - - // copy tep from device to host - - _tep.update_host(_max_tep_size*4,false); -} - // --------------------------------------------------------------------------- // Prepare for multiple kernel calls in a time step: // - reallocate per-atom arrays, if needed @@ -450,6 +357,8 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall dev_short_nbor.resize((2+_max_nbors)*_nmax); } + hd_balancer.stop_timer(); + return nbor->host_jlist.begin()-host_start; } @@ -491,8 +400,6 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host _tep.update_host(_max_tep_size*4,false); @@ -828,7 +735,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, const double aewald, const double felec, const double off2_polar, void **tep_ptr) { - int** firstneigh = nullptr; + // cast necessary data arrays from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); @@ -845,10 +752,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); - hd_balancer.stop_timer(); - // copy tep from device to host - 
_tep.update_host(_max_tep_size*4,false); } diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2e992a33d9..0fb2469d23 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -182,11 +182,12 @@ class BaseAmoeba { const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); - + /// Interpolate the induced potential from the grid virtual void compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); + /// Interpolate the multipolar potential from the grid virtual void compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec); @@ -198,17 +199,6 @@ class BaseAmoeba { const double aewald, const double felec, const double off2_polar, void **tep_ptr); - /// Compute polar real-space with host neighboring (not active for now) - void compute_polar_real_host_nbor(const int f_ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double **host_uind, - double **host_uinp, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, const double aewald, const double felec, - const double off2_polar, double *charge, const int nlocal, double *boxlo, - double *prd, void **tep_ptr); - // copy field and fieldp from device to host after umutual2b virtual void update_fieldp(void **fieldp_ptr) { *fieldp_ptr=_fieldp.host.begin(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index d4366cac85..334d75ac26 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -177,20 +177,20 @@ double HippoT::host_memory_usage() const { // --------------------------------------------------------------------------- template void HippoT::compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, + int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + const double aewald, const double off2_repulse, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -225,16 +225,7 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, _c5 = c5; const int red_blocks=repulsion(this->_eflag,this->_vflag); - // only copy them back if this is the last kernel - // otherwise, commenting out these two lines to leave the answers - // (forces, energies and virial) on the device 
until the last kernel - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -303,8 +294,6 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, // (forces, energies and virial) on the device until the last kernel //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); } // --------------------------------------------------------------------------- @@ -386,15 +375,7 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, this->_aewald = aewald; const int red_blocks=multipole_real(this->_eflag,this->_vflag); - // leave the answers (forces, energies and virial) on the device, - // only copy them back in the last kernel (this one, or polar_real once done) - //this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); - //this->device->add_ans_object(this->ans); - - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } @@ -595,14 +576,11 @@ void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **ho const int red_blocks=polar_real(this->_eflag,this->_vflag); // only copy answers (forces, energies and virial) back from the device - // in the last kernel (which is polar_real here) + // in the last kernel in a timestep (which is polar_real here) this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); - this->hd_balancer.stop_timer(); - // copy tep from device to host - this->_tep.update_host(this->_max_tep_size*4,false); } diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 9644f5aca4..77450bf7b1 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -69,15 +69,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass int init_ok=0; if (world_me==0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->world_barrier(); if (message) @@ -94,15 +94,15 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass } if (gpu_rank==i && world_me!=0) init_ok=HIPPOMF.init(ntypes, max_amtype, max_amclass, - host_pdamp, host_thole, host_dirdamp, - host_amtype2class, host_special_repel, host_special_disp, - host_special_mpole, host_special_polar_wscale, - host_special_polar_piscale, host_special_polar_pscale, - host_sizpr, host_dmppr, host_elepr, - host_csix, host_adisp, host_pcore, host_palpha, - nlocal, nall, max_nbors, - 
maxspecial, maxspecial15, cell_size, gpu_split, - screen, polar_dscale, polar_uscale); + host_pdamp, host_thole, host_dirdamp, + host_amtype2class, host_special_repel, host_special_disp, + host_special_mpole, host_special_polar_wscale, + host_special_polar_piscale, host_special_polar_pscale, + host_sizpr, host_dmppr, host_elepr, + host_csix, host_adisp, host_pcore, host_palpha, + nlocal, nall, max_nbors, + maxspecial, maxspecial15, cell_size, gpu_split, + screen, polar_dscale, polar_uscale); HIPPOMF.device->gpu_barrier(); if (message) @@ -121,16 +121,16 @@ void hippo_gpu_clear() { } int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd) { + double **host_x, int *host_type, int *host_amtype, + int *host_amgroup, double **host_rpole, + double **host_uind, double **host_uinp, double *host_pval, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + int *nspecial15, tagint **special15, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, double *prd) { return HIPPOMF.precompute(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, nullptr, nullptr, nullptr, sublo, subhi, tag, @@ -141,17 +141,17 @@ int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, } void hippo_gpu_compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *host_amtype, int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, int *nspecial15, tagint** special15, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, const double aewald, const double off2, - double *host_q, double *boxlo, double *prd, - double cut2, double c0, double c1, double c2, - double c3, double c4, double c5, void **tep_ptr) { + const int nall, double **host_x, int *host_type, + int *host_amtype, int *host_amgroup, double **host_rpole, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, int *nspecial15, tagint** special15, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, const double aewald, const double off2, + double *host_q, double *boxlo, double *prd, + double cut2, double c0, double c1, double c2, + double c3, double c4, double c5, void **tep_ptr) { HIPPOMF.compute_repulsion(ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag, nspecial, special, nspecial15, special15, From 03e48f26589aebb11752d969bcf25ec750543efc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 14 Jan 2023 19:51:42 -0600 Subject: [PATCH 133/181] Fixed memory leak in hippo/gpu --- src/GPU/pair_hippo_gpu.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 
deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 49a83e75be..915c67e512 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -440,11 +440,8 @@ void PairHippoGPU::induce() // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() - if (use_ewald) { - choose(POLAR_LONG); - int nmine = p_kspace->nfft_owned; - memory->create(qfac,nmine,"ameoba/induce:qfac"); - } else choose(POLAR); + if (use_ewald) choose(POLAR_LONG); + else choose(POLAR); // owned atoms From c21f2faa1f7e4dfa767ecd336a1d3bc3fcb593f2 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sat, 14 Jan 2023 20:02:36 -0600 Subject: [PATCH 134/181] Cleaned up debug statements and unused sections in the amoeba and hippo gpu styles --- src/GPU/pair_amoeba_gpu.cpp | 302 +----------------------------------- src/GPU/pair_hippo_gpu.cpp | 86 +++------- 2 files changed, 22 insertions(+), 366 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index fa0670a757..534ab24085 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -13,7 +13,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Nguyen (Northwestern) + Contributing author: Trung Nguyen (Northwestern/UChicago) ------------------------------------------------------------------------- */ #include "pair_amoeba_gpu.h" @@ -486,8 +486,6 @@ void PairAmoebaGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP GPU"); - // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -547,8 +545,6 @@ void PairAmoebaGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP"); - for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; @@ -1751,166 +1747,6 @@ void PairAmoebaGPU::polar_kspace() } } - // account for dipole response terms in the TCG method - - /* - if (poltyp == TCG) { - - for (m = 0; m < tcgnab; m++) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[0][j]*uad[i][m][0] + a[1][j]*uad[i][m][1] + - a[2][j]*uad[i][m][2]; - fuinp[i][j] = a[0][j]*ubp[i][m][0] + a[1][j]*ubp[i][m][1] + - a[2][j]*ubp[i][m][2]; - } - } - - grid_uind(fuind,fuinp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - term = qfac[k][j][i]; - qgrid[k][j][i][0] *= term; - qgrid[k][j][i][1] *= term; - } - } - } - - efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); - fphi_uind(fphid,fphip,fphidp); - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 10; j++) { - fphid[i][j] *= felec; - fphip[i][j] *= felec; - } - } - - for (i = 0; i < nlocal; i++) { - f1 = 0.0; - f2 = 0.0; - f3 = 0.0; - for (k = 0; k < 3; k++) { - j1 = deriv1[k+1]; - j2 = deriv2[k+1]; - j3 = deriv3[k+1]; - f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; - f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; - f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; - } - - f1 *= 0.5 * nfft1; - f2 *= 0.5 * nfft2; - f3 *= 0.5 * nfft3; - h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; - h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; - h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; - f[i][0] -= h1; - f[i][1] -= h2; - f[i][2] -= h3; - - for (j = 1; j < 4; j++) { - cphid[j] = 0.0; - cphip[j] = 0.0; - for (k = 1; k < 4; k++) { - cphid[j] += 
ftc[j][k]*fphid[i][k]; - cphip[j] += ftc[j][k]*fphip[i][k]; - } - } - - vxx -= 0.5*(cphid[1]*ubp[i][m][0] + cphip[1]*uad[i][m][0]); - vyy -= 0.5*(cphid[2]*ubp[i][m][1] + cphip[2]*uad[i][m][1]); - vzz -= 0.5*(cphid[3]*ubp[i][m][2] + cphip[3]*uad[i][m][2]); - - vxy -= 0.25*(cphid[1]*ubp[i][m][1] + cphip[1]*uad[i][m][1] + - cphid[2]*ubp[i][m][0] + cphip[2]*uad[i][m][0]); - vyz -= 0.25*(cphid[1]*ubp[i][m][2] + cphip[1]*uad[i][m][2] + - cphid[3]*ubp[i][m][0] + cphip[3]*uad[i][m][0]); - vxz -= 0.25*(cphid[2]*ubp[i][m][2] + cphip[2]*uad[i][m][2] + - cphid[3]*ubp[i][m][1] + cphip[3]*uad[i][m][1]); - } - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - fuind[i][j] = a[0][j]*ubd[i][m][0] + a[1][j]*ubd[i][m][1] + - a[2][j]*ubd[i][m][2]; - fuinp[i][j] = a[0][j]*uap[i][m][0] + a[1][j]*uap[i][m][1] + - a[2][j]*uap[i][m][2]; - } - } - - grid_uind(fuind,fuinp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - term = qfac[k][j][i]; - qgrid[k][j][i][0] *= term; - qgrid[k][j][i][1] *= term; - } - } - } - - efft->compute(qgrid[0][0][0],qgrid[0][0][0],-1); - fphi_uind(fphid,fphip,fphidp); - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 10; j++) { - fphid[i][j] *= felec; - fphip[i][j] *= felec; - } - } - - for (i = 0; i < nlocal; i++) { - f1 = 0.0; - f2 = 0.0; - f3 = 0.0; - for (k = 0; k < 3; k++) { - j1 = deriv1[k+1]; - j2 = deriv2[k+1]; - j3 = deriv3[k+1]; - f1 += fuind[i][k]*fphip[i][j1]+fuinp[i][k]*fphid[i][j1]; - f2 += fuind[i][k]*fphip[i][j2]+fuinp[i][k]*fphid[i][j2]; - f3 += fuind[i][k]*fphip[i][j3]+fuinp[i][k]*fphid[i][j3]; - } - - f1 *= 0.5 * nfft1; - f2 *= 0.5 * nfft2; - f3 *= 0.5 * nfft3; - h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; // matvec - h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; - h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; - f[i][0] -= h1; - f[i][1] -= h2; - f[i][2] -= h3; - - for (j = 1; j < 4; j++) { - cphid[j] = 0.0; - cphip[j] = 0.0; - for (k = 1; k < 4; k++) { - cphid[j] += ftc[j][k]*fphid[i][k]; - cphip[j] += ftc[j][k]*fphip[i][k]; - } - } - - vxx -= 0.5*(cphid[1]*uap[i][m][0] + cphip[1]*ubd[i][m][0]); - vyy -= 0.5*(cphid[2]*uap[i][m][1] + cphip[2]*ubd[i][m][1]); - vzz -= 0.5*(cphid[3]*uap[i][m][2] + cphip[3]*ubd[i][m][2]); - vxy -= 0.25*(cphid[1]*uap[i][m][1] + cphip[1]*ubd[i][m][1] + - cphid[2]*uap[i][m][0] + cphip[2]*ubd[i][m][0]); - vxz -= 0.25*(cphid[1]*uap[i][m][2] + cphip[1]*ubd[i][m][2] + - cphid[3]*uap[i][m][0] + cphip[3]*ubd[i][m][0]); - vyz -= 0.25*(cphid[2]*uap[i][m][2] + cphip[2]*ubd[i][m][2] + - cphid[3]*uap[i][m][1] + cphip[3]*ubd[i][m][1]); - } - } - } - */ - // assign permanent and induced multipoles to the PME grid for (i = 0; i < nlocal; i++) { @@ -2097,142 +1933,6 @@ void PairAmoebaGPU::polar_kspace() } } - // add back missing terms for the TCG polarization method; - // first do the term for "UAD" dotted with "UBP" - - /* - if (poltyp == TCG) { - - for (m = 0; m < tcgnab; m++) { - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) - cmp[i][j] = 0.0; - for (j = 1; j < 4; j++) - cmp[i][j] = ubp[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - qgrip[k][j][i][0] = qgrid[k][j][i][0]; - qgrip[k][j][i][1] = qgrid[k][j][i][1]; - } - } - } - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 4; j++) - cmp[i][j] = uad[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); 
- grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - // make the scalar summation over reciprocal lattice - // NOTE: this loop has to be distributed for parallel - // NOTE: why does this one include m = 0 ? - - for (m = 1; m < ntot; m++) { - k1 = m % nfft1; - k2 = (m % nff) / nfft1; - k3 = m/nff; - r1 = (k1 >= nf1) ? k1-nfft1 : k1; - r2 = (k2 >= nf2) ? k2-nfft2 : k2; - r3 = (k3 >= nf3) ? k3-nfft3 : k3; - h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; - h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; - h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; - hsq = h1*h1 + h2*h2 + h3*h3; - term = -pterm * hsq; - expterm = 0.0; - if (term > -50.0 && hsq != 0.0) { - denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; - expterm = exp(term) / denom; - struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + - qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; - eterm = 0.5 * felec * expterm * struc2; - vterm = (2.0/hsq) * (1.0-term) * eterm; - virpolar[0] -= h1*h1*vterm - eterm; - virpolar[1] -= h2*h2*vterm - eterm; - virpolar[2] -= h3*h3*vterm - eterm; - virpolar[3] -= h1*h2*vterm; - virpolar[4] -= h1*h3*vterm; - virpolar[5] -= h2*h3*vterm; - } - } - - // now do the TCG terms with "UBD" dotted with "UAP" - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 10; j++) - cmp[i][j] = 0.0; - for (j = 1; j < 4; j++) - cmp[i][j] = uap[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - for (k = 0; k < nfft3; k++) { - for (j = 0; j < nfft2; j++) { - for (i = 0; i < nfft1; i++) { - qgrip[k][j][i][0] = qgrid[k][j][i][0]; - qgrip[k][j][i][1] = qgrid[k][j][i][1]; - } - } - } - - for (i = 0; i < nlocal; i++) { - for (j = 1; j < 4; j++) - cmp[i][j] = ubd[i][m][j-1]; - } - - cmp_to_fmp(cmp,fmp); - grid_mpole(fmp); - efft->compute(qgrid[0][0][0],qgrid[0][0][0],1); - - // make the scalar summation over reciprocal lattice - // NOTE: this loop has to be distributed for parallel - // NOTE: why does this one include m = 0 ? - - for (m = 1; m < ntot; m++) { - k1 = m % nfft1; - k2 = (m % nff) / nfft1; - k3 = m/nff; - r1 = (k1 >= nf1) ? k1-nfft1 : k1; - r2 = (k2 >= nf2) ? k2-nfft2 : k2; - r3 = (k3 >= nf3) ? 
k3-nfft3 : k3; - h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; - h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; - h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; - hsq = h1*h1 + h2*h2 + h3*h3; - term = -pterm * hsq; - expterm = 0.0; - if (term > -50.0 && hsq != 0.0) { - denom = volterm*hsq*bsmod1[k1]*bsmod2[k2]*bsmod3[k3]; - expterm = exp(term) / denom; - struc2 = qgrid[k3][k2][k1][0]*qgrip[k3][k2][k1][0] + - qgrid[k3][k2][k1][1]*qgrip[k3][k2][k1][1]; - eterm = 0.5 * felec * expterm * struc2; - vterm = (2.0/hsq) * (1.0-term) * eterm; - virpolar[0] -= h1*h1*vterm - eterm; - virpolar[1] -= h2*h2*vterm - eterm; - virpolar[2] -= h3*h3*vterm - eterm; - virpolar[3] -= h1*h2*vterm; - virpolar[4] -= h1*h3*vterm; - virpolar[5] -= h2*h3*vterm; - } - } - } - } - */ - // increment the total internal virial tensor components if (vflag_global) { diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 915c67e512..61c30c0ad1 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -13,7 +13,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Trung Nguyen (Northwestern) + Contributing author: Trung Nguyen (Northwestern/UChicago) ------------------------------------------------------------------------- */ #include "pair_hippo_gpu.h" @@ -208,14 +208,14 @@ void PairHippoGPU::init_style() int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, - pdamp, thole, dirdamp, amtype2class, - special_repel, special_disp, special_mpole, - special_polar_wscale, special_polar_piscale, - special_polar_pscale, sizpr, dmppr, elepr, - csix, adisp, pcore, palpha, - atom->nlocal, atom->nlocal+atom->nghost, mnf, - maxspecial, maxspecial15, cell_size, gpu_mode, - screen, polar_dscale, polar_uscale, tq_size); + pdamp, thole, dirdamp, amtype2class, + special_repel, special_disp, special_mpole, + special_polar_wscale, special_polar_piscale, + special_polar_pscale, sizpr, dmppr, elepr, + csix, adisp, pcore, palpha, + atom->nlocal, atom->nlocal+atom->nghost, mnf, + maxspecial, maxspecial15, cell_size, gpu_mode, + screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) @@ -271,14 +271,14 @@ void PairHippoGPU::repulsion() inum = atom->nlocal; firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); // select the correct cutoff for the term @@ -480,14 +480,6 @@ void PairHippoGPU::induce() } } } -/* - printf("GPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 10; i++) { - printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n", - i, udir[i][0], udir[i][1], udir[i][2], - udirp[i][0], udirp[i][1], udirp[i][2]); - } -*/ // allocate memory and make early host-device transfers // must be done before the first ufield0c @@ -611,8 +603,6 
@@ void PairHippoGPU::induce() comm->reverse_comm(this); } - //error->all(FLERR,"STOP GPU"); - // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { @@ -1022,7 +1012,7 @@ void PairHippoGPU::udirect2b_cpu() tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; } else { - if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); + } } // jj @@ -1055,16 +1045,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); - -/* - for (int i = 0; i < nall; i++) { - for (int j = 0; j < 3; j++) { - field[i][j] = 0.0; - fieldp[i][j] = 0.0; - } - } -*/ - + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -1086,19 +1067,13 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][1] += term*uind[i][1]; field[i][2] += term*uind[i][2]; } + for (int i = 0; i < nlocal; i++) { fieldp[i][0] += term*uinp[i][0]; fieldp[i][1] += term*uinp[i][1]; fieldp[i][2] += term*uinp[i][2]; } -/* - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] += term*uind[i][j]; - fieldp[i][j] += term*uinp[i][j]; - } - } -*/ + // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) @@ -1271,25 +1246,6 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fieldp[i][1] -= dfy; fieldp[i][2] -= dfz; } -/* - for (int i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] + - a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3]; - dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] + - a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3]; - } - } - - // increment the field at each multipole site - - for (i = 0; i < nlocal; i++) { - for (j = 0; j < 3; j++) { - field[i][j] -= dipfield1[i][j]; - fieldp[i][j] -= dipfield2[i][j]; - } - } -*/ } /* ---------------------------------------------------------------------- From 67574601ed8bfadb5e4a4139ae52b89399e080b7 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 15:41:54 -0600 Subject: [PATCH 135/181] Cleaned up commented-out and debugging stuffs, removed irrelevant changes to lj/cut/dipole/cut, reverted unwanted changes in the PPPMGPU destructor, fixed unresolved conflicts in tinker.py, updated the userbinsize==0 case in atom.cpp and using Force::pair_match() as suggested. Internal timing stuffs need work. 
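
A note on the Force::pair_match() change referenced above: the hunks for fix_amoeba_bitorsion.cpp and improper_amoeba.cpp below replace four exact-name lookups with two anchored prefix matches, so the /gpu variants are found by the same call. A minimal sketch of the resulting lookup, assuming pair_match(pattern,exact,nsub) treats a leading ^ as a regex-style anchor when exact=0 (the error text shown is the one used by the bitorsion fix):

    // locate either the plain or the GPU-accelerated pair style with one pattern each
    Pair *pair = force->pair_match("^amoeba", 0, 0);      // matches "amoeba" and "amoeba/gpu"
    if (!pair) pair = force->pair_match("^hippo", 0, 0);  // matches "hippo" and "hippo/gpu"
    if (!pair)
      error->all(FLERR, "Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo");

This way, future accelerated suffix variants of the two pair styles would be picked up without further edits to the fix and improper classes.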
--- cmake/CMakeLists.txt | 1 - examples/amoeba/in.ubiquitin | 2 -- src/AMOEBA/amoeba_induce.cpp | 40 ++---------------------- src/AMOEBA/amoeba_kspace.cpp | 8 ----- src/AMOEBA/fix_amoeba_bitorsion.cpp | 6 ++-- src/AMOEBA/improper_amoeba.cpp | 7 ++--- src/AMOEBA/pair_amoeba.cpp | 7 ----- src/DIPOLE/pair_lj_cut_dipole_cut.cpp | 19 ++---------- src/Depend.sh | 4 +++ src/GPU/pppm_gpu.cpp | 2 ++ src/atom.cpp | 11 +++---- tools/tinker/tinker2lmp.py | 44 --------------------------- 12 files changed, 19 insertions(+), 132 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index d7137c3672..0223750ace 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -395,7 +395,6 @@ endif() pkg_depends(ML-IAP ML-SNAP) pkg_depends(MPIIO MPI) pkg_depends(ATC MANYBODY) -pkg_depends(AMOEBA KSPACE) pkg_depends(LATBOLTZ MPI) pkg_depends(SCAFACOS MPI) pkg_depends(AMOEBA KSPACE) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index cacb7d3571..cb789a19f8 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -34,8 +34,6 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes -# setup force components this way so can dump them (AMOEBA or HIPPO also needs them for now) - fix fhal all store/state 0 fx fy fz fix frepulse all store/state 0 fx fy fz fix fdisp all store/state 0 fx fy fz diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 17c4df326d..031173060c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -86,17 +86,6 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm(this); - // DEBUG statements - - /* - for (i = 0; i < nlocal; i++) - if (atom->tag[i] == 1) - printf("AAA FIELD atom %d: field %g %g %g: fieldp %g %g %g\n", - atom->tag[i], - field[i][0],field[i][1],field[i][2], - fieldp[i][0],fieldp[i][1],fieldp[i][2]); - */ - // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { @@ -213,16 +202,7 @@ void PairAmoeba::induce() cfstyle = INDUCE; comm->forward_comm(this); -/* - if (comm->me == 0) { - printf("CPU: cutghost = %f\n", comm->cutghost[0]); - for (i = 0; i < 20; i++) { - printf("i = %d: uind = %f %f %f; udirp = %f %f %f\n", - i, uind[i][0], uind[i][1], uind[i][2], - uinp[i][0], uinp[i][1], uinp[i][2]); - } - } -*/ + ufield0c(field,fieldp); crstyle = FIELD; @@ -284,18 +264,6 @@ void PairAmoeba::induce() crstyle = FIELD; comm->reverse_comm(this); - - //error->all(FLERR,"STOP"); -/* - if (comm->me == 0) { - printf("CPU: iter = %d\n", iter); - for (i = 0; i < 10; i++) { - printf("i = %d: field = %f %f %f; fieldp = %f %f %f\n", - i, field[i][0], field[i][1], field[i][2], - fieldp[i][0], fieldp[i][1], fieldp[i][2]); - } - } -*/ for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -413,8 +381,6 @@ void PairAmoeba::induce() } } - // if (comm->me == 0) printf("CG iteration count = %d\n",iter); - // terminate the calculation if dipoles failed to converge // NOTE: could make this an error @@ -1033,9 +999,7 @@ void PairAmoeba::umutual2b(double **field, double **fieldp) j = jlist[jj]; uindj = uind[j]; uinpj = uinp[j]; - //if (i==0 && j == 10) - // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", - // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); + fid[0] = tdipdip[0]*uindj[0] + tdipdip[1]*uindj[1] + tdipdip[2]*uindj[2]; fid[1] = tdipdip[1]*uindj[0] + tdipdip[3]*uindj[1] + tdipdip[4]*uindj[2]; fid[2] = tdipdip[2]*uindj[0] + tdipdip[4]*uindj[1] + 
tdipdip[5]*uindj[2]; diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index 76d13da780..9213b96042 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -68,8 +68,6 @@ void PairAmoeba::moduli() int maxfft = MAX(nfft1,nfft2); maxfft = MAX(maxfft,nfft3); - //double *array = new double[bsorder]; - //double *bsarray = new double[maxfft]; if (maxfft > _nfft_max) { memory->destroy(_moduli_bsarray); _nfft_max = maxfft; @@ -79,7 +77,6 @@ void PairAmoeba::moduli() // compute and load the moduli values double x = 0.0; - //bspline(x,bsorder,array); bspline(x,bsorder,_moduli_array); for (i = 0; i < maxfft; i++) _moduli_bsarray[i] = 0.0; @@ -88,11 +85,6 @@ void PairAmoeba::moduli() dftmod(bsmod1,_moduli_bsarray,nfft1,bsorder); dftmod(bsmod2,_moduli_bsarray,nfft2,bsorder); dftmod(bsmod3,_moduli_bsarray,nfft3,bsorder); - - // perform deallocation of local arrays - - //delete[] array; - //delete[] bsarray; } /* ---------------------------------------------------------------------- diff --git a/src/AMOEBA/fix_amoeba_bitorsion.cpp b/src/AMOEBA/fix_amoeba_bitorsion.cpp index 6c3c31eec8..cb8c62819d 100644 --- a/src/AMOEBA/fix_amoeba_bitorsion.cpp +++ b/src/AMOEBA/fix_amoeba_bitorsion.cpp @@ -194,10 +194,8 @@ void FixAmoebaBiTorsion::init() // error check that PairAmoeba or PairHiippo exist pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("amoeba/gpu",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); - if (!pair) pair = force->pair_match("hippo/gpu",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); if (!pair) error->all(FLERR,"Cannot use fix amoeba/bitorsion w/out pair amoeba/hippo"); diff --git a/src/AMOEBA/improper_amoeba.cpp b/src/AMOEBA/improper_amoeba.cpp index 136857e74b..cb9db01b59 100644 --- a/src/AMOEBA/improper_amoeba.cpp +++ b/src/AMOEBA/improper_amoeba.cpp @@ -285,10 +285,9 @@ void ImproperAmoeba::init_style() // check if PairAmoeba disabled improper terms Pair *pair = nullptr; - pair = force->pair_match("amoeba",1,0); - if (!pair) pair = force->pair_match("amoeba/gpu",1,0); - if (!pair) pair = force->pair_match("hippo",1,0); - if (!pair) pair = force->pair_match("hippo/gpu",1,0); + pair = force->pair_match("^amoeba",0,0); + if (!pair) pair = force->pair_match("^hippo",0,0); + if (!pair) error->all(FLERR,"Improper amoeba could not find pair amoeba/hippo"); int tmp; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index bb06ecb4a4..2a1a10075c 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -1055,13 +1055,6 @@ void PairAmoeba::init_style() // request standard neighbor list - -// int irequest = neighbor->request(this,instance_me); - - // for DEBUGGING with GPU - //neighbor->requests[irequest]->half = 0; - //neighbor->requests[irequest]->full = 1; - neighbor->add_request(this); } diff --git a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp index 2047eb8b9c..a7e5674a88 100644 --- a/src/DIPOLE/pair_lj_cut_dipole_cut.cpp +++ b/src/DIPOLE/pair_lj_cut_dipole_cut.cpp @@ -90,8 +90,6 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) numneigh = list->numneigh; firstneigh = list->firstneigh; - int maxsize = 10; - // loop over neighbors of my atoms for (ii = 0; ii < inum; ii++) { @@ -104,13 +102,6 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) jlist = firstneigh[i]; jnum = numneigh[i]; - double scale_dipole = 1.0; - if (jnum > maxsize) { - scale_dipole = maxsize; 
//1.0/(double)maxsize; - } else { - scale_dipole = jnum; //1.0/(double)jnum; - } - for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; factor_lj = special_lj[sbmask(j)]; @@ -216,7 +207,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) // total force - fq = scale_dipole*factor_coul*qqrd2e; + fq = factor_coul*qqrd2e; fx = fq*forcecoulx + delx*forcelj; fy = fq*forcecouly + dely*forcelj; fz = fq*forcecoulz + delz*forcelj; @@ -230,7 +221,7 @@ void PairLJCutDipoleCut::compute(int eflag, int vflag) torque[i][1] += fq*tiycoul; torque[i][2] += fq*tizcoul; - if (newton_pair) { + if (newton_pair || j < nlocal) { f[j][0] -= fx; f[j][1] -= fy; f[j][2] -= fz; @@ -371,13 +362,7 @@ void PairLJCutDipoleCut::init_style() if (!atom->q_flag || !atom->mu_flag || !atom->torque_flag) error->all(FLERR,"Pair dipole/cut requires atom attributes q, mu, torque"); -<<<<<<< HEAD - int irequest = neighbor->request(this,instance_me); - neighbor->requests[irequest]->half = 0; - neighbor->requests[irequest]->full = 1; -======= neighbor->add_request(this); ->>>>>>> amoeba } /* ---------------------------------------------------------------------- diff --git a/src/Depend.sh b/src/Depend.sh index 6cf613cde7..28ac78d9af 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -45,6 +45,10 @@ depend () { # add one if statement per parent package # add one depend() call per child package that depends on that parent +if (test $1 = "AMOEBA") then + depend GPU +fi + if (test $1 = "ASPHERE") then depend GPU depend OPENMP diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 4019eb467d..a2a2b0eed8 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -102,6 +102,8 @@ PPPMGPU::PPPMGPU(LAMMPS *lmp) : PPPM(lmp) PPPMGPU::~PPPMGPU() { PPPM_GPU_API(clear)(poisson_time); + destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out); + destroy_3d_offset(vd_brick,nzlo_out,nylo_out); } /* ---------------------------------------------------------------------- diff --git a/src/atom.cpp b/src/atom.cpp index 8b78b4f8f7..0de44e50ca 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -2358,16 +2358,13 @@ void Atom::setup_sort_bins() } #ifdef LMP_GPU - if (userbinsize == 0.0) { - int ifix = modify->find_fix("package_gpu"); - if (ifix >= 0) { + if (userbinsize == 0.0) { + auto ifix = dynamic_cast(modify->get_fix_by_id("package_gpu")); + if (ifix) { const double subx = domain->subhi[0] - domain->sublo[0]; const double suby = domain->subhi[1] - domain->sublo[1]; const double subz = domain->subhi[2] - domain->sublo[2]; - - FixGPU *fix = static_cast(modify->fix[ifix]); - binsize = fix->binsize(subx, suby, subz, atom->nlocal, - 0.5 * neighbor->cutneighmax); + binsize = ifix->binsize(subx, suby, subz, atom->nlocal, 0.5 * neighbor->cutneighmax); } } #endif diff --git a/tools/tinker/tinker2lmp.py b/tools/tinker/tinker2lmp.py index d376593ea3..e3ae59748c 100644 --- a/tools/tinker/tinker2lmp.py +++ b/tools/tinker/tinker2lmp.py @@ -227,11 +227,7 @@ class XYZfile(object): print(i+1,label[i],x[i],y[i],z[i],type[i], end=' ', file=fp) for j in bonds[i]: print(j, end=' ', file=fp) print(file=fp) -<<<<<<< HEAD - -======= ->>>>>>> develop fp.close() # triplet of atoms in an angle = atom 1,2,3 @@ -1098,16 +1094,6 @@ for i,one in enumerate(alist): elif len(params[3]) == 2: nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) -<<<<<<< HEAD - - if nbonds != 3: - print("Center angle atom has wrong bond count") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param 
options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - #sys.exit() NOTE: allow this for now -======= #if nbonds != 3: #print("Center angle atom has wrong bond count") @@ -1117,33 +1103,12 @@ for i,one in enumerate(alist): #print(" Nbonds and hydrogen count:",nbonds,hcount) # NOTE: allow this for now #sys.exit() ->>>>>>> develop if hcount == 0: which = 1 elif hcount == 1: which = 2 m += 1 -<<<<<<< HEAD - print("3-bond angle") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - print(" which:",which,m) - - elif len(params[3]) == 3: - nbonds,hcount = xyz.angle_hbond_count(atom1,atom2,atom3,lmptype,lmpmass) - - if nbonds != 4: - print("Center angle atom has wrong bond count") - print(" angle atom IDs:",atom1,atom2,atom3) - print(" angle atom classes:",c1,c2,c3) - print(" Tinker FF file param options:",len(params[3])) - print(" Nbonds and hydrogen count:",nbonds,hcount) - #sys.exit() NOTE: allow this for now - -======= #print("3-bond angle") #print(" angle atom IDs:",atom1,atom2,atom3) #print(" angle atom classes:",c1,c2,c3) @@ -1163,7 +1128,6 @@ for i,one in enumerate(alist): # NOTE: allow this for now #sys.exit() ->>>>>>> develop if hcount == 0: which = 1 elif hcount == 1: which = 2 @@ -1207,12 +1171,8 @@ for itype in range(len(aparams)): elif (c3,c2,c1) in badict: n1,n2,r1,r2 = badict[(c3,c2,c1)] else: -<<<<<<< HEAD - print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) -======= # NOTE: just for debugging #print("Bond-stretch angle triplet not found: %d %d %d" % (c1,c2,c3)) ->>>>>>> develop n1,n2,r1,r2 = 4*[0.0] baparams.append((n1,n2,r1,r2)) @@ -1670,11 +1630,7 @@ print("Natoms =",natoms) print("Ntypes =",ntypes) print("Tinker XYZ types =",len(tink2lmp)) print("Tinker PRM types =",prm.ntypes) -<<<<<<< HEAD -#print "Tinker groups =",ngroups -======= #print("Tinker groups =",ngroups) ->>>>>>> develop print("Nmol =",nmol) print("Nbonds =",nbonds) print("Nangles =",nangles) From d5b878d04726164381178fe6d9e2ebdee10c8d07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 15:56:40 -0600 Subject: [PATCH 136/181] Updated the doc page of amoeba/hippo styles to indicate that their gpu versions are supported --- doc/src/pair_amoeba.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index f5c0ea14df..94c956a585 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -1,11 +1,18 @@ .. index:: pair_style amoeba +.. index:: pair_style amoeba/gpu .. index:: pair_style hippo +.. index:: pair_style hippo/gpu pair_style amoeba command ========================= +Accelerator Variants: *amoeba/gpu* + pair_style hippo command ======================== + +Accelerator Variants: *hippo/gpu* + Syntax """""" @@ -187,6 +194,10 @@ These pair styles can only be used via the *pair* keyword of the ---------- +.. 
include:: accel_styles.rst + +---------- + Restrictions """""""""""" From c9ae41246d45cc29ba3e68f715809cc6b3028617 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 16:05:36 -0600 Subject: [PATCH 137/181] Ran the four make commands in the src folder: make fix-whitespace; make fix-homepage; make fix-errordocs; make fix-permissions --- lib/gpu/lal_atom.h | 2 +- lib/gpu/lal_base_amoeba.cpp | 18 +++++++++--------- lib/gpu/lal_base_amoeba.h | 4 ++-- lib/gpu/lal_hippo.cpp | 4 ++-- lib/gpu/lal_hippo.h | 2 +- lib/gpu/lal_hippo_extra.h | 2 +- lib/gpu/lal_neighbor.cpp | 4 ++-- lib/gpu/lal_neighbor.h | 2 +- src/AMOEBA/amoeba_kspace.cpp | 2 +- src/AMOEBA/amoeba_multipole.cpp | 4 ++-- src/AMOEBA/pair_amoeba.cpp | 6 +++--- src/AMOEBA/pair_amoeba.h | 4 ++-- src/GPU/amoeba_convolution_gpu.cpp | 2 +- src/GPU/amoeba_convolution_gpu.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 28 ++++++++++++++-------------- src/GPU/pair_amoeba_gpu.h | 15 +-------------- src/GPU/pair_hippo_gpu.cpp | 16 ++++++++-------- src/GPU/pair_hippo_gpu.h | 15 +-------------- 18 files changed, 53 insertions(+), 79 deletions(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index bec1ad38cc..142d64ef1d 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -285,7 +285,7 @@ class Atom { /// Signal that we need to transfer atom data for next timestep inline void data_unavail() { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _extra_avail=false; _resized=false; } - + /// Signal that we need to transfer atom extra data for next kernel call inline void extra_data_unavail() { _extra_avail=false; } diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a9c76d578e..a1d4a00c2c 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -42,7 +42,7 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); - + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created) cufftDestroy(plan); #endif @@ -365,7 +365,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // --------------------------------------------------------------------------- // Compute multipole real-space part // precompute() should be already invoked before mem (re)allocation -// this is the first part in a time step done on the GPU for AMOEBA for now +// this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, @@ -418,7 +418,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); atom->add_extra_data(); - + *fieldp_ptr=_fieldp.host.begin(); // specify the correct cutoff and alpha values @@ -443,7 +443,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double // only copy the necessary data arrays that are updated over the iterations // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); - atom->add_extra_data(); + atom->add_extra_data(); // set the correct cutoff and alpha _off2_polar = off2_polar; @@ -648,7 +648,7 @@ int BaseAmoebaT::fphi_uind() { int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); k_fphi_uind.run(&_thetai1, &_thetai2, &_thetai3, &_igrid, &_cgrid_brick, - &_fdip_phi1, 
&_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, + &_fdip_phi1, &_fdip_phi2, &_fdip_sum_phi, &_bsorder, &ainum, &_nzlo_out, &_nylo_out, &_nxlo_out, &ngridxy, &_ngridx); time_pair.stop(); @@ -738,7 +738,7 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup, // cast necessary data arrays from host to device cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp, host_pval); - atom->add_extra_data(); + atom->add_extra_data(); *tep_ptr=_tep.host.begin(); @@ -784,7 +784,7 @@ template void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) - #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) + #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) if (fft_plan_created == false) { int m = numel/2; cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); @@ -793,7 +793,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int // n = number of double complex int n = numel/2; - + // copy the host array to the device (data) UCL_Vector data; data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); @@ -807,7 +807,7 @@ void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int data.update_device(false); // perform the in-place forward FFT - + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 0fb2469d23..a20c3886d5 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -256,7 +256,7 @@ class BaseAmoeba { int _ngridx, _ngridy, _ngridz, _num_grid_points; int _end_command_queue; - + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -312,7 +312,7 @@ class BaseAmoeba { virtual int fphi_uind(); virtual int fphi_mpole(); virtual int polar_real(const int eflag, const int vflag) = 0; - + #if !defined(USE_OPENCL) && !defined(USE_HIP) cufftHandle plan; diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 334d75ac26..f8ab436ad0 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -597,11 +597,11 @@ int HippoT::polar_real(const int eflag, const int vflag) { int nbor_pitch=this->nbor->nbor_pitch(); // Compute the block size and grid size to keep all cores busy - + const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); /* - const int cus = this->device->gpu->cus(); + const int cus = this->device->gpu->cus(); while (GX < cus && GX > 1) { BX /= 2; GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h index 671c9964ff..4780ab8ea9 100644 --- a/lib/gpu/lal_hippo.h +++ b/lib/gpu/lal_hippo.h @@ -124,7 +124,7 @@ class Hippo : public BaseAmoeba { UCL_D_Vec coeff_amtype; /// csix = coeff_amclass.x; adisp = coeff_amclass.y; UCL_D_Vec coeff_amclass; - /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; + /// sizpr = coeff_rep.x; dmppr = coeff_rep.y; elepr = coeff_rep.z; UCL_D_Vec coeff_rep; /// Special polar values [0-4]: /// sp_polar.x = special_polar_wscale diff --git a/lib/gpu/lal_hippo_extra.h b/lib/gpu/lal_hippo_extra.h index ac02e2e9e8..7ff62aa9a4 100644 --- a/lib/gpu/lal_hippo_extra.h +++ b/lib/gpu/lal_hippo_extra.h @@ -173,7 +173,7 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1, 
dmpik[4] = pre * (s*d2s + ds*ds); dmpik[6] = pre * (s*d3s + (numtyp)3.0*ds*d2s); dmpik[8] = pre * (s*d4s + (numtyp)4.0*ds*d3s + (numtyp)3.0*d2s*d2s); - + if (rorder >= 11) dmpik[10] = pre * (s*d5s + (numtyp)5.0*ds*d4s + (numtyp)10.0*d2s*d3s); } diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp index 983cea307a..10816e2fa6 100644 --- a/lib/gpu/lal_neighbor.cpp +++ b/lib/gpu/lal_neighbor.cpp @@ -684,7 +684,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, if (_cutoff < _cell_size) vadjust*=1.46; mn=std::max(mn,static_cast(ceil(_max_neighbor_factor*vadjust*mn))); if (mn<33) mn+=3; - + resize_max_neighbors(mn,success); set_nbor_block_size(mn/2); if (!success) @@ -837,7 +837,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum, time_nbor.stop(); } -void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, +void Neighbor::transpose(UCL_D_Vec &out, const UCL_D_Vec &in, const int columns_in, const int rows_in) { const int b2x=_block_cell_2d; diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 9ea02b0b40..9061ce5150 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -260,7 +260,7 @@ class Neighbor { } /// Helper function - void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, + void transpose(UCL_D_Vec &out, const UCL_D_Vec &in, const int columns_in, const int rows_in); private: diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index 9213b96042..c47e734c5e 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -73,7 +73,7 @@ void PairAmoeba::moduli() _nfft_max = maxfft; memory->create(_moduli_bsarray,_nfft_max,"amoeba:_moduli_bsarray"); } - + // compute and load the moduli values double x = 0.0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 3b5dbbed51..7269128080 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -419,7 +419,7 @@ void PairAmoeba::multipole_real() term2i*rr3i + term2k*rr3k + term2ik*rr3ik + term3i*rr5i + term3k*rr5k + term3ik*rr5ik; - + // find damped multipole intermediates for force and torque @@ -465,7 +465,7 @@ void PairAmoeba::multipole_real() term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9); term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9); term6 = 4.0 * rr7; - + } empole += e; diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 2a1a10075c..df9472e188 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -570,7 +570,7 @@ void PairAmoeba::finish() double time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; - + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -597,7 +597,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_rspace, time_mpole_rspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_rspace, time_direct_rspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_rspace, time_mutual_rspace/time_total); - utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); + utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_rspace, time_polar_rspace/time_total); utils::logmesg(lmp," K-space timing breakdown : {:.3g}%\n", kspace_time/time_total); utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", 
time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); @@ -606,7 +606,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); - + } } diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index 1bb3212df8..f14be4bd11 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -347,8 +347,8 @@ class PairAmoeba : public Pair { class AmoebaConvolution *m_kspace; // multipole KSpace class AmoebaConvolution *p_kspace; // polar KSpace - class AmoebaConvolution *pc_kspace; - class AmoebaConvolution *d_kspace; // dispersion KSpace + class AmoebaConvolution *pc_kspace; + class AmoebaConvolution *d_kspace; // dispersion KSpace class AmoebaConvolution *i_kspace; // induce KSpace class AmoebaConvolution *ic_kspace; diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index 0284791d38..fd4aece6c8 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -1,7 +1,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/amoeba_convolution_gpu.h b/src/GPU/amoeba_convolution_gpu.h index c446995b4a..4286f2155f 100644 --- a/src/GPU/amoeba_convolution_gpu.h +++ b/src/GPU/amoeba_convolution_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/ Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 534ab24085..713015b5c5 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -2,7 +2,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -283,7 +283,7 @@ void PairAmoebaGPU::multipole_real() success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); - + // reference to the tep array from GPU lib @@ -400,7 +400,7 @@ void PairAmoebaGPU::induce() } for (i = 0; i < nlocal; i++) { - itype = amtype[i]; + itype = amtype[i]; for (j = 0; j < 3; j++) { uopt[i][m][j] = polarity[itype] * field[i][j]; uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; @@ -666,7 +666,7 @@ void PairAmoebaGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"AMOEBA induced dipoles did not converge"); + error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } // update the lists of previous induced dipole values @@ -958,7 +958,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) amoeba_gpu_update_fieldp(&fieldp_pinned); - + int inum = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1015,8 +1015,8 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; } - - for (int i = 0; i < nlocal; i++) { + + for (int i = 0; i < nlocal; i++) { fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; @@ -1037,7 +1037,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) time1 = MPI_Wtime(); time_grid_uind += (time1 - time0); - + // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition @@ -1137,7 +1137,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned); - + int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { @@ -1356,7 +1356,7 @@ void PairAmoebaGPU::polar_kspace() bspline_fill(); // allocate memory and make early host-device transfers - + // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, @@ -1365,7 +1365,7 @@ void PairAmoebaGPU::polar_kspace() p_kspace->nylo_out, p_kspace->nyhi_out, p_kspace->nxlo_out, p_kspace->nxhi_out); } - + // convert Cartesian multipoles to fractional coordinates @@ -1435,7 +1435,7 @@ void PairAmoebaGPU::polar_kspace() double ***gridpost = (double ***) p_kspace->post_convolution(); // get potential - + if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); @@ -1447,7 +1447,7 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - + double *_fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; @@ -1457,7 +1457,7 @@ void PairAmoebaGPU::polar_kspace() } } - } + } // convert field from fractional to Cartesian diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index 420874df21..b7230594c5 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -71,16 +71,3 @@ class PairAmoebaGPU : public PairAmoeba { } // namespace LAMMPS_NS #endif #endif - -/* ERROR/WARNING messages: - -E: Insufficient memory on accelerator - -There is insufficient memory on one of the devices specified for the gpu -package - -E: Pair style amoeba/gpu requires atom attribute q - -The atom style defined does not have this attribute. - -*/ diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 61c30c0ad1..bf3e113ea7 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -2,7 +2,7 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -517,7 +517,7 @@ void PairHippoGPU::induce() } for (i = 0; i < nlocal; i++) { - itype = amtype[i]; + itype = amtype[i]; for (j = 0; j < 3; j++) { uopt[i][m][j] = polarity[itype] * field[i][j]; uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; @@ -785,7 +785,7 @@ void PairHippoGPU::induce() if (iter >= maxiter || eps > epsold) if (comm->me == 0) - error->warning(FLERR,"HIPPO induced dipoles did not converge"); + error->warning(FLERR,"HIPPO induced dipoles did not converge"); } // update the lists of previous induced dipole values @@ -1045,7 +1045,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); - + // get the real space portion of the mutual field first MPI_Barrier(world); @@ -1078,7 +1078,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); - + int inum = atom->nlocal; double *field_ptr = (double *)fieldp_pinned; @@ -1136,8 +1136,8 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; } - - for (int i = 0; i < nlocal; i++) { + + for (int i = 0; i < nlocal; i++) { fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; @@ -1266,7 +1266,7 @@ void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, void* fdip_sum_phi_pinned = nullptr; hippo_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned); - + int nlocal = atom->nlocal; double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index b1b908411d..44bebd29f3 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -1,7 +1,7 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
https://www.lammps.org/, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov + LAMMPS Development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -71,16 +71,3 @@ class PairHippoGPU : public PairAmoeba { } // namespace LAMMPS_NS #endif #endif - -/* ERROR/WARNING messages: - -E: Insufficient memory on accelerator - -There is insufficient memory on one of the devices specified for the gpu -package - -E: Pair style hippo/gpu requires atom attribute q - -The atom style defined does not have this attribute. - -*/ From 88e1ce33799ba875f8f94506c2ecd8e5fbf64ddd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 17:42:16 -0500 Subject: [PATCH 138/181] flag GPU acceleration --- doc/src/Commands_pair.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 59501b4a56..d9bbe590ef 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -39,7 +39,7 @@ OPT. * :doc:`agni (o) ` * :doc:`airebo (io) ` * :doc:`airebo/morse (io) ` - * :doc:`amoeba ` + * :doc:`amoeba (g) ` * :doc:`atm ` * :doc:`awpmd/cut ` * :doc:`beck (go) ` @@ -126,7 +126,7 @@ OPT. * :doc:`hbond/dreiding/lj (o) ` * :doc:`hbond/dreiding/morse (o) ` * :doc:`hdnnp ` - * :doc:`hippo ` + * :doc:`hippo (g) ` * :doc:`ilp/graphene/hbn (t) ` * :doc:`ilp/tmd (t) ` * :doc:`kolmogorov/crespi/full ` From 6ce7ea2f4bef772348b6dc7a811834afa610bb7b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 17:43:15 -0500 Subject: [PATCH 139/181] remove obsolete commands --- examples/amoeba/in.ubiquitin | 7 ------- 1 file changed, 7 deletions(-) diff --git a/examples/amoeba/in.ubiquitin b/examples/amoeba/in.ubiquitin index cb789a19f8..4c47edfcfc 100644 --- a/examples/amoeba/in.ubiquitin +++ b/examples/amoeba/in.ubiquitin @@ -34,13 +34,6 @@ pair_coeff * * amoeba_ubiquitin.prm amoeba_ubiquitin.key special_bonds lj/coul 0.5 0.5 0.5 one/five yes -fix fhal all store/state 0 fx fy fz -fix frepulse all store/state 0 fx fy fz -fix fdisp all store/state 0 fx fy fz -fix fpolar all store/state 0 fx fy fz -fix fmpole all store/state 0 fx fy fz -fix fqxfer all store/state 0 fx fy fz - # thermo output compute virial all pressure NULL virial From 62c010a7dee66fe8552ce547b2a67d65f7c7312c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 15 Jan 2023 18:11:33 -0500 Subject: [PATCH 140/181] add note to insert LAMMPS version when GPU acceleration was added --- doc/src/pair_amoeba.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 94c956a585..113ae560f7 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -134,6 +134,10 @@ version discussed in :ref:`(Ponder) `, :ref:`(Ren) implementation of HIPPO in LAMMPS matches the version discussed in :ref:`(Rackers) `. +.. versionadded:: TBD + +Accelerator support via the GPU package is available. 
+ ---------- Only a single pair_coeff command is used with either the *amoeba* and From 9dc0369cee0649047ccf16cc0e9a3ab941b5ed07 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 15 Jan 2023 23:28:48 -0600 Subject: [PATCH 141/181] Attempted to resolve the address space change issue when casting for OpenCL 2.0 (ref: https://www.intel.com/content/www/us/en/developer/articles/technical/the-generic-address-space-in-opencl-20.html#06_address_space_casting) --- lib/gpu/lal_device.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index cdac6dfc97..1dbe1a0c40 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -386,6 +386,9 @@ int DeviceT::set_ocl_params(std::string s_config, const std::string &extra_args) } _ocl_compile_string="-cl-mad-enable "; + #ifdef CL_VERSION_2_0 + _ocl_compile_string+="-cl-std=CL2.0 "; + #endif if (params[4]!="0") _ocl_compile_string+="-cl-fast-relaxed-math "; _ocl_compile_string+=std::string(OCL_INT_TYPE)+" "+ std::string(OCL_PRECISION_COMPILE); From 973b46a90709694f879fb6515cea010b63f499c3 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 10:12:42 -0600 Subject: [PATCH 142/181] Attempted to resolve the memory access runtime errors when acquiring single and mixed precision arrays from the GPU lib --- lib/gpu/lal_amoeba.cu | 10 +- lib/gpu/lal_base_amoeba.h | 2 +- src/GPU/pair_amoeba_gpu.cpp | 199 +++++++++++++++++++++++++----------- 3 files changed, 146 insertions(+), 65 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index cc593e4263..b3bbabadc3 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1631,9 +1631,9 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fdip_phi1, - __global numtyp *restrict fdip_phi2, - __global numtyp *restrict fdip_sum_phi, + __global acctyp *restrict fdip_phi1, + __global acctyp *restrict fdip_phi2, + __global acctyp *restrict fdip_sum_phi, const int bsorder, const int inum, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, @@ -1843,7 +1843,7 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } int idx; - numtyp fdip_buf[20]; + acctyp fdip_buf[20]; fdip_buf[0] = (numtyp)0.0; fdip_buf[1] = tuv100_1; @@ -1917,7 +1917,7 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, const __global numtyp2 *restrict grid, - __global numtyp *restrict fphi, + __global acctyp *restrict fphi, const int bsorder, const int inum, const numtyp felec, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a20c3886d5..a7f98fa5be 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -250,7 +250,7 @@ class BaseAmoeba { UCL_Vector _thetai1, _thetai2, _thetai3; UCL_Vector _igrid; UCL_Vector _cgrid_brick; - UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; + UCL_Vector _fdip_phi1, _fdip_phi2, _fdip_sum_phi; int _max_thetai_size; int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out; int _ngridx, _ngridy, _ngridz, _num_grid_points; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 713015b5c5..d3d4103953 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ 
b/src/GPU/pair_amoeba_gpu.cpp @@ -203,10 +203,7 @@ void PairAmoebaGPU::init_style() if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - if (tq_size == sizeof(double)) - tq_single = false; - else - tq_single = true; + tq_single = tq_size != sizeof(double); // replace with the gpu counterpart @@ -739,23 +736,44 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + } @@ -960,23 +978,44 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + auto field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } + // accumulate timing information @@ -1139,32 +1178,63 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, 
&fdip_sum_phi_pinned); int nlocal = atom->nlocal; - double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[n]; - n += nlocal; + if (tq_single) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } } - } - double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi2[i][m] = _fdip_phi2_ptr[n]; - n += nlocal; + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } - } - double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 20; m++) { - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; - n += nlocal; + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } + } /* ---------------------------------------------------------------------- @@ -1447,15 +1517,26 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - - double *_fphi_ptr = (double *)fphi_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = i; - for (int m = 0; m < 20; m++) { - fphi[i][m] = _fphi_ptr[idx]; - idx += nlocal; + if (tq_single) { + auto _fphi_ptr = (float *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } + } + } else { + auto _fphi_ptr = (double *)fphi_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = i; + for (int m = 0; m < 20; m++) { + fphi[i][m] = _fphi_ptr[idx]; + idx += nlocal; + } } } + } From b3e45c29cafe2fdf742cbd28d2423bd440a0e184 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 10:30:03 -0600 Subject: [PATCH 143/181] Removed whitespaces --- src/GPU/pair_amoeba_gpu.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index d3d4103953..6a2f87ba2e 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -773,7 +773,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) fieldp[i][2] += fieldp_ptr[idx+2]; } } - + } @@ -1015,7 +1015,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) fieldp[i][2] += fieldp_ptr[idx+2]; } } - + // accumulate timing information @@ -1234,7 +1234,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, } } } - + } /* 
---------------------------------------------------------------------- @@ -1525,7 +1525,7 @@ void PairAmoebaGPU::polar_kspace() fphi[i][m] = _fphi_ptr[idx]; idx += nlocal; } - } + } } else { auto _fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { @@ -1536,8 +1536,6 @@ void PairAmoebaGPU::polar_kspace() } } } - - } // convert field from fractional to Cartesian From 9ab7f792e120868c23aca85ba31897d18fffaedc Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 22:29:04 -0600 Subject: [PATCH 144/181] Fixed nullptr bug in the mutual fft timer --- src/AMOEBA/pair_amoeba.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index df9472e188..677bc48344 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -567,7 +567,8 @@ void PairAmoeba::finish() MPI_Allreduce(&time_fphi_uind,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_fphi_uind = ave/comm->nprocs; - double time_mutual_fft = ic_kspace->time_fft; + double time_mutual_fft = 0; + if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; From 28fbc2631b15888de1a70a35bf0689a817c796ef Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 16 Jan 2023 22:33:21 -0600 Subject: [PATCH 145/181] Fixed another bug with ic_kspace being nullptr --- src/GPU/pair_amoeba_gpu.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 6a2f87ba2e..8db2a901da 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -364,11 +364,12 @@ void PairAmoebaGPU::induce() // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] - amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); + if (ic_kspace) + amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating From b59ee8d16c190efd87a1025efc05117a4405f4e8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 17 Jan 2023 03:31:59 -0500 Subject: [PATCH 146/181] silence compiler warnings --- src/GPU/pair_amoeba_gpu.cpp | 59 ++++++++++-------------------- src/GPU/pair_hippo_gpu.cpp | 72 ++++++++++++------------------------- 2 files changed, 41 insertions(+), 90 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 8db2a901da..5bc2b3a48c 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -234,7 +234,7 @@ void PairAmoebaGPU::multipole_real() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -249,15 +249,15 @@ void PairAmoebaGPU::multipole_real() } inum = atom->nlocal; - firstneigh = amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, 
&numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); @@ -303,7 +303,7 @@ void PairAmoebaGPU::multipole_real() void PairAmoebaGPU::induce() { bool done; - int i,j,m,ii,itype; + int i,j,m,itype; int iter,maxiter; double polmin; double eps,epsold; @@ -313,9 +313,6 @@ void PairAmoebaGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - int debug = 1; - - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -702,11 +699,9 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - + int inum; double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -786,19 +781,19 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) void PairAmoebaGPU::udirect2b_cpu() { - int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; double xr,yr,zr,r,r2; double rr1,rr2,rr3,rr5; double bfac,exp2a; double ralpha,aefac; double aesq2,aesq2n; - double pdi,pti,ddi; + double pdi,pti; double pgamma; double damp,expdamp; double scale3,scale5; - double scale7,scalek; + double scalek; double bn[4],bcn[3]; - double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + double factor_uscale; int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; @@ -839,7 +834,6 @@ void PairAmoebaGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; - ddi = dirdamp[itype]; // evaluate all sites within the cutoff distance @@ -856,15 +850,8 @@ void PairAmoebaGPU::udirect2b_cpu() jtype = amtype[j]; jgroup = amgroup[j]; - factor_wscale = special_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = special_polar_piscale[sbmask15(jextra)]; - factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = special_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = 1.0; - } + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; r = sqrt(r2); rr1 = 1.0 / r; @@ -1251,10 +1238,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1266,7 +1249,6 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff (off2) for the term @@ -1291,8 +1273,6 @@ void PairAmoebaGPU::polar_real() int eflag=1, vflag=1; double **f = atom->f; - int nall = atom->nlocal + atom->nghost; - int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -1305,7 +1285,6 @@ void PairAmoebaGPU::polar_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff and aewald for the term diff --git a/src/GPU/pair_hippo_gpu.cpp 
b/src/GPU/pair_hippo_gpu.cpp index bf3e113ea7..1f0f3e820a 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -255,7 +255,7 @@ void PairHippoGPU::repulsion() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -270,15 +270,15 @@ void PairHippoGPU::repulsion() } inum = atom->nlocal; - firstneigh = hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, - atom->type, amtype, amgroup, rpole, - nullptr, nullptr, nullptr, - sublo, subhi, atom->tag, - atom->nspecial, atom->special, - atom->nspecial15, atom->special15, - eflag, vflag, eflag_atom, vflag_atom, - host_start, &ilist, &numneigh, cpu_time, - success, atom->q, domain->boxlo, domain->prd); + hippo_gpu_precompute(neighbor->ago, inum, nall, atom->x, + atom->type, amtype, amgroup, rpole, + nullptr, nullptr, nullptr, + sublo, subhi, atom->tag, + atom->nspecial, atom->special, + atom->nspecial15, atom->special15, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); // select the correct cutoff for the term @@ -321,13 +321,8 @@ void PairHippoGPU::dispersion_real() return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - - int *ilist, *numneigh, **firstneigh; - double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -338,7 +333,6 @@ void PairHippoGPU::dispersion_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff for the term @@ -366,7 +360,7 @@ void PairHippoGPU::multipole_real() int inum, host_start; bool success = true; - int *ilist, *numneigh, **firstneigh; + int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { @@ -425,7 +419,7 @@ void PairHippoGPU::multipole_real() void PairHippoGPU::induce() { bool done; - int i,j,m,ii,itype; + int i,j,m,itype; int iter,maxiter; double polmin; double eps,epsold; @@ -435,8 +429,6 @@ void PairHippoGPU::induce() double sum,sump,term; double reduce[4],allreduce[4]; - int debug = 1; - // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() @@ -823,11 +815,9 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum, host_start; - + int inum; double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -887,19 +877,18 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) void PairHippoGPU::udirect2b_cpu() { - int i,j,k,m,n,ii,jj,kk,kkk,jextra,ndip,itype,jtype,igroup,jgroup; + int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; double xr,yr,zr,r,r2; double rr1,rr2,rr3,rr5; double bfac,exp2a; double ralpha,aefac; double aesq2,aesq2n; - double pdi,pti,ddi; + double pdi,pti; double pgamma; double damp,expdamp; - double scale3,scale5; - double scale7,scalek; + double scale3,scale5,scalek; double bn[4],bcn[3]; - double factor_dscale,factor_pscale,factor_uscale,factor_wscale; + double factor_uscale; int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; @@ -940,7 +929,6 @@ void PairHippoGPU::udirect2b_cpu() pdi = pdamp[itype]; pti = thole[itype]; - ddi = dirdamp[itype]; // evaluate all sites within the cutoff distance @@ -957,15 +945,8 @@ void PairHippoGPU::udirect2b_cpu() jtype = amtype[j]; 
jgroup = amgroup[j]; - factor_wscale = special_polar_wscale[sbmask15(jextra)]; - if (igroup == jgroup) { - factor_pscale = special_polar_piscale[sbmask15(jextra)]; - factor_dscale = polar_dscale; - factor_uscale = polar_uscale; - } else { - factor_pscale = special_polar_pscale[sbmask15(jextra)]; - factor_dscale = factor_uscale = 1.0; - } + if (igroup == jgroup) factor_uscale = polar_uscale; + else factor_uscale = 1.0; r = sqrt(r2); rr1 = 1.0 / r; @@ -1033,7 +1014,6 @@ void PairHippoGPU::udirect2b_cpu() void PairHippoGPU::ufield0c(double **field, double **fieldp) { - int i,j; double term; double time0,time1,time2; @@ -1309,10 +1289,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) return; } - int eflag=1, vflag=1; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; @@ -1324,7 +1300,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp) } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff (off2) for the term @@ -1350,10 +1325,8 @@ void PairHippoGPU::polar_real() int eflag=1, vflag=1; double **f = atom->f; - int nall = atom->nlocal + atom->nghost; - int inum; - double sublo[3],subhi[3]; + if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; @@ -1364,7 +1337,6 @@ void PairHippoGPU::polar_real() } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } - inum = atom->nlocal; // select the correct cutoff and aewald for the term From 71931d1d44ed52438d1712ed93fa6a16cfcb94b0 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 17 Jan 2023 09:39:03 -0600 Subject: [PATCH 147/181] Cleaned up, and added missing zero timers for extra fields transfers --- lib/gpu/lal_atom.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 142d64ef1d..cfd4368948 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -108,7 +108,7 @@ class Atom { bool velocity() { return _vel; } /// Returns true if GPU is using extra fields - bool using_extra() { return _extra_fields; } + bool using_extra() { return (_extra_fields>0); } /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -131,6 +131,8 @@ class Atom { time_quat.add_to_total(); if (_vel) time_vel.add_to_total(); + if (_extra_fields>0) + time_extra.add_to_total(); } /// Add copy times to timers @@ -142,6 +144,8 @@ class Atom { time_quat.zero(); if (_vel) time_vel.zero(); + if (_extra_fields>0) + time_extra.zero(); } /// Return the total time for host/device data transfer @@ -161,6 +165,10 @@ class Atom { total+=time_vel.total_seconds(); time_vel.zero_total(); } + if (_extra_fields>0) { + total+=time_extra.total_seconds(); + time_extra.zero_total(); + } return total+_time_transfer/1000.0; } From f86375c992bf47f659cf2944b49fda3a9689f464 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 17 Jan 2023 09:47:09 -0600 Subject: [PATCH 148/181] Attempted to ensure that extra gets allocated in the exactly same way as other added fields (charge, quat and vel) --- lib/gpu/lal_atom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index f195bf5287..03f3b477c9 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -124,7 +124,7 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } - if (_extra_fields>0 && 
_host_view==false) { + if (_extra_fields>0 && !_host_view) { success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=extra.device.row_bytes(); From eddd3d6f254c553086863c68c1f638731cc699be Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 18 Jan 2023 20:04:45 -0600 Subject: [PATCH 149/181] Fixed a bug with extra being nullptr when _host_view is true: always allocate extra (Note that BaseAmoeba has its own cast_extra_data() that doesn't know if extra is allocated properly, it is the case when _host_view is false for dedicated GPUs for example) --- lib/gpu/lal_atom.cpp | 2 +- lib/gpu/lal_atom.h | 19 +++++++------------ lib/gpu/lal_device.cpp | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 03f3b477c9..bf27334578 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -124,7 +124,7 @@ bool AtomT::alloc(const int nall) { UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=v.device.row_bytes(); } - if (_extra_fields>0 && !_host_view) { + if (_extra_fields>0) { success=success && (extra.alloc(_max_atoms*_extra_fields,*dev,UCL_WRITE_ONLY, UCL_READ_ONLY)==UCL_SUCCESS); gpu_bytes+=extra.device.row_bytes(); diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index cfd4368948..f4b23822f8 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -470,18 +470,13 @@ class Atom { inline void cast_extra_data(cpytyp *host_ptr) { if (_extra_avail==false) { double t=MPI_Wtime(); - if (_host_view) { - extra.host.view((numtyp*)host_ptr,_nall*_extra_fields,*dev); - extra.device.view(extra.host); - } else if (sizeof(numtyp)==sizeof(double)) - memcpy(extra.host.begin(),host_ptr,_nall*_extra_fields*sizeof(numtyp)); - else - #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) - #pragma omp parallel for simd schedule(static) - #elif (LAL_USE_OMP_SIMD == 1) - #pragma omp simd - #endif - for (int i=0; i<_nall*_extra_fields; i++) extra[i]=host_ptr[i]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i=0; i<_nall*_extra_fields; i++) + extra[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 1dbe1a0c40..e54d16266c 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -490,7 +490,7 @@ int DeviceT::init(Answer &ans, const bool charge, _data_in_estimate++; if (!atom.velocity() && vel) _data_in_estimate++; - if (atom.using_extra()==false && extra_fields>0) + if (atom.using_extra() && extra_fields>0) _data_in_estimate++; if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor>0 && maxspecial,vel,extra_fields)) return -3; From 3ae2805316f08c2de6e3612401632a4929cbdf4d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 07:06:29 -0500 Subject: [PATCH 150/181] add option variable to CMake build to select GPU library debug --- cmake/Modules/Packages/GPU.cmake | 27 +++++++++++++++++++++------ doc/src/Build_extras.rst | 7 ++++--- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 92fe608656..89e15e548b 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -28,6 +28,9 @@ elseif(GPU_PREC STREQUAL "SINGLE") set(GPU_PREC_SETTING "SINGLE_SINGLE") endif() +option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) +mark_as_advanced(GPU_DEBUG) + file(GLOB GPU_LIB_SOURCES 
${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) @@ -153,7 +156,12 @@ if(GPU_API STREQUAL "CUDA") add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS}) target_link_libraries(gpu PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS}) - target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS}) + target_compile_definitions(gpu PRIVATE -DUSE_CUDA -D_${GPU_PREC_SETTING} ${GPU_CUDA_MPS_FLAGS}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() if(CUDPP_OPT) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini) target_compile_definitions(gpu PRIVATE -DUSE_CUDPP) @@ -227,9 +235,12 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) - + target_compile_definitions(gpu PRIVATE -DUSE_OPENCL -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) + endif() target_link_libraries(lammps PRIVATE gpu) add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) @@ -379,8 +390,12 @@ elseif(GPU_API STREQUAL "HIP") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT) - target_compile_definitions(gpu PRIVATE -DUSE_HIP) + target_compile_definitions(gpu PRIVATE -DUSE_HIP -D_${GPU_PREC_SETTING}) + if(GPU_DEBUG) + target_compile_definitions(gpu PRIVATE -DUCL_DEBUG -DGERYON_KERNEL_DUMP) + else() + target_compile_definitions(gpu PRIVATE -DMPI_GERYON -DUCL_NO_EXIT) + endif() target_link_libraries(gpu PRIVATE hip::host) if(HIP_USE_DEVICE_SORT) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index d7bbe65a18..3539b14b41 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -127,10 +127,11 @@ CMake build -D GPU_API=value # value = opencl (default) or cuda or hip -D GPU_PREC=value # precision setting # value = double or mixed (default) or single - -D HIP_PATH # path to HIP installation. Must be set if GPU_API=HIP -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda - # value = sm_XX, see below - # default is sm_50 + # value = sm_XX (see below, default is sm_50) + -D GPU_DEBUG=value # enable debug code in the GPU package library, mostly useful for developers + # value = yes or no (default) + -D HIP_PATH=value # value = path to HIP installation. 
Must be set if GPU_API=HIP -D HIP_ARCH=value # primary GPU hardware choice for GPU_API=hip # value depends on selected HIP_PLATFORM # default is 'gfx906' for HIP_PLATFORM=amd and 'sm_50' for HIP_PLATFORM=nvcc From 4244d2e6cdcd16f0837edc1ef60be405c3b993e9 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 08:56:54 -0500 Subject: [PATCH 151/181] silence compiler warnings about unused parameters and variables --- lib/gpu/geryon/hip_macros.h | 3 ++ lib/gpu/geryon/nvd_macros.h | 3 ++ lib/gpu/geryon/ocl_device.h | 5 +-- lib/gpu/geryon/ocl_macros.h | 3 ++ lib/gpu/geryon/ocl_memory.h | 6 +-- lib/gpu/geryon/ocl_texture.h | 10 ++--- lib/gpu/geryon/ocl_timer.h | 2 +- lib/gpu/geryon/ucl_copy.h | 37 +++++++--------- lib/gpu/geryon/ucl_d_vec.h | 6 +-- lib/gpu/geryon/ucl_h_vec.h | 6 +-- lib/gpu/geryon/ucl_s_obj_help.h | 22 ++++------ lib/gpu/lal_amoeba.cpp | 10 ++--- lib/gpu/lal_amoeba_ext.cpp | 4 +- lib/gpu/lal_atom.cpp | 7 ++- lib/gpu/lal_atom.h | 4 +- lib/gpu/lal_base_amoeba.cpp | 63 +++++++++++++-------------- lib/gpu/lal_base_amoeba.h | 2 +- lib/gpu/lal_base_dpd.cpp | 4 +- lib/gpu/lal_charmm_long.cpp | 22 ++++------ lib/gpu/lal_device.cpp | 4 +- lib/gpu/lal_dpd_tstat_ext.cpp | 8 ++-- lib/gpu/lal_eam.cpp | 6 +-- lib/gpu/lal_hippo.cpp | 77 ++++++++++++++++----------------- lib/gpu/lal_hippo_ext.cpp | 2 +- lib/gpu/lal_neighbor.h | 6 ++- lib/gpu/lal_sw.cpp | 2 +- lib/gpu/lal_vashishta.cpp | 4 +- 27 files changed, 164 insertions(+), 164 deletions(-) diff --git a/lib/gpu/geryon/hip_macros.h b/lib/gpu/geryon/hip_macros.h index 96313ec87e..e16caf4944 100644 --- a/lib/gpu/geryon/hip_macros.h +++ b/lib/gpu/geryon/hip_macros.h @@ -26,6 +26,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/nvd_macros.h b/lib/gpu/geryon/nvd_macros.h index ac2e6cc682..19c8ff4b6c 100644 --- a/lib/gpu/geryon/nvd_macros.h +++ b/lib/gpu/geryon/nvd_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 4163d40881..588c53c8fa 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -309,15 +309,14 @@ class UCL_Device { /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes - inline size_t max_pitch(const int i) { return 0; } + inline size_t max_pitch(const int) { return 0; } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ inline bool sharing_supported() { return sharing_supported(_device); } /// Returns false if accelerator cannot be shared by multiple processes /** If it cannot be determined, true is returned **/ - inline bool sharing_supported(const int i) - { return true; } + inline bool sharing_supported(const int) { return true; } /// True if the device is a sub-device inline bool is_subdevice() diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index 5e5a190ede..652d7795e9 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -33,6 +33,9 @@ #ifdef UCL_DEBUG #define UCL_SYNC_DEBUG #define UCL_DESTRUCT_CHECK +#define UCL_DEBUG_ARG(arg) arg +#else +#define UCL_DEBUG_ARG(arg) #endif #ifndef UCL_NO_API_CHECK 
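
The UCL_DEBUG_ARG() macro added to hip_macros.h, nvd_macros.h and ocl_macros.h above expands to the parameter name only when UCL_DEBUG is defined, so a debug-only assert can still reference the argument while release builds see an unnamed parameter and emit no unused-parameter warning (this is how the view() methods in ucl_d_vec.h and ucl_h_vec.h use it further down). A minimal standalone sketch of that pattern; the view_cols() function is illustrative only and not part of the Geryon headers:

    #include <cassert>
    #include <cstddef>

    #ifdef UCL_DEBUG
    #define UCL_DEBUG_ARG(arg) arg
    #else
    #define UCL_DEBUG_ARG(arg)
    #endif

    // 'rows' is referenced only by the debug-build assert below; when
    // UCL_DEBUG is not defined the parameter is left unnamed, so the
    // compiler has nothing to warn about.
    inline std::size_t view_cols(const std::size_t UCL_DEBUG_ARG(rows),
                                 const std::size_t cols) {
    #ifdef UCL_DEBUG
      assert(rows == 1);
    #endif
      return cols;
    }
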
diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index bfc260889a..5d8b9808bd 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -137,7 +137,7 @@ inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, template inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n, - const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){ + const enum UCL_MEMOPT kind, const enum UCL_MEMOPT /*kind2*/){ cl_mem_flags buffer_perm; cl_map_flags map_perm; if (kind==UCL_READ_ONLY) { @@ -583,7 +583,7 @@ template <> struct _ucl_memcpy<1,0> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t /*dst_offset*/, const size_t src_offset) { if (src.cbegin()==dst.cbegin()) { #ifdef UCL_DBG_MEM_TRACE std::cerr << "UCL_COPY 1S\n"; @@ -641,7 +641,7 @@ template <> struct _ucl_memcpy<0,1> { template static inline void mc(p1 &dst, const p2 &src, const size_t n, cl_command_queue &cq, const cl_bool block, - const size_t dst_offset, const size_t src_offset) { + const size_t dst_offset, const size_t /*src_offset*/) { if (src.cbegin()==dst.cbegin()) { if (block) ucl_sync(cq); #ifdef UCL_DBG_MEM_TRACE diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 8ddde5b2a3..87db3794a6 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -35,19 +35,19 @@ class UCL_Texture { UCL_Texture() {} ~UCL_Texture() {} /// Construct with a specified texture reference - inline UCL_Texture(UCL_Program &prog, const char *texture_name) { } + inline UCL_Texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) { } + inline void get_texture(UCL_Program & /*prog*/, const char * /*texture_name*/) { } /// Bind a float array where each fetch grabs a vector of length numel template - inline void bind_float(mat_typ &vec, const unsigned numel) { } + inline void bind_float(mat_typ & /*vec*/, const unsigned /*numel*/) { } /// Unbind the texture reference from the memory allocation inline void unbind() { } /// Make a texture reference available to kernel - inline void allow(UCL_Kernel &kernel) { } + inline void allow(UCL_Kernel & /*kernel*/) { } private: friend class UCL_Kernel; @@ -62,7 +62,7 @@ class UCL_Const { inline UCL_Const(UCL_Program &prog, const char *global_name) { get_global(prog,global_name); } /// Set the global reference for this object - inline void get_global(UCL_Program &prog, const char *global_name) { + inline void get_global(UCL_Program &prog, const char * /*global_name*/) { if (_active) { CL_DESTRUCT_CALL(clReleaseContext(_context)); CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 189871e631..8f55a91a28 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -71,7 +71,7 @@ class UCL_Timer { inline void init(UCL_Device &dev) { init(dev,dev.cq()); } /// Initialize command queue for timing - inline void init(UCL_Device &dev, command_queue &cq) { + inline void init(UCL_Device & /*dev*/, command_queue &cq) { clear(); _cq=cq; clRetainCommandQueue(_cq); diff --git a/lib/gpu/geryon/ucl_copy.h b/lib/gpu/geryon/ucl_copy.h index c906a14f30..94b57f7a09 100644 --- a/lib/gpu/geryon/ucl_copy.h +++ b/lib/gpu/geryon/ucl_copy.h @@ -205,12 +205,11 @@ template <> struct _host_host_copy<1,1> { // 
Should never be here template struct _host_host_copy { template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/) { assert(0==1); } template - static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols) { + static inline void hhc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, const size_t /*cols*/) { assert(0==1); } }; @@ -470,24 +469,22 @@ template struct _ucl_cast_copy { // Neither on host or both on host template <> struct _ucl_cast_copy<1,1> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; @@ -495,24 +492,22 @@ template <> struct _ucl_cast_copy<1,1> { // Neither on host or both on host template <> struct _ucl_cast_copy<0,0> { template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer, command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, + mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t numel, - mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*numel*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t /*cols*/, mat3 & /*cast_buffer*/) { assert(0==1); } template - static inline void cc(mat1 &dst, const mat2 &src, const size_t rows, - const size_t cols, mat3 &cast_buffer, - command_queue &cq) { + static inline void cc(mat1 & /*dst*/, const mat2 & /*src*/, const size_t /*rows*/, + const size_t cols, mat3 & /*cast_buffer*/, command_queue & /*cq*/) { assert(0==1); } }; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 9158e145b3..5e281fef07 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -125,7 +125,7 @@ class UCL_D_Vec : public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -230,8 +230,8 @@ class UCL_D_Vec : 
public UCL_BaseMat { * - The view does not prevent the memory from being freed by the * allocating container when using CUDA APIs **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, - const size_t cols) { + inline void view_offset(const size_t offset,ucl_type &input, + const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index 2f49f9f633..9f734ac40c 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -126,7 +126,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view(ucl_type &input, const size_t rows, const size_t cols) { + inline void view(ucl_type &input, const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); #endif @@ -188,7 +188,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device pointer on the host is not supported **/ template - inline void view(ptr_type *input, const size_t rows, const size_t cols, + inline void view(ptr_type *input, const size_t UCL_DEBUG_ARG(rows), const size_t cols, UCL_Device &dev) { #ifdef UCL_DEBUG assert(rows==1); @@ -233,7 +233,7 @@ class UCL_H_Vec : public UCL_BaseMat { * allocating container when using CUDA APIs * - Viewing a device container on the host is not supported **/ template - inline void view_offset(const size_t offset,ucl_type &input,const size_t rows, + inline void view_offset(const size_t offset,ucl_type &input,const size_t UCL_DEBUG_ARG(rows), const size_t cols) { #ifdef UCL_DEBUG assert(rows==1); diff --git a/lib/gpu/geryon/ucl_s_obj_help.h b/lib/gpu/geryon/ucl_s_obj_help.h index a10f3cdb3f..9bc2c40fe2 100644 --- a/lib/gpu/geryon/ucl_s_obj_help.h +++ b/lib/gpu/geryon/ucl_s_obj_help.h @@ -27,7 +27,7 @@ template struct _ucl_s_obj_help; // -- Can potentially use same memory if shared by accelerator template <> struct _ucl_s_obj_help<1> { template - static inline int alloc(t1 &host, t2 &device, t3 &_buffer, + static inline int alloc(t1 &host, t2 &device, t3 & /*_buffer*/, const int cols, UCL_Device &acc, const enum UCL_MEMOPT kind1, const enum UCL_MEMOPT kind2) { @@ -131,41 +131,37 @@ template <> struct _ucl_s_obj_help<1> { } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,async); } template - static inline void copy(t1 &dst, t2 &src, t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cq); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - const bool async) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,cols,async); } template - static inline void copy(t1 &dst, t2 &src, const int cols, t3 &buffer, - command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,cols,cq); } template - static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, - t3 &buffer, const bool async) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, const bool async) { ucl_copy(dst,src,rows,cols,async); } template - static inline void copy(t1 
&dst, t2 &src, const int rows, const int cols, - t3 &buffer, command_queue &cq) { + static inline void copy(t1 &dst, t2 &src, const int rows, const int cols, t3 & /*buffer*/, command_queue &cq) { ucl_copy(dst,src,rows,cols,cq); } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 &host, t3 & /*buff*/,const int cols) { if (device.kind()==UCL_VIEW) { device.view(host); return UCL_SUCCESS; @@ -353,7 +349,7 @@ template struct _ucl_s_obj_help { } template - static inline int dev_resize(t1 &device, t2 &host, t3 &buff,const int cols) { + static inline int dev_resize(t1 &device, t2 & /*host*/, t3 &buff,const int cols) { int err=buff.resize(cols); if (err!=UCL_SUCCESS) return err; diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 38aa2bde27..5e19997913 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -48,10 +48,10 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int *host_amtype2class, const double *host_special_hal, - const double *host_special_repel, - const double *host_special_disp, + const double * /*host_special_repel*/, + const double * /*host_special_disp*/, const double *host_special_mpole, - const double *host_special_polar_wscale, + const double * /*host_special_polar_wscale*/, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, @@ -188,7 +188,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) { // Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template -int AmoebaT::udirect2b(const int eflag, const int vflag) { +int AmoebaT::udirect2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -230,7 +230,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) { // Launch the real-space induced field kernel, returning field and fieldp // --------------------------------------------------------------------------- template -int AmoebaT::umutual2b(const int eflag, const int vflag) { +int AmoebaT::umutual2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 5e4d48a2da..fe3d4a26d8 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -119,8 +119,8 @@ void amoeba_gpu_clear() { int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + int *host_amgroup, double **host_rpole, double ** /*host_uind*/, + double ** /*host_uinp*/, double * /*host_pval*/, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index bf27334578..853fdf216d 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -403,9 +403,14 @@ double AtomT::host_memory_usage() const { return _max_atoms*atom_bytes*sizeof(numtyp)+sizeof(Atom); } +#ifdef USE_CUDPP +#define USE_CUDPP_ARG(arg) arg +#else +#define USE_CUDPP_ARG(arg) +#endif // Sort arrays for neighbor list calculation template -void AtomT::sort_neighbor(const int num_atoms) { 
+void AtomT::sort_neighbor(const int USE_CUDPP_ARG(num_atoms)) { #ifdef USE_CUDPP CUDPPResult result = cudppSort(sort_plan, (unsigned *)dev_cell_id.begin(), (int *)dev_particle_id.begin(), diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index f4b23822f8..4b29d76cb1 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -327,7 +327,7 @@ class Atom { /// Copy positions and types to device asynchronously /** Copies nall() elements **/ - inline void add_x_data(double **host_ptr, int *host_type) { + inline void add_x_data(double ** /*host_ptr*/, int * /*host_type*/) { time_pos.start(); if (_x_avail==false) { #ifdef GPU_CAST @@ -441,7 +441,7 @@ class Atom { /// Copy velocities and tags to device asynchronously /** Copies nall() elements **/ - inline void add_v_data(double **host_ptr, tagint *host_tag) { + inline void add_v_data(double ** /*host_ptr*/, tagint * /*host_tag*/) { time_vel.start(); if (_v_avail==false) { #ifdef GPU_CAST diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index a1d4a00c2c..99e3a6a77e 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -288,7 +288,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **&ilist, int **&jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd) { + bool &success, double *host_q, double * /*boxlo*/, double * /*prd*/) { acc_timers(); if (eatom) _eflag=2; else if (eflag_in) _eflag=1; @@ -368,20 +368,21 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall // this is the first part in a time step done on the GPU for AMOEBA for now // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, double *host_pval, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { +void BaseAmoebaT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double */*host_pval*/, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool /*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double aewald, const double felec, + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { // ------------------- Resize _tep array ------------------------ if (inum_full>_max_tep_size) { @@ -393,7 +394,7 @@ void BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, _off2_mpole = off2_mpole; _felec = felec; _aewald = aewald; - const int red_blocks=multipole_real(_eflag,_vflag); + 
multipole_real(_eflag,_vflag); // leave the answers (forces, energies and virial) on the device, // only copy them back in the last kernel (polar_real) @@ -424,7 +425,7 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double // specify the correct cutoff and alpha values _off2_polar = off2_polar; _aewald = aewald; - const int red_blocks=udirect2b(_eflag,_vflag); + udirect2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -436,10 +437,10 @@ void BaseAmoebaT::compute_udirect2b(int *host_amtype, int *host_amgroup, double // of the induced field // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - const double aewald, const double off2_polar, - void** fieldp_ptr) { +void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, + void** /*fieldp_ptr*/) { // only copy the necessary data arrays that are updated over the iterations // use nullptr for the other arrays that are already copied from host to device cast_extra_data(host_amtype, host_amgroup, nullptr, host_uind, host_uinp, nullptr); @@ -449,7 +450,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double _off2_polar = off2_polar; _aewald = aewald; // launch the kernel - const int red_blocks=umutual2b(_eflag,_vflag); + umutual2b(_eflag,_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer @@ -492,7 +493,7 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE); } else { - if (_thetai1.cols()<_max_thetai_size*bsorder) { + if ((int)_thetai1.cols()<_max_thetai_size*bsorder) { _max_thetai_size=static_cast(static_cast(inum_full)*1.10); _thetai1.resize(_max_thetai_size*bsorder); _thetai2.resize(_max_thetai_size*bsorder); @@ -573,7 +574,7 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); - } else if (numel > _cgrid_brick.cols()) { + } else if (numel > (int)_cgrid_brick.cols()) { _cgrid_brick.resize(numel); } } @@ -611,7 +612,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, #endif // launch the kernel with its execution configuration (see below) - const int red_blocks = fphi_uind(); + fphi_uind(); // copy data from device to host asynchronously _fdip_phi1.update_host(_max_thetai_size*10, true); @@ -682,7 +683,7 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi _cgrid_brick.update_device(_num_grid_points, false); _felec = felec; - const int red_blocks = fphi_mpole(); + fphi_mpole(); _fdip_sum_phi.update_host(_max_thetai_size*20); @@ -698,9 +699,6 @@ int BaseAmoebaT::fphi_mpole() { if (ainum == 0) return 0; - int _nall=atom->nall(); - int nbor_pitch=nbor->nbor_pitch(); - // Compute the block size and grid size to keep all cores busy const int BX=block_size(); @@ -771,7 +769,7 @@ double 
BaseAmoebaT::host_memory_usage_atomic() const { // --------------------------------------------------------------------------- template -void BaseAmoebaT::setup_fft(const int numel, const int element_type) +void BaseAmoebaT::setup_fft(const int /*numel*/, const int /*element_type*/) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) } @@ -781,7 +779,8 @@ void BaseAmoebaT::setup_fft(const int numel, const int element_type) // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) +void BaseAmoebaT::compute_fft1d(void * /*in*/, void * /*out*/, + const int /*numel*/, const int /*mode*/) { // TODO: setting up FFT plan based on the backend (cuFFT or hipFFT) #if 0 // !defined(USE_OPENCL) && !defined(USE_HIP) @@ -940,7 +939,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str, #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) if (dev.has_subgroup_support()) { - size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); + int mx_subgroup_sz = k_polar.max_subgroup_size(_block_size); if (_threads_per_atom > mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; device->set_simd_size(mx_subgroup_sz); diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index a7f98fa5be..0eaaafeb1e 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -280,7 +280,7 @@ class BaseAmoeba { UCL_Kernel k_fphi_uind, k_fphi_mpole; UCL_Kernel k_special15, k_short_nbor; inline int block_size() { return _block_size; } - inline void set_kernel(const int eflag, const int vflag) {} + inline void set_kernel(const int /*eflag*/, const int /*vflag*/) {} // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index c084c02ff0..e103699d40 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -196,7 +196,7 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, const double cpu_time, bool &success, tagint *tag, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - const int nlocal, double *boxlo, double *prd) { + const int /*nlocal*/, double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; @@ -261,7 +261,7 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const int seed, const int timestep, - double *boxlo, double *prd) { + double * /*boxlo*/, double * /*prd*/) { acc_timers(); int eflag, vflag; if (eatom) eflag=2; diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index 8008b1fbb3..0d01d70fb1 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -44,19 +44,15 @@ int CHARMMLongT::bytes_per_atom(const int max_nbors) const { } template -int CHARMMLongT::init(const int ntypes, - double host_cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, 
const bool mix_arithmetic) { +int CHARMMLongT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double ** /*host_offset*/, double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, FILE *_screen, + double host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, _screen,charmm_long,"k_charmm_long"); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e54d16266c..dd3ce15827 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -52,7 +52,7 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, +int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, const int first_gpu_id, const int gpu_mode, const double p_split, const int t_per_atom, const double user_cell_size, char *ocl_args, @@ -528,7 +528,7 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, + const int host_nlocal, const int /*nall*/, const int maxspecial, const int gpu_host, const int max_nbors, const double cutoff, const bool pre_cut, const int threads_per_atom, diff --git a/lib/gpu/lal_dpd_tstat_ext.cpp b/lib/gpu/lal_dpd_tstat_ext.cpp index 2b63bf62e7..78a1bf2d9d 100644 --- a/lib/gpu/lal_dpd_tstat_ext.cpp +++ b/lib/gpu/lal_dpd_tstat_ext.cpp @@ -28,10 +28,10 @@ static DPD DPDTMF; // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0, - double **host_gamma, double **host_sigma, double **host_cut, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen) { + double **host_gamma, double **host_sigma, double **host_cut, + double *special_lj, const int inum, + const int nall, const int /*max_nbors*/, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { DPDTMF.clear(); gpu_mode=DPDTMF.device->gpu_mode(); double gpu_split=DPDTMF.device->particle_split(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 2c0d63f7bf..b7bc7b958a 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -310,7 +310,7 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, + const bool /*eatom*/, const bool /*vatom*/, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); @@ -386,8 +386,8 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag_in, - const bool vflag_in, const bool eatom, - const bool vatom, int &host_start, int **ilist, int **jnum, + const bool vflag_in, const bool /*eatom*/, + const bool /*vatom*/, int &host_start, int **ilist, int 
**jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index f8ab436ad0..24ffae8de2 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -176,19 +176,19 @@ double HippoT::host_memory_usage() const { // Compute the repulsion term, returning tep // --------------------------------------------------------------------------- template -void HippoT::compute_repulsion(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, +void HippoT::compute_repulsion(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double * /*sublo*/, double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - const double aewald, const double off2_repulse, - double *host_q, double *boxlo, double *prd, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, + const double /*aewald*/, const double off2_repulse, + double * /*host_q*/, double * /*boxlo*/, double * /*prd*/, double cut2, double c0, double c1, double c2, double c3, double c4, double c5, void **tep_ptr) { this->acc_timers(); @@ -223,7 +223,7 @@ void HippoT::compute_repulsion(const int ago, const int inum_full, _c3 = c3; _c4 = c4; _c5 = c5; - const int red_blocks=repulsion(this->_eflag,this->_vflag); + repulsion(this->_eflag,this->_vflag); // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); @@ -287,7 +287,7 @@ void HippoT::compute_dispersion_real(int *host_amtype, int *host_amgroup, this->_off2_disp = off2_disp; this->_aewald = aewald; - const int red_blocks=dispersion_real(this->_eflag,this->_vflag); + dispersion_real(this->_eflag,this->_vflag); // only copy them back if this is the last kernel // otherwise, commenting out these two lines to leave the answers @@ -341,21 +341,21 @@ int HippoT::dispersion_real(const int eflag, const int vflag) { // Compute the multipole real-space term, returning tep // --------------------------------------------------------------------------- template -void HippoT::compute_multipole_real(const int ago, const int inum_full, - const int nall, double **host_x, - int *host_type, int *host_amtype, - int *host_amgroup, double **host_rpole, - double* host_pval, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - int *nspecial15, tagint **special15, - const bool eflag_in, const bool vflag_in, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, +void HippoT::compute_multipole_real(const int /*ago*/, const int inum_full, + const int /*nall*/, double ** /*host_x*/, + int * /*host_type*/, int * /*host_amtype*/, + int * /*host_amgroup*/, double ** /*host_rpole*/, + double* host_pval, double * /*sublo*/, + double * /*subhi*/, tagint * /*tag*/, + int ** /*nspecial*/, tagint ** /*special*/, + int * /*nspecial15*/, tagint ** /*special15*/, + const bool /*eflag_in*/, const bool 
/*vflag_in*/, + const bool /*eatom*/, const bool /*vatom*/, + int & /*host_start*/, int ** /*ilist*/, int ** /*jnum*/, + const double /*cpu_time*/, bool & /*success*/, const double aewald, const double felec, - const double off2_mpole, double *host_q, - double *boxlo, double *prd, void **tep_ptr) { + const double off2_mpole, double * /*host_q*/, + double * /*boxlo*/, double * /*prd*/, void **tep_ptr) { // cast necessary data arrays from host to device @@ -373,7 +373,7 @@ void HippoT::compute_multipole_real(const int ago, const int inum_full, this->_off2_mpole = off2_mpole; this->_felec = felec; this->_aewald = aewald; - const int red_blocks=multipole_real(this->_eflag,this->_vflag); + multipole_real(this->_eflag,this->_vflag); // copy tep from device to host this->_tep.update_host(this->_max_tep_size*4,false); @@ -424,7 +424,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) { // returning field and fieldp // --------------------------------------------------------------------------- template -void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, +void HippoT::compute_udirect2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, double **host_uind, double **host_uinp, double* host_pval, const double aewald, const double off2_polar, void** fieldp_ptr) { @@ -438,7 +438,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=udirect2b(this->_eflag,this->_vflag); + udirect2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) @@ -449,7 +449,7 @@ void HippoT::compute_udirect2b(int *host_amtype, int *host_amgroup, double **hos // Launch the real-space permanent field kernel // --------------------------------------------------------------------------- template -int HippoT::udirect2b(const int eflag, const int vflag) { +int HippoT::udirect2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -493,10 +493,9 @@ int HippoT::udirect2b(const int eflag, const int vflag) { // returning field and fieldp // --------------------------------------------------------------------------- template -void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, - const double aewald, const double off2_polar, - void** fieldp_ptr) { +void HippoT::compute_umutual2b(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, + const double aewald, const double off2_polar, void ** /*fieldp_ptr*/) { // cast necessary data arrays from host to device @@ -505,7 +504,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos this->_off2_polar = off2_polar; this->_aewald = aewald; - const int red_blocks=umutual2b(this->_eflag,this->_vflag); + umutual2b(this->_eflag,this->_vflag); // copy field and fieldp from device to host (_fieldp store both arrays, one after another) // NOTE: move this step to update_fieldp() to delay device-host transfer @@ -517,7 +516,7 @@ void HippoT::compute_umutual2b(int *host_amtype, int *host_amgroup, double **hos // Launch the real-space induced field kernel // --------------------------------------------------------------------------- template -int HippoT::umutual2b(const int eflag, const int vflag) { +int 
HippoT::umutual2b(const int /*eflag*/, const int /*vflag*/) { int ainum=this->ans->inum(); if (ainum == 0) return 0; @@ -557,8 +556,8 @@ int HippoT::umutual2b(const int eflag, const int vflag) { // Reneighbor on GPU if necessary, and then compute polar real-space // --------------------------------------------------------------------------- template -void HippoT::compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, +void HippoT::compute_polar_real(int * /*host_amtype*/, int * /*host_amgroup*/, double ** /*host_rpole*/, + double **host_uind, double **host_uinp, double * /*host_pval*/, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, const double aewald, const double felec, diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index 77450bf7b1..b5ac42744a 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -123,7 +123,7 @@ void hippo_gpu_clear() { int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, - double **host_uind, double **host_uinp, double *host_pval, + double ** /*host_uind*/, double ** /*host_uinp*/, double * /*host_pval*/, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 9061ce5150..24aaf6aeba 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -293,15 +293,17 @@ class Neighbor { #endif int _simd_size; + #ifdef LAL_USE_OLD_NEIGHBOR inline void set_nbor_block_size(const int mn) { - #ifdef LAL_USE_OLD_NEIGHBOR int desired=mn/(2*_simd_size); desired*=_simd_size; if (desired<_simd_size) desired=_simd_size; else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build; _block_nbor_build=desired; - #endif } + #else + inline void set_nbor_block_size(const int) {} + #endif }; } diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index eb42c710cc..9687a0352d 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -150,7 +150,7 @@ double SWT::host_memory_usage() const { // --------------------------------------------------------------------------- template int SWT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index c343de3f55..fcc9d00ab0 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -56,7 +56,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* costheta, const double* bigb, const double* big2b, const double* bigc) { - int success; + int success=0; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", "k_vashishta_three_end","k_vashishta_short_nbor"); @@ -211,7 +211,7 @@ double VashishtaT::host_memory_usage() const { // --------------------------------------------------------------------------- template int VashishtaT::loop(const int eflag, const int vflag, const int evatom, - bool &success) { + bool & /*success*/) { const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list From 03ab42fd52fd104edcf8b973fb1ece54733cc9ac Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 19 Jan 2023 08:57:24 -0500 
Subject: [PATCH 152/181] correct calling sequence for matching argument types --- lib/gpu/lal_base_amoeba.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 99e3a6a77e..841d968e56 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -685,7 +685,7 @@ void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi _felec = felec; fphi_mpole(); - _fdip_sum_phi.update_host(_max_thetai_size*20); + _fdip_sum_phi.update_host(_max_thetai_size*20, false); *host_fphi = _fdip_sum_phi.host.begin(); } From 8eb722a32ad3d551a1c64e88fc768f66af771a35 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Thu, 19 Jan 2023 13:22:27 -0600 Subject: [PATCH 153/181] Enforced synchronous host-device transfers for cgrid_brick and fdip arrays --- lib/gpu/lal_base_amoeba.cpp | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 841d968e56..21d9975b28 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -591,10 +591,6 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, void **host_fdip_phi2, void **host_fdip_sum_phi) { - // TODO: find out why this (dummy) host alloc helps the cgrid_brick update_device() work correcly - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) @@ -605,7 +601,7 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, _cgrid_brick[n] = v; n++; } - _cgrid_brick.update_device(_num_grid_points, true); + _cgrid_brick.update_device(_num_grid_points, false); #ifdef ASYNC_DEVICE_COPY ucl_device->sync(); @@ -614,10 +610,10 @@ void BaseAmoebaT::compute_fphi_uind(double ****host_grid_brick, // launch the kernel with its execution configuration (see below) fphi_uind(); - // copy data from device to host asynchronously - _fdip_phi1.update_host(_max_thetai_size*10, true); - _fdip_phi2.update_host(_max_thetai_size*10, true); - _fdip_sum_phi.update_host(_max_thetai_size*20, true); + // copy data from device to host + _fdip_phi1.update_host(_max_thetai_size*10, false); + _fdip_phi2.update_host(_max_thetai_size*10, false); + _fdip_sum_phi.update_host(_max_thetai_size*20, false); // return the pointers to the host-side arrays *host_fdip_phi1 = _fdip_phi1.host.begin(); @@ -638,13 +634,7 @@ int BaseAmoebaT::fphi_uind() { const int BX=block_size(); const int GX=static_cast(ceil(static_cast(ainum)/BX)); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/BX)); - } - */ + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_uind.set_size(GX,BX); @@ -666,10 +656,6 @@ int BaseAmoebaT::fphi_uind() { template void BaseAmoebaT::compute_fphi_mpole(double ***host_grid_brick, void **host_fphi, const double felec) { - // TODO: grid brick[k][j][i] is a scalar - UCL_H_Vec hdummy; - hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY); - int n = 0; for (int iz = _nzlo_out; iz <= _nzhi_out; iz++) for (int iy = _nylo_out; iy <= _nyhi_out; iy++) From bdf8dd4e5415ff0dcd8bcf7a704d64726ff860df Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:32:31 -0500 Subject: [PATCH 154/181] serial version --- src/REAXFF/fix_reaxff_species.cpp | 53 +++++++++++++++++++++++++++++-- src/REAXFF/fix_reaxff_species.h | 1 + 2 files changed, 52 
insertions(+), 2 deletions(-)

diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp
index 65a2e6d8ce..ceaf983893 100644
--- a/src/REAXFF/fix_reaxff_species.cpp
+++ b/src/REAXFF/fix_reaxff_species.cpp
@@ -38,6 +38,7 @@
 #include
 #include
+#include
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -145,6 +146,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
   ele = filepos = filedel = nullptr;
   eleflag = posflag = padflag = 0;
   delflag = specieslistflag = masslimitflag = 0;
+  delete_Nlimit = delete_Nsteps = 0;
   singlepos_opened = multipos_opened = del_opened = 0;
   multipos = 0;
 
@@ -221,7 +223,12 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
       } else
         error->all(FLERR, "Unknown fix reaxff/species delete option: {}", arg[iarg]);
-
+      // rate limit when deleting molecules
+    } else if (strcmp(arg[iarg], "delete_rate_limit") == 0) {
+      if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species delete_rate_limit", error);
+      delete_Nlimit = utils::numeric(FLERR, arg[iarg+1], false, lmp);
+      delete_Nsteps = utils::numeric(FLERR, arg[iarg+2], false, lmp);
+      iarg += 3;
       // position of molecules
     } else if (strcmp(arg[iarg], "position") == 0) {
       if (iarg + 3 > narg) utils::missing_cmd_args(FLERR, "fix reaxff/species position", error);
@@ -260,6 +267,14 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) :
   if (delflag && specieslistflag && masslimitflag)
     error->all(FLERR, "Incompatible combination fix reaxff/species command options");
 
+  if (delete_Nlimit > 0) {
+    memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount");
+
+    for (int i = 0; i < delete_Nsteps; i++)
+      delete_Tcount[i] = -1;
+    delete_Tcount[0] = 0;
+  }
+
   vector_nmole = 0;
   vector_nspec = 0;
 }
@@ -279,6 +294,7 @@ FixReaxFFSpecies::~FixReaxFFSpecies()
   memory->destroy(Mol2Spec);
   memory->destroy(MolType);
   memory->destroy(MolName);
+  memory->destroy(delete_Tcount);
 
   delete[] filepos;
   delete[] filedel;
@@ -375,6 +391,11 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/)
   // point to fix_ave_atom
 
   f_SPECBOND->end_of_step();
 
+  // push back delete_Tcount on every step
+  if (delete_Nlimit > 0)
+    for (int i = delete_Nsteps-1; i > 0; i--)
+      delete_Tcount[i] = delete_Tcount[i-1];
+
   if (ntimestep != nvalid) return;
 
   nlocal = atom->nlocal;
@@ -826,6 +847,15 @@ void FixReaxFFSpecies::WritePos(int Nmole, int Nspec)
 
 void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
 {
+  int ndeletions;
+  int headroom = -1;
+  if (delete_Nlimit > 0) {
+    if (delete_Tcount[delete_Nsteps-1] == -1) return;
+    ndeletions = delete_Tcount[0] - delete_Tcount[delete_Nsteps-1];
+    headroom = MAX(0, delete_Nlimit - ndeletions);
+    if (headroom == 0) return;
+  }
+
   int i, j, m, n, itype, cid;
   int ndel, ndelone, count, count_tmp;
   int *Nameall;
@@ -856,7 +886,20 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec)
   int *marklist;
   memory->create(marklist, nlocal, "reaxff/species:marklist");
 
-  for (m = 1; m <= Nmole; m++) {
+  std::random_device rnd;
+  std::minstd_rand park_rng(rnd());
+  int *molrange;
+  memory->create(molrange,Nmole,"reaxff/species:molrange");
+  for (m = 0; m < Nmole; m++)
+    molrange[m] = m + 1;
+  // shuffle index when using rate_limit, in case order is biased
+  if (delete_Nlimit > 0)
+    std::shuffle(&molrange[0],&molrange[Nmole], park_rng);
+
+  int this_delete_Tcount = 0;
+  for (int mm = 0; mm < Nmole; mm++) {
+    if (this_delete_Tcount == headroom) break;
+    m = molrange[mm];
     localmass = totalmass = count = nmarklist
= 0; for (n = 0; n < ntypes; n++) Name[n] = 0; @@ -896,6 +939,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) // find corresponding moltype if (totalmass > massmin && totalmass < massmax) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[Mol2Spec[m - 1]] += 1.0 / (double) count; @@ -905,6 +949,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) if (count > 0) { for (i = 0; i < ndelspec; i++) { if (del_species[i] == species_str) { + this_delete_Tcount++; for (j = 0; j < nmarklist; j++) { mark[marklist[j]] = 1; deletecount[i] += 1.0 / (double) count; @@ -976,6 +1021,9 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } + if (delete_Nlimit) + delete_Tcount[0] += this_delete_Tcount; + if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; atom->map_init(); @@ -988,6 +1036,7 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) memory->destroy(marklist); memory->destroy(mark); memory->destroy(deletecount); + memory->destroy(molrange); } /* ---------------------------------------------------------------------- */ diff --git a/src/REAXFF/fix_reaxff_species.h b/src/REAXFF/fix_reaxff_species.h index 65eeae4c60..329e17145b 100644 --- a/src/REAXFF/fix_reaxff_species.h +++ b/src/REAXFF/fix_reaxff_species.h @@ -60,6 +60,7 @@ class FixReaxFFSpecies : public Fix { FILE *fp, *pos, *fdel; int eleflag, posflag, multipos, padflag, setupflag; int delflag, specieslistflag, masslimitflag; + int delete_Nlimit, delete_Nsteps, *delete_Tcount; double massmin, massmax; int singlepos_opened, multipos_opened, del_opened; char *ele, **eletype, *filepos, *filedel; From 096e0a14f009fcfa05d777daabb66e2bd7b89d8f Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:38:06 -0500 Subject: [PATCH 155/181] off-by-one fix --- src/REAXFF/fix_reaxff_species.cpp | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index ceaf983893..6ad7b50226 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -391,12 +391,13 @@ void FixReaxFFSpecies::Output_ReaxFF_Bonds(bigint ntimestep, FILE * /*fp*/) // point to fix_ave_atom f_SPECBOND->end_of_step(); - // push back delete_Tcount on every step - if (delete_Nlimit > 0) - for (int i = delete_Nsteps-1; i > 0; i--) - delete_Tcount[i] = delete_Tcount[i-1]; - - if (ntimestep != nvalid) return; + if (ntimestep != nvalid) { + // push back delete_Tcount on every step + if (delete_Nlimit > 0) + for (int i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; + return; + } nlocal = atom->nlocal; @@ -1021,8 +1022,13 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) } } - if (delete_Nlimit) + + // push back delete_Tcount on every step + if (delete_Nlimit > 0) { + for (i = delete_Nsteps-1; i > 0; i--) + delete_Tcount[i] = delete_Tcount[i-1]; delete_Tcount[0] += this_delete_Tcount; + } if (ndel && (atom->map_style != Atom::MAP_NONE)) { atom->nghost = 0; From bebf79ec92251f04d36b40dfce0e7f627d74f50a Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 00:41:56 -0500 Subject: [PATCH 156/181] reaxff species delete_rate_limit keyword docs --- doc/src/fix_reaxff_species.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/src/fix_reaxff_species.rst b/doc/src/fix_reaxff_species.rst index 11f0e7b7e7..dcb9cfa3bc 100644 --- a/doc/src/fix_reaxff_species.rst +++ 
b/doc/src/fix_reaxff_species.rst
@@ -39,6 +39,9 @@ Syntax
   *masslimit* value = massmin massmax
     massmin = minimum molecular weight of species to delete
     massmax = maximum molecular weight of species to delete
+  *delete_rate_limit* value = Nlimit Nsteps
+    Nlimit = maximum number of deletions allowed to occur within interval
+    Nsteps = the interval (number of timesteps) over which to count deletions
 
 Examples
 """"""""
@@ -140,7 +143,13 @@ When using the *masslimit* keyword, each line of the *filedel* file
 contains the timestep on which deletions occurs, followed by how many
 of each species are deleted (with quantities preceding chemical
 formulae). The *specieslist* and *masslimit* keywords cannot both be
-used in the same *reaxff/species* fix.
+used in the same *reaxff/species* fix. The *delete_rate_limit*
+keyword can enforce an upper limit on the overall rate of molecule
+deletion. The number of deletion occurrences is limited to Nlimit
+within an interval of Nsteps timesteps. When using the
+*delete_rate_limit* keyword, no deletions are permitted to occur
+within the first Nsteps timesteps of the first run (after reading
+either a data or restart file).
 
 ----------
 

From 617d70dd1ca6b466ef784dadcb513286e0171d89 Mon Sep 17 00:00:00 2001
From: Trung Nguyen
Date: Fri, 20 Jan 2023 14:19:16 -0600
Subject: [PATCH 157/181] Replaced MPI_Wtime() with platform::walltime(), put
 the low-level timing breakdown inside #if DEBUG_AMOEBA

---
 src/AMOEBA/amoeba_convolution.cpp  | 16 ++++++++--------
 src/AMOEBA/amoeba_induce.cpp       | 26 ++++++++++++--------------
 src/AMOEBA/amoeba_multipole.cpp    |  6 +++---
 src/AMOEBA/amoeba_polar.cpp        |  6 +++---
 src/AMOEBA/pair_amoeba.cpp         | 26 ++++++++++++++------------
 src/GPU/amoeba_convolution_gpu.cpp |  8 ++++----
 src/GPU/pair_amoeba_gpu.cpp        | 23 +++++++++++------------
 src/GPU/pair_hippo_gpu.cpp         | 22 ++++++++++------------
 8 files changed, 65 insertions(+), 68 deletions(-)

diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp
index 028eb7717b..609df1184e 100644
--- a/src/AMOEBA/amoeba_convolution.cpp
+++ b/src/AMOEBA/amoeba_convolution.cpp
@@ -329,12 +329,12 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
@@ -394,12 +394,12 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
@@ -444,10 +444,10 @@ void *AmoebaConvolution::post_convolution_3d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   time_fft += time1 - time0;
 
@@ -495,11 +495,11 @@ void *AmoebaConvolution::post_convolution_4d()
   double time0,time1;
 
   MPI_Barrier(world);
-  time0 = MPI_Wtime();
+  time0 = platform::walltime();
 
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
 
-  time1 = MPI_Wtime();
+  time1 = platform::walltime();
 
   time_fft += time1 - time0;
 
diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp
index 031173060c..7ff9fe7121 100644
--- a/src/AMOEBA/amoeba_induce.cpp
+++ b/src/AMOEBA/amoeba_induce.cpp
@@ -532,8 +532,6 @@
void PairAmoeba::ufield0c(double **field, double **fieldp) int i,j; double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -546,18 +544,19 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } } + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // get the real space portion of the mutual field if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -781,8 +780,6 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) int i,j; double term; - double time0,time1,time2; - // zero out field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -797,11 +794,12 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_kspace_flag) udirect1(field); - time1 = MPI_Wtime(); + time1 = platform::walltime(); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { @@ -812,7 +810,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the real space portion of the permanent field if (polar_rspace_flag) udirect2b(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // get the self-energy portion of the permanent field @@ -873,11 +871,11 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // map 2 values to grid MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -918,11 +916,11 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // get potential MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 7269128080..f302194193 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -81,17 +81,17 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // compute the real space part of the Ewald summation if (mpole_rspace_flag) multipole_real(); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // compute the reciprocal space part of the Ewald summation if (mpole_kspace_flag) multipole_kspace(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // compute the Ewald self-energy term over all the atoms diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e0e8ecc1d9..e2b85ed22c 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -79,15 +79,15 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) polar_real(); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // compute the reciprocal space part of 
dipole interactions if (polar_kspace_flag) polar_kspace(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // compute the Ewald self-energy torque and virial terms diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index 677bc48344..a1b288348a 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -47,6 +47,7 @@ enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; #define DELTASTACK 16 +#define DEBUG_AMOEBA 0 /* ---------------------------------------------------------------------- */ @@ -371,7 +372,7 @@ void PairAmoeba::compute(int eflag, int vflag) double time0,time1,time2,time3,time4,time5,time6,time7,time8; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // if reneighboring step: // augment neighbor list to include 1-5 neighbor flags @@ -427,8 +428,7 @@ void PairAmoeba::compute(int eflag, int vflag) comm->forward_comm(this); if (amoeba) pbc_xred(); - - time1 = MPI_Wtime(); + time1 = platform::walltime(); // ---------------------------------------- // compute components of force field @@ -437,22 +437,22 @@ void PairAmoeba::compute(int eflag, int vflag) // buffered 14-7 Vdwl, pairwise if (amoeba && hal_flag) hal(); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // Pauli repulsion, pairwise if (!amoeba && repulse_flag) repulsion(); - time3 = MPI_Wtime(); + time3 = platform::walltime(); // Ewald dispersion, pairwise and long range if (!amoeba && (disp_rspace_flag || disp_kspace_flag)) dispersion(); - time4 = MPI_Wtime(); + time4 = platform::walltime(); // multipole, pairwise and long range if (mpole_rspace_flag || mpole_kspace_flag) multipole(); - time5 = MPI_Wtime(); + time5 = platform::walltime(); // induced dipoles, interative CG relaxation // communicate induce() output values needed by ghost atoms @@ -462,17 +462,17 @@ void PairAmoeba::compute(int eflag, int vflag) cfstyle = INDUCE; comm->forward_comm(this); } - time6 = MPI_Wtime(); + time6 = platform::walltime(); // dipoles, pairwise and long range if (polar_rspace_flag || polar_kspace_flag) polar(); - time7 = MPI_Wtime(); + time7 = platform::walltime(); // charge transfer, pairwise if (!amoeba && qxfer_flag) charge_transfer(); - time8 = MPI_Wtime(); + time8 = platform::walltime(); // store energy components for output by compute pair command @@ -535,8 +535,8 @@ void PairAmoeba::finish() MPI_Allreduce(&time_qxfer,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_qxfer = ave/comm->nprocs; + #if DEBUG_AMOEBA // real-space/kspace breakdown - MPI_Allreduce(&time_mpole_rspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mpole_rspace = ave/comm->nprocs; @@ -571,6 +571,7 @@ void PairAmoeba::finish() if (ic_kspace) time_mutual_fft = ic_kspace->time_fft; MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_mutual_fft = ave/comm->nprocs; + #endif // DEBUG_AMOEBA double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -591,6 +592,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," Qxfer time: {:.6g} {:.6g}\n", time_qxfer, time_qxfer/time_total); utils::logmesg(lmp," Total time: {:.6g}\n",time_total * 100.0); + #if DEBUG_AMOEBA double rspace_time = time_mpole_rspace + time_direct_rspace + time_mutual_rspace + time_polar_rspace; double kspace_time = time_mpole_kspace + time_direct_kspace + time_mutual_kspace + time_polar_kspace; @@ -607,7 +609,7 @@ void PairAmoeba::finish() utils::logmesg(lmp," - FFT : {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," - 
Interp : {:.6g} {:.3g}%\n", time_fphi_uind, time_fphi_uind/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); - + #endif } } diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index fd4aece6c8..908c9e409c 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -102,7 +102,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() double time0,time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); // perform forward FFT @@ -112,7 +112,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() fft1->compute(cfft,cfft,FFT3d::FORWARD); #endif - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fft += time1 - time0; @@ -146,11 +146,11 @@ void *AmoebaConvolutionGPU::post_convolution_4d() double time0,time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fft += time1 - time0; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 5bc2b3a48c..34605725a5 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -919,11 +919,8 @@ void PairAmoebaGPU::udirect2b_cpu() void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { - //int i,j; double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -934,16 +931,18 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual field first + double time0, time1, time2; + MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -1049,20 +1048,19 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } - double time0, time1; - // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid + double time0, time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -1102,11 +1100,12 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // get potential - time0 = MPI_Wtime(); + MPI_Barrier(world); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 1f0f3e820a..3049799433 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1016,8 +1016,6 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) { double term; - double time0,time1,time2; - // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; @@ -1028,16 +1026,17 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // get the real space portion of the mutual 
field first + double time0, time1, time2; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); if (polar_rspace_flag) umutual2b(field,fieldp); - time1 = MPI_Wtime(); + time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); - time2 = MPI_Wtime(); + time2 = platform::walltime(); // add the self-energy portion of the mutual field @@ -1123,21 +1122,19 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } - double time0, time1; - // gridpre = my portion of 4d grid in brick decomp w/ ghost values double ****gridpre = (double ****) ic_kspace->zero(); // map 2 values to grid - + double time0, time1; MPI_Barrier(world); - time0 = MPI_Wtime(); + time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT @@ -1177,11 +1174,12 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // get potential - time0 = MPI_Wtime(); + MPI_Barrier(world); + time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); - time1 = MPI_Wtime(); + time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method From ff709f5897c38a8d5d685a9957865dc5ae0dc27a Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 16:29:16 -0500 Subject: [PATCH 158/181] 'include' for std::shuffle --- src/REAXFF/fix_reaxff_species.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 6ad7b50226..caffcc08af 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -36,6 +36,7 @@ #include "pair_reaxff.h" #include "reaxff_defs.h" +#include #include #include #include From 846f00ce32d6e59cd7acd93b85a8d1eff4d2eea4 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 16:58:19 -0500 Subject: [PATCH 159/181] add citation --- src/REAXFF/fix_reaxff_species.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index caffcc08af..9bc70e7617 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -21,6 +21,7 @@ #include "atom.h" #include "atom_vec.h" +#include "citeme.h" #include "comm.h" #include "domain.h" #include "error.h" @@ -44,6 +45,17 @@ using namespace LAMMPS_NS; using namespace FixConst; +static const char cite_reaxff_species_delete[] = + "fix reaxff/species, 'delete' keyword: https://doi.org/10.1016/j.carbon.2022.11.002\n\n" + "@Article{Gissinger23,\n" + " author = {J. R. Gissinger, S. R. Zavada, J. G. Smith, J. Kemppainen, I. Gallegos, G. M. Odegard, E. J. Siochi, K. E. 
Wise},\n" + " title = {Predicting char yield of high-temperature resins},\n" + " journal = {Carbon},\n" + " year = 2023,\n" + " volume = 202,\n" + " pages = {336-347}\n" + "}\n\n"; + /* ---------------------------------------------------------------------- */ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : @@ -52,6 +64,8 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : x0(nullptr), BOCut(nullptr), fp(nullptr), pos(nullptr), fdel(nullptr), ele(nullptr), eletype(nullptr), filepos(nullptr), filedel(nullptr) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); + if (narg < 7) utils::missing_cmd_args(FLERR, "fix reaxff/species", error); force_reneighbor = 1; From 375fad6d2a132ddac5827b6f3b4b6b387e659970 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 17:13:56 -0500 Subject: [PATCH 160/181] parallel version --- src/REAXFF/fix_reaxff_species.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index 9bc70e7617..c3335a0397 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -906,11 +906,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) std::minstd_rand park_rng(rnd()); int *molrange; memory->create(molrange,Nmole,"reaxff/species:molrange"); - for (m = 0; m < Nmole; m++) - molrange[m] = m + 1; - // shuffle index when using rate_limit, in case order is biased - if (delete_Nlimit > 0) - std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + if (comm->me == 0) { + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + // shuffle index when using rate_limit, in case order is biased + if (delete_Nlimit > 0) + std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + } + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); int this_delete_Tcount = 0; for (int mm = 0; mm < Nmole; mm++) { From f6ded5a7d742e600cab2c88878be137d65c68b94 Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 20 Jan 2023 17:36:46 -0500 Subject: [PATCH 161/181] reduce unnecessary communication --- src/REAXFF/fix_reaxff_species.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index c3335a0397..a9bab28003 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -906,14 +906,14 @@ void FixReaxFFSpecies::DeleteSpecies(int Nmole, int Nspec) std::minstd_rand park_rng(rnd()); int *molrange; memory->create(molrange,Nmole,"reaxff/species:molrange"); - if (comm->me == 0) { - for (m = 0; m < Nmole; m++) - molrange[m] = m + 1; + for (m = 0; m < Nmole; m++) + molrange[m] = m + 1; + if (delete_Nlimit > 0) { // shuffle index when using rate_limit, in case order is biased - if (delete_Nlimit > 0) + if (comm->me == 0) std::shuffle(&molrange[0],&molrange[Nmole], park_rng); + MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); } - MPI_Bcast(&molrange[0], Nmole, MPI_INT, 0, world); int this_delete_Tcount = 0; for (int mm = 0; mm < Nmole; mm++) { From 8537ccb840e406b2049291c2ddfccbb05ce90063 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 21 Jan 2023 11:18:51 -0500 Subject: [PATCH 162/181] add CMake option to skip automatic download of large potential files --- cmake/CMakeLists.txt | 2 ++ cmake/Modules/LAMMPSUtils.cmake | 40 +++++++++++++++++---------------- 2 files changed, 23 insertions(+), 19 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 
0223750ace..767bbbfe34 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -566,6 +566,8 @@ RegisterStyles(${LAMMPS_SOURCE_DIR}) ######################################################## # Fetch missing external files and archives for packages ######################################################## +option(DOWNLOAD_POTENTIALS "Automatically download large potential files" ON) +mark_as_advanced(DOWNLOAD_POTENTIALS) foreach(PKG ${STANDARD_PACKAGES} ${EXTRA_PACKAGES} ${SUFFIX_PACKAGES}) if(PKG_${PKG}) FetchPotentials(${LAMMPS_SOURCE_DIR}/${PKG} ${LAMMPS_POTENTIALS_DIR}) diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index 9602379403..d42f91f10e 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -118,25 +118,27 @@ endfunction(GenerateBinaryHeader) # fetch missing potential files function(FetchPotentials pkgfolder potfolder) - if(EXISTS "${pkgfolder}/potentials.txt") - file(STRINGS "${pkgfolder}/potentials.txt" linelist REGEX "^[^#].") - foreach(line ${linelist}) - string(FIND ${line} " " blank) - math(EXPR plusone "${blank}+1") - string(SUBSTRING ${line} 0 ${blank} pot) - string(SUBSTRING ${line} ${plusone} -1 sum) - if(EXISTS "${LAMMPS_POTENTIALS_DIR}/${pot}") - file(MD5 "${LAMMPS_POTENTIALS_DIR}/${pot}" oldsum) - endif() - if(NOT sum STREQUAL oldsum) - message(STATUS "Downloading external potential ${pot} from ${LAMMPS_POTENTIALS_URL}") - string(MD5 TMP_EXT "${CMAKE_BINARY_DIR}") - file(DOWNLOAD "${LAMMPS_POTENTIALS_URL}/${pot}.${sum}" "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" - EXPECTED_HASH MD5=${sum} SHOW_PROGRESS) - file(COPY "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" DESTINATION "${LAMMPS_POTENTIALS_DIR}") - file(RENAME "${LAMMPS_POTENTIALS_DIR}/${pot}.${TMP_EXT}" "${LAMMPS_POTENTIALS_DIR}/${pot}") - endif() - endforeach() + if(DOWNLOAD_POTENTIALS) + if(EXISTS "${pkgfolder}/potentials.txt") + file(STRINGS "${pkgfolder}/potentials.txt" linelist REGEX "^[^#].") + foreach(line ${linelist}) + string(FIND ${line} " " blank) + math(EXPR plusone "${blank}+1") + string(SUBSTRING ${line} 0 ${blank} pot) + string(SUBSTRING ${line} ${plusone} -1 sum) + if(EXISTS "${LAMMPS_POTENTIALS_DIR}/${pot}") + file(MD5 "${LAMMPS_POTENTIALS_DIR}/${pot}" oldsum) + endif() + if(NOT sum STREQUAL oldsum) + message(STATUS "Downloading external potential ${pot} from ${LAMMPS_POTENTIALS_URL}") + string(RANDOM LENGTH 10 TMP_EXT) + file(DOWNLOAD "${LAMMPS_POTENTIALS_URL}/${pot}.${sum}" "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" + EXPECTED_HASH MD5=${sum} SHOW_PROGRESS) + file(COPY "${CMAKE_BINARY_DIR}/${pot}.${TMP_EXT}" DESTINATION "${LAMMPS_POTENTIALS_DIR}") + file(RENAME "${LAMMPS_POTENTIALS_DIR}/${pot}.${TMP_EXT}" "${LAMMPS_POTENTIALS_DIR}/${pot}") + endif() + endforeach() + endif() endif() endfunction(FetchPotentials) From 658328dd9d57bc84d308affe0327217a42f9e947 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Sun, 22 Jan 2023 17:24:15 -0600 Subject: [PATCH 163/181] Added a note in the amoeba doc page on the not-yet resolved issue with integrated GPUs, removed commented out and debugging stuffs in the AM/HP kernels --- doc/src/pair_amoeba.rst | 5 ++ lib/gpu/lal_amoeba.cu | 87 +--------------------------- lib/gpu/lal_hippo.cu | 123 +++++----------------------------------- 3 files changed, 20 insertions(+), 195 deletions(-) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 113ae560f7..ab82fa5593 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -200,6 +200,11 @@ These pair styles can only be used via the 
*pair* keyword of the .. include:: accel_styles.rst +.. note:: + + There is a unresolved issue with the `amoeba/gpu` and `hippo/gpu` + pair styles with the OpenCL build when running on integrated GPUs. + ---------- Restrictions diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index b3bbabadc3..68d15cfb47 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -//#include + #include "lal_aux_fun1.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -448,20 +448,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -825,12 +803,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); - numtyp bn[4],bcn[3]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ + numtyp bn[4], bcn[3]; bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -849,7 +822,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, if (damp != (numtyp)0.0) { numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype] if (pgamma != (numtyp)0.0) { - //damp = pgamma * ucl_powr(r/damp,(numtyp)1.5); numtyp tmp = r*ucl_recip(damp); damp = pgamma * ucl_sqrt(tmp*tmp*tmp); if (damp < (numtyp)50.0) { @@ -860,7 +832,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, } } else { pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); numtyp tmp = r*ucl_recip(damp); damp = pgamma * (tmp*tmp*tmp); if (damp < (numtyp)50.0) { @@ -930,7 +901,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; @@ -939,8 +909,6 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); - //numtyp4 xi__; - if (iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1249,7 +1200,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr; numtyp qkr = qkx*xr + qky*yr + qkz*zr; numtyp uir = uix*xr + uiy*yr + uiz*zr; - //numtyp uirp = uixp*xr + uiyp*yr + uizp*zr; numtyp ukr = ukx*xr + uky*yr + ukz*zr; numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr; @@ -1280,15 +1230,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp drc3[3],drc5[3],drc7[3]; numtyp urc3[3],urc5[3]; - numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1318,7 +1262,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype] if (damp != (numtyp)0.0) { numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype] - //damp = pgamma * ucl_powr(r/damp,(numtyp)3.0); numtyp tmp = r*ucl_recip(damp); damp = pgamma * (tmp*tmp*tmp); 
if (damp < (numtyp)50.0) { @@ -1614,9 +1557,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumulate ufld and dufld to compute tep store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1746,17 +1686,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, int i = (igridx - nxlo_out) - nlpts; for (int ib = 0; ib < bsorder; ib++) { - /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; - t0_1 += tq_1*thetai1[m][ib][0]; - t1_1 += tq_1*thetai1[m][ib][1]; - t2_1 += tq_1*thetai1[m][ib][2]; - t0_2 += tq_2*thetai1[m][ib][0]; - t1_2 += tq_2*thetai1[m][ib][1]; - t2_2 += tq_2*thetai1[m][ib][2]; - t3 += (tq_1+tq_2)*thetai1[m][ib][3]; - */ const int i1 = istart + ib; const numtyp4 tha1 = thetai1[i1]; const int gidx = my + i; // k*ngridxy + j*ngridx + i; @@ -1963,12 +1892,6 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -1988,12 +1911,6 @@ __kernel void k_amoeba_fphi_mpole(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index a5fca5cc80..0647a736a8 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -14,7 +14,7 @@ // *************************************************************************** #if defined(NV_KERNEL) || defined(USE_HIP) -#include + #include "lal_hippo_extra.h" #ifdef LAMMPS_SMALLBIG #define tagint int @@ -455,8 +455,6 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -467,7 +465,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, } const numtyp4 pol1i = polar1[i]; - //numtyp ci = pol1i.x; // rpole[i][0]; + //numtyp ci = pol1i.x; // rpole[i][0]; numtyp dix = pol1i.y; // rpole[i][1]; numtyp diy = pol1i.z; // rpole[i][2]; numtyp diz = pol1i.w; // rpole[i][3]; @@ -490,7 +488,6 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = jx.x - ix.x; @@ -502,18 +499,18 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, const numtyp4 pol1j = polar1[j]; //numtyp ck = pol1j.x; // rpole[j][0]; - numtyp dkx = pol1j.y; // rpole[j][1]; - numtyp dky = pol1j.z; // rpole[j][2]; - numtyp dkz = pol1j.w; // rpole[j][3]; + numtyp dkx = pol1j.y; // rpole[j][1]; + numtyp dky = pol1j.z; // rpole[j][2]; + numtyp dkz = pol1j.w; // rpole[j][3]; const numtyp4 pol2j = polar2[j]; - numtyp qkxx = pol2j.x; // rpole[j][4]; - numtyp qkxy = pol2j.y; // rpole[j][5]; - numtyp qkxz = pol2j.z; 
// rpole[j][6]; - numtyp qkyy = pol2j.w; // rpole[j][8]; + numtyp qkxx = pol2j.x; // rpole[j][4]; + numtyp qkxy = pol2j.y; // rpole[j][5]; + numtyp qkxz = pol2j.z; // rpole[j][6]; + numtyp qkyy = pol2j.w; // rpole[j][8]; const numtyp4 pol3j = polar3[j]; - numtyp qkyz = pol3j.x; // rpole[j][9]; - numtyp qkzz = pol3j.y; // rpole[j][12]; - int jtype = pol3j.z; // amtype[j]; + numtyp qkyz = pol3j.x; // rpole[j][9]; + numtyp qkzz = pol3j.y; // rpole[j][12]; + int jtype = pol3j.z; // amtype[j]; numtyp sizk = coeff_rep[jtype].x; // sizpr[jtype]; numtyp dmpk = coeff_rep[jtype].y; // dmppr[jtype]; @@ -776,7 +773,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, int j = jextra & NEIGHMASK15; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; - //int jtype=jx.w; // Compute r12 numtyp xr = ix.x - jx.x; @@ -784,8 +780,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, numtyp zr = ix.z - jx.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - int jtype = polar3[j].z; // amtype[j]; int jclass = coeff_amtype[jtype].w; // amtype2class[jtype]; numtyp ck = coeff_amclass[jclass].x; // csix[jclass]; @@ -886,9 +880,6 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, } // iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -1090,11 +1078,6 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1213,9 +1196,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, // accumulate tq store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep); - // accumate force, energy and virial: use _acc if not the first kernel - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); + // accumate force, energy and virial store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1294,8 +1275,6 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_recip(r); numtyp r2inv = rinv*rinv; @@ -1345,11 +1324,6 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -1439,9 +1413,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); - //numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); - - //numtyp4 xi__; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -1494,7 +1461,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, const numtyp4 pol3j = polar3[j]; int jtype = pol3j.z; // amtype[j]; - //int jgroup = pol3j.w; // amgroup[j]; const numtyp4 pol4j = polar4[j]; numtyp ukx = 
pol4j.x; // uind[j][0]; numtyp uky = pol4j.y; // uind[j][1]; @@ -1516,11 +1482,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[4]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp aefac = aesq2n; @@ -1546,9 +1507,6 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_, tdipdip[3] = -rr3ik + rr5ik*yr*yr; tdipdip[4] = rr5ik*yr*zr; tdipdip[5] = -rr3ik + rr5ik*zr*zr; - //if (i==0 && j == 10) - // printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n", - // i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]); numtyp fid[3]; fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz; @@ -1638,8 +1596,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, n_stride,nbor_end,nbor); numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; - //numtyp qtmp; fetch(qtmp,i,q_tex); - //int itype=ix.w; // recalculate numj and nbor_end for use of the short nbor list if (dev_packed==dev_nbor) { @@ -1672,9 +1628,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, uiyp = pol5i.y; // uinp[i][1]; uizp = pol5i.z; // uinp[i][2]; - // debug: - // xi__ = ix; xi__.w = itype; - numtyp corei = coeff_amclass[itype].z; // pcore[iclass]; numtyp alphai = coeff_amclass[itype].w; // palpha[iclass]; numtyp vali = polar6[i].x; @@ -1692,8 +1645,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp zr = jx.z - ix.z; numtyp r2 = xr*xr + yr*yr + zr*zr; - //if (r2>off2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; @@ -1759,11 +1710,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[5]; - /* - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha); - numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a; - bn[0] = _erfc * rinv; - */ bn[0] = ucl_erfc(ralpha) * rinv; numtyp alsq2 = (numtyp)2.0 * aewald*aewald; @@ -1824,7 +1770,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir; dufld[5] += zr*tiz5 + zr*zr*tuir; - // get the field gradient for direct polarization force numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i; @@ -1962,7 +1907,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, numtyp frcz = (numtyp)-2.0 * depz; numtyp term1,term2,term3; - //numtyp term4,term5,term6,term7; // get the dEp/dR terms used for direct polarization force // poltyp == MUTUAL && hippo @@ -2039,8 +1983,6 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_, store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); // accumate force, energy and virial - //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, - // offset,eflag,vflag,ans,engv); store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -2063,10 +2005,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, const int nxlo_out, const int ngridxy, const int ngridx) { - //int tid, ii, offset, i, n_stride; - //atom_info(t_per_atom,ii,tid,offset); - - int tid=THREAD_ID_X; int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X; @@ -2125,12 +2063,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 
0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -2162,12 +2094,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; @@ -2184,17 +2110,6 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, int i = (igridx - nxlo_out) - nlpts; for (int ib = 0; ib < bsorder; ib++) { - /* - tq_1 = grid[k][j][i][0]; - tq_2 = grid[k][j][i][1]; - t0_1 += tq_1*thetai1[m][ib][0]; - t1_1 += tq_1*thetai1[m][ib][1]; - t2_1 += tq_1*thetai1[m][ib][2]; - t0_2 += tq_2*thetai1[m][ib][0]; - t1_2 += tq_2*thetai1[m][ib][1]; - t2_2 += tq_2*thetai1[m][ib][2]; - t3 += (tq_1+tq_2)*thetai1[m][ib][3]; - */ int i1 = istart + ib; numtyp4 tha1 = thetai1[i1]; numtyp w0 = tha1.x; @@ -2403,12 +2318,6 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, int k = (igridz - nzlo_out) - nlpts; for (int kb = 0; kb < bsorder; kb++) { - /* - v0 = thetai3[m][kb][0]; - v1 = thetai3[m][kb][1]; - v2 = thetai3[m][kb][2]; - v3 = thetai3[m][kb][3]; - */ int i3 = istart + kb; numtyp4 tha3 = thetai3[i3]; numtyp v0 = tha3.x; @@ -2428,12 +2337,6 @@ __kernel void k_hippo_fphi_mpole(const __global numtyp4 *restrict thetai1, int j = (igridy - nylo_out) - nlpts; for (int jb = 0; jb < bsorder; jb++) { - /* - u0 = thetai2[m][jb][0]; - u1 = thetai2[m][jb][1]; - u2 = thetai2[m][jb][2]; - u3 = thetai2[m][jb][3]; - */ int i2 = istart + jb; numtyp4 tha2 = thetai2[i2]; numtyp u0 = tha2.x; From 8e79e2efa5a971372574ed28f9d441f5a9293aed Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Mon, 23 Jan 2023 00:18:42 -0600 Subject: [PATCH 164/181] More cleanup, fixed bugs with hippo fphi kernels for mixed precision --- lib/gpu/lal_amoeba.cu | 2 +- lib/gpu/lal_hippo.cu | 79 +++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 68d15cfb47..6317ba8d94 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -1557,6 +1557,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, // accumulate ufld and dufld to compute tep store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep); + // accumate force, energy and virial store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom, offset,eflag,vflag,ans,engv,NUM_BLOCKS_X); } @@ -1834,7 +1835,6 @@ __kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1, } } - /* ---------------------------------------------------------------------- fphi_mpole = multipole potential from grid fphi_mpole extracts the permanent multipole potential from diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 0647a736a8..1611e8aece 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -1996,10 +1996,10 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, const __global numtyp4 *restrict thetai2, const __global numtyp4 *restrict thetai3, const __global int *restrict igrid, - const __global numtyp *restrict grid, - __global numtyp *restrict fdip_phi1, - __global numtyp *restrict fdip_phi2, - __global numtyp *restrict fdip_sum_phi, + const __global numtyp2 *restrict grid, + 
__global acctyp *restrict fdip_phi1, + __global acctyp *restrict fdip_phi2, + __global acctyp *restrict fdip_sum_phi, const int bsorder, const int inum, const int nzlo_out, const int nylo_out, const int nxlo_out, const int ngridxy, @@ -2010,12 +2010,12 @@ __kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1, if (ii Date: Mon, 23 Jan 2023 17:30:35 -0500 Subject: [PATCH 165/181] Update fix_reaxff_species.cpp --- src/REAXFF/fix_reaxff_species.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/REAXFF/fix_reaxff_species.cpp b/src/REAXFF/fix_reaxff_species.cpp index a9bab28003..ce04be2cc8 100644 --- a/src/REAXFF/fix_reaxff_species.cpp +++ b/src/REAXFF/fix_reaxff_species.cpp @@ -64,8 +64,6 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : x0(nullptr), BOCut(nullptr), fp(nullptr), pos(nullptr), fdel(nullptr), ele(nullptr), eletype(nullptr), filepos(nullptr), filedel(nullptr) { - if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); - if (narg < 7) utils::missing_cmd_args(FLERR, "fix reaxff/species", error); force_reneighbor = 1; @@ -283,6 +281,7 @@ FixReaxFFSpecies::FixReaxFFSpecies(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR, "Incompatible combination fix reaxff/species command options"); if (delete_Nlimit > 0) { + if (lmp->citeme) lmp->citeme->add(cite_reaxff_species_delete); memory->create(delete_Tcount,delete_Nsteps,"reaxff/species:delete_Tcount"); for (int i = 0; i < delete_Nsteps; i++) From 5014e0434170d0bfbff9532acdc766d0bf8979eb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 08:40:08 -0600 Subject: [PATCH 166/181] Removed commented out code, ensured that ic_kspace is not nullptr when call precompute_kspace for hippo/gpu --- lib/gpu/lal_base_amoeba.cpp | 11 +++-------- src/GPU/pair_hippo_gpu.cpp | 12 ++++++------ 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 21d9975b28..88dd10eab1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -573,7 +573,8 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, int numel = _num_grid_points; if (_cgrid_brick.cols() == 0) { - _cgrid_brick.alloc(numel, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); + int nsize=(int)(((double)numel)*1.1); + _cgrid_brick.alloc(nsize, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_ONLY); } else if (numel > (int)_cgrid_brick.cols()) { _cgrid_brick.resize(numel); } @@ -689,13 +690,7 @@ int BaseAmoebaT::fphi_mpole() { const int BX=block_size(); const int GX=static_cast(ceil(static_cast(ainum)/BX)); - /* - const int cus = device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/BX)); - } - */ + time_pair.start(); int ngridxy = _ngridx * _ngridy; k_fphi_mpole.set_size(GX,BX); diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 3049799433..8611c1b56a 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -475,12 +475,12 @@ void PairHippoGPU::induce() // allocate memory and make early host-device transfers // must be done before the first ufield0c - - hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, - thetai3, igrid, - ic_kspace->nzlo_out, ic_kspace->nzhi_out, - ic_kspace->nylo_out, ic_kspace->nyhi_out, - ic_kspace->nxlo_out, ic_kspace->nxhi_out); + if (ic_kspace) + hippo_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, + thetai3, igrid, + ic_kspace->nzlo_out, 
ic_kspace->nzhi_out, + ic_kspace->nylo_out, ic_kspace->nyhi_out, + ic_kspace->nxlo_out, ic_kspace->nxhi_out); // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating From aaa918cbe74eabab131dbb7971d43265b96bf6ee Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 17:05:48 -0600 Subject: [PATCH 167/181] Fixed bugs with access mode on the host side of thetai[1-3] --- lib/gpu/lal_base_amoeba.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 88dd10eab1..e80fa01c2b 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -484,9 +484,9 @@ void BaseAmoebaT::precompute_kspace(const int inum_full, const int bsorder, if (_max_thetai_size == 0) { _max_thetai_size = static_cast(static_cast(inum_full)*1.10); - _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); - _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY); + _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); + _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_WRITE_ONLY,UCL_READ_ONLY); _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE); From 40c8fcb03aab95e85908285aafe3bbabfdfa74e2 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:05:36 -0500 Subject: [PATCH 168/181] disallow using single precision FFTs with AMOEBA package --- cmake/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index a6956f5f5d..8184f9784d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -527,6 +527,11 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE endif() endforeach() +# AMOEBA is not compatible with single precision FFTs +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "Package AMOEBA is not compatible with single precision FFTs") +endif() + # optionally enable building script wrappers using swig option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF) if(WITH_SWIG) From dec3afe5956c37d2e514bdd1c6345bbecc7b299e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:15:37 -0500 Subject: [PATCH 169/181] make synchronization for timers optional. 
only enable with "timer sync" --- src/AMOEBA/amoeba_convolution.cpp | 9 +++++---- src/AMOEBA/amoeba_induce.cpp | 9 +++++---- src/AMOEBA/amoeba_multipole.cpp | 3 ++- src/AMOEBA/amoeba_polar.cpp | 3 ++- src/AMOEBA/pair_amoeba.cpp | 3 ++- 5 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index 609df1184e..e58bb33b41 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -22,6 +22,7 @@ #include "memory.h" #include "neighbor.h" #include "remap_wrap.h" +#include "timer.h" #include #include @@ -328,7 +329,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // perform forward FFT @@ -393,7 +394,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // perform forward FFT @@ -443,7 +444,7 @@ void *AmoebaConvolution::post_convolution_3d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); @@ -494,7 +495,7 @@ void *AmoebaConvolution::post_convolution_4d() double time0,time1; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fft2->compute(cfft,cfft,FFT3d::BACKWARD); diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 7ff9fe7121..6ac8148c59 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -24,6 +24,7 @@ #include "math_special.h" #include "my_page.h" #include "neigh_list.h" +#include "timer.h" #include @@ -545,7 +546,7 @@ void PairAmoeba::ufield0c(double **field, double **fieldp) } double time0, time1, time2; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // get the real space portion of the mutual field @@ -795,7 +796,7 @@ void PairAmoeba::dfield0c(double **field, double **fieldp) // get the reciprocal space part of the permanent field double time0, time1, time2; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); if (polar_kspace_flag) udirect1(field); @@ -870,7 +871,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // map 2 values to grid - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); @@ -915,7 +916,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // get potential - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index f302194193..848e1a13cb 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -21,6 +21,7 @@ #include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include @@ -80,7 +81,7 @@ void PairAmoeba::multipole() felec = electric / am_dielectric; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // compute the real space part of the Ewald summation diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e2b85ed22c..e817e706dc 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -21,6 +21,7 @@ 
#include "math_const.h" #include "math_special.h" #include "neigh_list.h" +#include "timer.h" #include #include @@ -78,7 +79,7 @@ void PairAmoeba::polar() // compute the real space part of the dipole interactions - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); if (polar_rspace_flag) polar_real(); diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index a1b288348a..0812fe43f0 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -29,6 +29,7 @@ #include "my_page.h" #include "neigh_list.h" #include "neighbor.h" +#include "timer.h" #include "update.h" #include @@ -371,7 +372,7 @@ void PairAmoeba::compute(int eflag, int vflag) double time0,time1,time2,time3,time4,time5,time6,time7,time8; - MPI_Barrier(world); + if (timer->has_sync()) MPI_Barrier(world); time0 = platform::walltime(); // if reneighboring step: From b17689af6be379c67f9332ab88790528307c4f3b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 21:28:08 -0500 Subject: [PATCH 170/181] doc fixes --- doc/src/fix_rigid.rst | 4 ++-- doc/src/pair_amoeba.rst | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/src/fix_rigid.rst b/doc/src/fix_rigid.rst index 9a958e50d1..3a2477f90a 100644 --- a/doc/src/fix_rigid.rst +++ b/doc/src/fix_rigid.rst @@ -732,8 +732,8 @@ choices: * Use one of the 4 NPT or NPH styles for the rigid bodies. Use the *dilate* all option so that it will dilate the positions of the - *non-rigid particles as well. Use :doc:`fix nvt ` (or any - *other thermostat) for the non-rigid particles. + non-rigid particles as well. Use :doc:`fix nvt ` (or any + other thermostat) for the non-rigid particles. * Use :doc:`fix npt ` for the group of non-rigid particles. Use the *dilate* all option so that it will dilate the center-of-mass positions of the rigid bodies as well. Use one of the 4 NVE or 2 NVT diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index ab82fa5593..79b3daf22f 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -202,8 +202,9 @@ These pair styles can only be used via the *pair* keyword of the .. note:: - There is a unresolved issue with the `amoeba/gpu` and `hippo/gpu` - pair styles with the OpenCL build when running on integrated GPUs. + Using the GPU accelerated pair styles 'amoeba/gpu' or 'hippo/gpu' + when compiling the GPU package for OpenCL has a few known issues + when running on integrated GPUs and the calculation may crash. 
---------- From 878681999321df0e1cb2a0cfaff4678b0fd81e0f Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 22:31:49 -0500 Subject: [PATCH 171/181] use FFT_SCALAR more consistently to perhaps support single precision FFT some time also, use "override" instead of virtual and add a forgotten virtual --- src/AMOEBA/amoeba_convolution.cpp | 4 ++-- src/AMOEBA/amoeba_convolution.h | 10 ++++----- src/AMOEBA/amoeba_dispersion.cpp | 4 ++-- src/AMOEBA/amoeba_induce.cpp | 12 +++++------ src/AMOEBA/amoeba_kspace.cpp | 10 ++++----- src/AMOEBA/amoeba_multipole.cpp | 10 +++------ src/AMOEBA/amoeba_polar.cpp | 28 ++++++++++++------------ src/AMOEBA/pair_amoeba.h | 12 +++++------ src/GPU/pair_amoeba_gpu.cpp | 36 +++++++++++++++---------------- src/GPU/pair_amoeba_gpu.h | 25 +++++++++++---------- src/GPU/pair_hippo_gpu.cpp | 10 ++++----- src/GPU/pair_hippo_gpu.h | 26 +++++++++++----------- 12 files changed, 91 insertions(+), 96 deletions(-) diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index e58bb33b41..ae3dbf16c4 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -338,7 +338,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } @@ -403,7 +403,7 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() time1 = platform::walltime(); if (SCALE) { - double scale = 1.0/nfft_global; + FFT_SCALAR scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 44dc5b1687..bed65149ec 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -38,7 +38,7 @@ class AmoebaConvolution : protected Pointers { int nxlo_out, nxhi_out, nylo_out, nyhi_out, nzlo_out, nzhi_out; int nxlo_fft, nxhi_fft, nylo_fft, nyhi_fft, nzlo_fft, nzhi_fft; bigint nfft_global; // nx * ny * nz - double *grid_brick_start; // lower left corner of (c)grid_brick data + FFT_SCALAR *grid_brick_start; // lower left corner of (c)grid_brick data AmoebaConvolution(class LAMMPS *, class Pair *, int, int, int, int, int); ~AmoebaConvolution(); @@ -61,14 +61,14 @@ class AmoebaConvolution : protected Pointers { class Grid3d *gc; class Remap *remap; - double ***grid_brick; // 3d real brick grid with ghosts - double ****cgrid_brick; // 4d complex brick grid with ghosts + FFT_SCALAR ***grid_brick; // 3d real brick grid with ghosts + FFT_SCALAR ****cgrid_brick; // 4d complex brick grid with ghosts FFT_SCALAR *grid_fft; // 3d FFT grid as 1d vector FFT_SCALAR *cfft; // 3d complex FFT grid as 1d vector - double *gc_buf1, *gc_buf2; // buffers for GridComm - double *remap_buf; // buffer for Remap + FFT_SCALAR *gc_buf1, *gc_buf2; // buffers for GridComm + FFT_SCALAR *remap_buf; // buffer for Remap void allocate_grid(); void deallocate_grid(); diff --git a/src/AMOEBA/amoeba_dispersion.cpp b/src/AMOEBA/amoeba_dispersion.cpp index f3af921d85..cc283f22d2 100644 --- a/src/AMOEBA/amoeba_dispersion.cpp +++ b/src/AMOEBA/amoeba_dispersion.cpp @@ -285,7 +285,7 @@ void PairAmoeba::dispersion_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) d_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) d_kspace->zero(); // map atoms to grid @@ -294,7 +294,7 @@ void PairAmoeba::dispersion_kspace() // 
pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = d_kspace->pre_convolution(); + FFT_SCALAR *gridfft = d_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/amoeba_induce.cpp b/src/AMOEBA/amoeba_induce.cpp index 6ac8148c59..ecc20a198c 100644 --- a/src/AMOEBA/amoeba_induce.cpp +++ b/src/AMOEBA/amoeba_induce.cpp @@ -867,7 +867,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -882,7 +882,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -912,7 +912,7 @@ void PairAmoeba::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1090,7 +1090,7 @@ void PairAmoeba::udirect1(double **field) // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by setup() - double ***gridpre = (double ***) i_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) i_kspace->zero(); // map multipole moments to grid @@ -1099,7 +1099,7 @@ void PairAmoeba::udirect1(double **field) // pre-convolution operations including forward FFT // gridfft = my 1d portion of complex 3d grid in FFT decomp - double *gridfft = i_kspace->pre_convolution(); + FFT_SCALAR *gridfft = i_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1144,7 +1144,7 @@ void PairAmoeba::udirect1(double **field) // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) i_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) i_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_kspace.cpp b/src/AMOEBA/amoeba_kspace.cpp index c47e734c5e..6d2fb64dd6 100644 --- a/src/AMOEBA/amoeba_kspace.cpp +++ b/src/AMOEBA/amoeba_kspace.cpp @@ -523,7 +523,7 @@ void PairAmoeba::frac_to_cart() grid_mpole maps fractional atomic multipoles to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_mpole(double **fmp, double ***grid) +void PairAmoeba::grid_mpole(double **fmp, FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -596,7 +596,7 @@ void PairAmoeba::grid_mpole(double **fmp, double ***grid) the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_mpole(double ***grid, double **fphi) +void PairAmoeba::fphi_mpole(FFT_SCALAR ***grid, double **fphi) { int i,j,k,m,ib,jb,kb; double v0,v1,v2,v3; @@ -740,7 +740,7 @@ void PairAmoeba::fphi_mpole(double ***grid, double **fphi) grid_uind maps fractional induced dipoles to the PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_uind(double 
**fuind, double **fuinp, double ****grid) +void PairAmoeba::grid_uind(double **fuind, double **fuinp, FFT_SCALAR ****grid) { int i,j,k,m,ib,jb,kb; double v0,u0,t0; @@ -791,7 +791,7 @@ void PairAmoeba::grid_uind(double **fuind, double **fuinp, double ****grid) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoeba::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { int i,j,k,m,ib,jb,kb; @@ -1040,7 +1040,7 @@ void PairAmoeba::fphi_uind(double ****grid, double **fdip_phi1, grid_disp maps dispersion coefficients to PME grid ------------------------------------------------------------------------- */ -void PairAmoeba::grid_disp(double ***grid) +void PairAmoeba::grid_disp(FFT_SCALAR ***grid) { int i,j,k,m,ib,jb,kb,itype,iclass; double v0,u0,t0; diff --git a/src/AMOEBA/amoeba_multipole.cpp b/src/AMOEBA/amoeba_multipole.cpp index 848e1a13cb..a1503a91f3 100644 --- a/src/AMOEBA/amoeba_multipole.cpp +++ b/src/AMOEBA/amoeba_multipole.cpp @@ -452,10 +452,6 @@ void PairAmoeba::multipole_real() rr9 = bn[4] - scalek*rr9; rr11 = bn[5] - scalek*rr11; e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9; - if (i == 0 && j < 10) { - //printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5); - //printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]); - } // find standard multipole intermediates for force and torque @@ -662,7 +658,7 @@ void PairAmoeba::multipole_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) m_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) m_kspace->zero(); // map atoms to grid @@ -671,7 +667,7 @@ void PairAmoeba::multipole_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = m_kspace->pre_convolution(); + FFT_SCALAR *gridfft = m_kspace->pre_convolution(); // --------------------- // convolution operation @@ -742,7 +738,7 @@ void PairAmoeba::multipole_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) m_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) m_kspace->post_convolution(); // get potential diff --git a/src/AMOEBA/amoeba_polar.cpp b/src/AMOEBA/amoeba_polar.cpp index e817e706dc..3c51426beb 100644 --- a/src/AMOEBA/amoeba_polar.cpp +++ b/src/AMOEBA/amoeba_polar.cpp @@ -1340,7 +1340,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1349,7 +1349,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1399,7 +1399,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) 
p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1432,7 +1432,7 @@ void PairAmoeba::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1441,7 +1441,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1464,7 +1464,7 @@ void PairAmoeba::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1870,7 +1870,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1900,7 +1900,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1909,7 +1909,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1966,7 +1966,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1975,12 +1975,12 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? 
to the PME grid @@ -1995,7 +1995,7 @@ void PairAmoeba::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -2004,7 +2004,7 @@ void PairAmoeba::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h index f14be4bd11..cdeee6c95f 100644 --- a/src/AMOEBA/pair_amoeba.h +++ b/src/AMOEBA/pair_amoeba.h @@ -381,7 +381,7 @@ class PairAmoeba : public Pair { virtual void induce(); void ulspred(); - void ufield0c(double **, double **); + virtual void ufield0c(double **, double **); void uscale0b(int, double **, double **, double **, double **); void dfield0c(double **, double **); virtual void umutual1(double **, double **); @@ -407,11 +407,11 @@ class PairAmoeba : public Pair { void fphi_to_cphi(double **, double **); void frac_to_cart(); - void grid_mpole(double **, double ***); - void fphi_mpole(double ***, double **); - void grid_uind(double **, double **, double ****); - virtual void fphi_uind(double ****, double **, double **, double **); - void grid_disp(double ***); + void grid_mpole(double **, FFT_SCALAR ***); + void fphi_mpole(FFT_SCALAR ***, double **); + void grid_uind(double **, double **, FFT_SCALAR ****); + virtual void fphi_uind(FFT_SCALAR ****, double **, double **, double **); + void grid_disp(FFT_SCALAR ***); void kewald(); void kewald_parallel(int, int, int, int, int &, int &, int &, int &, int &, int &, int &, int &, diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 34605725a5..4213946f38 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -1050,7 +1050,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -1066,7 +1066,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1096,7 +1096,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1150,7 +1150,7 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1, +void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { if (!gpu_fphi_uind_ready) { @@ -1422,7 +1422,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ 
ghost values - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1431,7 +1431,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1481,7 +1481,7 @@ void PairAmoebaGPU::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values - double ***gridpost = (double ***) p_kspace->post_convolution(); + FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential @@ -1539,7 +1539,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre2 = (double ****) pc_kspace->zero(); + FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid @@ -1548,7 +1548,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = pc_kspace->pre_convolution(); + FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1571,7 +1571,7 @@ void PairAmoebaGPU::polar_kspace() // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) pc_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential @@ -1819,7 +1819,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1849,7 +1849,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1858,7 +1858,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1915,7 +1915,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() - double ***gridpre = (double ***) p_kspace->zero(); + FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1924,12 +1924,12 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; - memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(double)); + memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? 
to the PME grid @@ -1944,7 +1944,7 @@ void PairAmoebaGPU::polar_kspace() // gridpre = my portion of 3d grid in brick decomp w/ ghost values - gridpre = (double ***) p_kspace->zero(); + gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid @@ -1953,7 +1953,7 @@ void PairAmoebaGPU::polar_kspace() // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector - double *gridfft2 = p_kspace->pre_convolution(); + FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index b7230594c5..c9b9b73a58 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -27,23 +27,22 @@ namespace LAMMPS_NS { class PairAmoebaGPU : public PairAmoeba { public: PairAmoebaGPU(LAMMPS *lmp); - ~PairAmoebaGPU(); - void init_style(); - double memory_usage(); + ~PairAmoebaGPU() override; + void init_style() override; + double memory_usage() override; enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void induce(); + void induce() override; - //virtual void dispersion_real(); - virtual void multipole_real(); - virtual void udirect2b(double **, double **); - virtual void umutual1(double **, double **); - virtual void fphi_uind(double ****, double **, double **, double **); - virtual void umutual2b(double **, double **); - virtual void ufield0c(double **, double **); - virtual void polar_real(); - virtual void polar_kspace(); + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; + void polar_kspace() override; private: int gpu_mode; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 8611c1b56a..83c72d5252 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -1124,7 +1124,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // gridpre = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpre = (double ****) ic_kspace->zero(); + FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid @@ -1140,7 +1140,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition - double *gridfft = ic_kspace->pre_convolution(); + FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation @@ -1170,7 +1170,7 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values - double ****gridpost = (double ****) ic_kspace->post_convolution(); + FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential @@ -1231,8 +1231,8 @@ void PairHippoGPU::umutual1(double **field, double **fieldp) fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ -void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1, - double **fdip_phi2, double **fdip_sum_phi) +void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, + double **fdip_phi2, double 
**fdip_sum_phi) { if (!gpu_fphi_uind_ready) { PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 44bebd29f3..7955c97470 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -27,23 +27,23 @@ namespace LAMMPS_NS { class PairHippoGPU : public PairAmoeba { public: PairHippoGPU(LAMMPS *lmp); - ~PairHippoGPU(); - void init_style(); - double memory_usage(); + ~PairHippoGPU() override; + void init_style() override; + double memory_usage() override; enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; - virtual void induce(); + void induce() override; - virtual void repulsion(); - virtual void dispersion_real(); - virtual void multipole_real(); - virtual void udirect2b(double **, double **); - virtual void umutual1(double **, double **); - virtual void fphi_uind(double ****, double **, double **, double **); - virtual void umutual2b(double **, double **); - virtual void ufield0c(double **, double **); - virtual void polar_real(); + void repulsion() override; + void dispersion_real() override; + void multipole_real() override; + void udirect2b(double **, double **) override; + void umutual1(double **, double **) override; + void fphi_uind(FFT_SCALAR ****, double **, double **, double **) override; + void umutual2b(double **, double **) override; + void ufield0c(double **, double **) override; + void polar_real() override; private: int gpu_mode; From 6c63d7dcb92553dd9f5e284c6db8ef3a6c2b5765 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 22:54:47 -0500 Subject: [PATCH 172/181] single precision FFTs are now supported on the CPU --- cmake/CMakeLists.txt | 5 ----- cmake/Modules/Packages/GPU.cmake | 4 ++++ doc/src/pair_amoeba.rst | 3 +++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8184f9784d..a6956f5f5d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -527,11 +527,6 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE endif() endforeach() -# AMOEBA is not compatible with single precision FFTs -if(PKG_AMOEBA AND FFT_SINGLE) - message(FATAL_ERROR "Package AMOEBA is not compatible with single precision FFTs") -endif() - # optionally enable building script wrappers using swig option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF) if(WITH_SWIG) diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 89e15e548b..2c766a2540 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -31,6 +31,10 @@ endif() option(GPU_DEBUG "Enable debugging code of the GPU package" OFF) mark_as_advanced(GPU_DEBUG) +if(PKG_AMOEBA AND FFT_SINGLE) + message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu) diff --git a/doc/src/pair_amoeba.rst b/doc/src/pair_amoeba.rst index 79b3daf22f..6ef92a6938 100644 --- a/doc/src/pair_amoeba.rst +++ b/doc/src/pair_amoeba.rst @@ -206,6 +206,9 @@ These pair styles can only be used via the *pair* keyword of the when compiling the GPU package for OpenCL has a few known issues when running on integrated GPUs and the calculation may crash. + The GPU accelerated pair styles are also not (yet) compatible + with single precision FFTs. 
+ ---------- Restrictions From c744be70602631afe1d66aa7876823504d207a2b Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 12:51:52 -0500 Subject: [PATCH 173/181] forcibly disable COMPRESS package is zlib is not found --- cmake/Modules/Packages/COMPRESS.cmake | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/COMPRESS.cmake b/cmake/Modules/Packages/COMPRESS.cmake index bdcf1aa3f8..4e1ab846a7 100644 --- a/cmake/Modules/Packages/COMPRESS.cmake +++ b/cmake/Modules/Packages/COMPRESS.cmake @@ -1,4 +1,9 @@ -find_package(ZLIB REQUIRED) +find_package(ZLIB) +if(NOT ZLIB_FOUND) + message(WARNING "No Zlib development support found. Disabling COMPRESS package...") + set(PKG_COMPRESS OFF CACHE BOOL "" FORCE) + return() +endif() target_link_libraries(lammps PRIVATE ZLIB::ZLIB) find_package(PkgConfig QUIET) From 4c996eed3beb9f970231c9fcbb99e096f93bfd44 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Jan 2023 23:22:55 -0500 Subject: [PATCH 174/181] auto-enabling prerequisite packages with CMake --- cmake/Modules/LAMMPSUtils.cmake | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/LAMMPSUtils.cmake b/cmake/Modules/LAMMPSUtils.cmake index d42f91f10e..9b42dafc44 100644 --- a/cmake/Modules/LAMMPSUtils.cmake +++ b/cmake/Modules/LAMMPSUtils.cmake @@ -99,8 +99,15 @@ function(check_for_autogen_files source_dir) endfunction() macro(pkg_depends PKG1 PKG2) - if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2})) - message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with the ${PKG2} package") + if(DEFINED BUILD_${PKG2}) + if(PKG_${PKG1} AND NOT BUILD_${PKG2}) + message(FATAL_ERROR "The ${PKG1} package needs LAMMPS to be built with -D BUILD_${PKG2}=ON") + endif() + elseif(DEFINED PKG_${PKG2}) + if(PKG_${PKG1} AND NOT PKG_${PKG2}) + message(WARNING "The ${PKG1} package depends on the ${PKG2} package. 
Enabling it.") + set(PKG_${PKG2} ON CACHE BOOL "" FORCE) + endif() endif() endmacro() From b206b4d1f63d724fb6b5151e8d70cc938dfe81fb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 24 Jan 2023 23:55:30 -0600 Subject: [PATCH 175/181] Fixed bugs with hippo/gpu for single- and mixed- precisions --- src/GPU/pair_hippo_gpu.cpp | 121 ++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 35 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 83c72d5252..0538096cc8 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -849,24 +849,44 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; + if (tq_single) { + auto field_ptr = (float *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; - } + auto fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } else { + + auto field_ptr = (double *)fieldp_pinned; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + auto fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + } } /* ---------------------------------------------------------------------- @@ -1246,30 +1266,61 @@ void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi1[i][m] = _fdip_phi1_ptr[n]; - n += nlocal; + if (tq_single) { + auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } } - } - double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 10; m++) { - fdip_phi2[i][m] = _fdip_phi2_ptr[n]; - n += nlocal; + auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } } - } - double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; - for (int i = 0; i < nlocal; i++) { - int n = i; - for (int m = 0; m < 20; m++) { - fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; - n += nlocal; + auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = 
_fdip_sum_phi_ptr[n]; + n += nlocal; + } + } + + } else { + + auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi1[i][m] = _fdip_phi1_ptr[n]; + n += nlocal; + } + } + + auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 10; m++) { + fdip_phi2[i][m] = _fdip_phi2_ptr[n]; + n += nlocal; + } + } + + auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; + for (int i = 0; i < nlocal; i++) { + int n = i; + for (int m = 0; m < 20; m++) { + fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; + n += nlocal; + } } } } From adf43d7feefb5a65b6c3d0ddef66190e28c42cc8 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 00:02:25 -0600 Subject: [PATCH 176/181] Fixed the issues with some OpenCL implementation to avoid errors casting changing the pointer address spaces --- lib/gpu/lal_amoeba.cu | 51 ++++++++-------- lib/gpu/lal_atom.cpp | 2 +- lib/gpu/lal_atom.h | 2 +- lib/gpu/lal_base_amoeba.cpp | 49 +++++++++------- lib/gpu/lal_hippo.cu | 114 +++++++++++++++++------------------- 5 files changed, 106 insertions(+), 112 deletions(-) diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu index 6317ba8d94..f572d3ebd0 100644 --- a/lib/gpu/lal_amoeba.cu +++ b/lib/gpu/lal_amoeba.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -442,10 +442,10 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); const numtyp4 pol1j = polar1[j]; numtyp ck = pol1j.x; // rpole[j][0]; @@ -583,12 +581,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, numtyp rr11 = (numtyp)9.0 * rr9 * r2inv; // calculate the real space Ewald error function terms - + numtyp ralpha = aewald * r; numtyp exp2a = ucl_exp(-ralpha*ralpha); numtyp bn[6]; bn[0] = ucl_erfc(ralpha) * rinv; - + numtyp alsq2 = (numtyp)2.0 * aewald*aewald; numtyp alsq2n = (numtyp)0.0; if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); @@ -691,7 +689,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -707,14 +705,14 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_ufld(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - 
numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - numtyp r = ucl_sqrt(r2); numtyp rinv = ucl_rsqrt(r2); numtyp r2inv = rinv*rinv; @@ -1049,7 +1046,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff, const __global numtyp4 *restrict sp_amoeba, const __global int *dev_nbor, @@ -1068,7 +1065,6 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); int n_stride; - local_allocate_store_ufld(); local_allocate_store_charge(); acctyp4 f; @@ -1086,11 +1082,12 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) dufld[l]=(acctyp)0; numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar4 = (numtyp4*)(&extra[12*nall]); - numtyp4* polar5 = (numtyp4*)(&extra[16*nall]); + + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar4 = &extra[3*nall]; + const __global numtyp4* polar5 = &extra[4*nall]; if (ii0) - bytes+=_extra_fields*sizeof(numtyp); + bytes+=_extra_fields*sizeof(numtyp4); return bytes; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index 4b29d76cb1..771c2a3571 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -516,7 +516,7 @@ class Atom { /// Velocities UCL_Vector v; /// Extras - UCL_Vector extra; + UCL_Vector extra; #ifdef GPU_CAST UCL_Vector x_cast; diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index e80fa01c2b..09d7386461 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -90,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, bool vel = false; _extra_fields = 24; // round up to accomodate quadruples of numtyp values // rpole 13; uind 3; uinp 3; amtype, amgroup; pval - int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields); + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); if (success!=0) return success; @@ -820,35 +820,35 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, atom->extra_data_unavail(); int _nall=atom->nall(); - numtyp *pextra=reinterpret_cast(&(atom->extra[0])); + numtyp4 *pextra=reinterpret_cast(&(atom->extra[0])); int n = 0; - int nstride = 4; + int nstride = 1; //4; if (rpole) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][0]; - pextra[idx+1] = rpole[i][1]; - pextra[idx+2] = rpole[i][2]; - pextra[idx+3] = rpole[i][3]; + pextra[idx].x = rpole[i][0]; + pextra[idx].y = rpole[i][1]; + pextra[idx].z = rpole[i][2]; + pextra[idx].w = rpole[i][3]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][4]; - pextra[idx+1] = rpole[i][5]; - pextra[idx+2] = rpole[i][6]; - pextra[idx+3] = rpole[i][8]; + pextra[idx].x = rpole[i][4]; + pextra[idx].y = rpole[i][5]; + pextra[idx].z = rpole[i][6]; + pextra[idx].w = 
rpole[i][8]; } n += nstride*_nall; for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = rpole[i][9]; - pextra[idx+1] = rpole[i][12]; - pextra[idx+2] = (numtyp)amtype[i]; - pextra[idx+3] = (numtyp)amgroup[i]; + pextra[idx].x = rpole[i][9]; + pextra[idx].y = rpole[i][12]; + pextra[idx].z = (numtyp)amtype[i]; + pextra[idx].w = (numtyp)amgroup[i]; } } else { n += 2*nstride*_nall; @@ -858,9 +858,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uind) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uind[i][0]; - pextra[idx+1] = uind[i][1]; - pextra[idx+2] = uind[i][2]; + pextra[idx].x = uind[i][0]; + pextra[idx].y = uind[i][1]; + pextra[idx].z = uind[i][2]; + pextra[idx].w = 0; } } @@ -868,9 +869,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (uinp) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = uinp[i][0]; - pextra[idx+1] = uinp[i][1]; - pextra[idx+2] = uinp[i][2]; + pextra[idx].x = uinp[i][0]; + pextra[idx].y = uinp[i][1]; + pextra[idx].z = uinp[i][2]; + pextra[idx].w = 0; } } @@ -878,7 +880,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole, if (pval) { for (int i = 0; i < _nall; i++) { int idx = n+i*nstride; - pextra[idx] = pval[i]; + pextra[idx].x = pval[i]; + pextra[idx].y = 0; + pextra[idx].z = 0; + pextra[idx].w = 0; } } } diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu index 1611e8aece..99e20db223 100644 --- a/lib/gpu/lal_hippo.cu +++ b/lib/gpu/lal_hippo.cu @@ -410,7 +410,7 @@ _texture( q_tex,int2); ------------------------------------------------------------------------- */ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_rep, const __global numtyp4 *restrict sp_nonpolar, const __global int *dev_nbor, @@ -444,9 +444,9 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, acctyp4 tq; tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; if (iioff2) continue; - const numtyp4 pol1j = polar1[j]; //numtyp ck = pol1j.x; // rpole[j][0]; numtyp dkx = pol1j.y; // rpole[j][1]; @@ -712,7 +710,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_nonpolar, @@ -741,7 +739,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_, for (int l=0; l<6; l++) virial[l]=(acctyp)0; } - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); + const __global numtyp4* polar3 = &extra[2*nall]; if (ii (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald); + int m; for (m = 1; m < 6; m++) { numtyp bfac = (numtyp) (m+m-1); alsq2n = alsq2 * alsq2n; @@ -1208,32 +1203,32 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_, ------------------------------------------------------------------------- */ 
__kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_, - const __global numtyp *restrict extra, + const __global numtyp4 *restrict extra, const __global numtyp4 *restrict coeff_amtype, const __global numtyp4 *restrict coeff_amclass, const __global numtyp4 *restrict sp_polar, const __global int *dev_nbor, - const __global int *dev_packed, - const __global int *dev_short_nbor, - __global acctyp4 *restrict fieldp, - const int inum, const int nall, - const int nbor_pitch, const int t_per_atom, - const numtyp aewald, const numtyp off2, - const numtyp polar_dscale, const numtyp polar_uscale) + const __global int *dev_packed, + const __global int *dev_short_nbor, + __global acctyp4 *restrict fieldp, + const int inum, const int nall, + const int nbor_pitch, const int t_per_atom, + const numtyp aewald, const numtyp off2, + const numtyp polar_dscale, const numtyp polar_uscale) { int tid, ii, offset, i; atom_info(t_per_atom,ii,tid,offset); int n_stride; - //local_allocate_store_charge(); + local_allocate_store_charge(); acctyp _fieldp[6]; for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0; - numtyp4* polar1 = (numtyp4*)(&extra[0]); - numtyp4* polar2 = (numtyp4*)(&extra[4*nall]); - numtyp4* polar3 = (numtyp4*)(&extra[8*nall]); - numtyp4* polar6 = (numtyp4*)(&extra[20*nall]); + const __global numtyp4* polar1 = &extra[0]; + const __global numtyp4* polar2 = &extra[nall]; + const __global numtyp4* polar3 = &extra[2*nall]; + const __global numtyp4* polar6 = &extra[5*nall]; if (ii Date: Wed, 25 Jan 2023 02:35:10 -0500 Subject: [PATCH 177/181] fix segfault from accessing float array as double. use introspection to detect --- src/GPU/pair_hippo_gpu.cpp | 54 +++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 0538096cc8..a12a7e1907 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -886,7 +887,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) fieldp[i][1] += fieldp_ptr[idx+1]; fieldp[i][2] += fieldp_ptr[idx+2]; } - } + } } /* ---------------------------------------------------------------------- @@ -1077,24 +1078,45 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (umutual1 and self) hippo_gpu_update_fieldp(&fieldp_pinned); - int inum = atom->nlocal; - double *field_ptr = (double *)fieldp_pinned; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - field[i][0] += field_ptr[idx]; - field[i][1] += field_ptr[idx+1]; - field[i][2] += field_ptr[idx+2]; - } + if (Info::has_accelerator_feature("GPU", "precision", "single")) { + float *field_ptr = (float *)fieldp_pinned; - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; - for (int i = 0; i < nlocal; i++) { - int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + float* fieldp_ptr = (float *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } + + } 
else { + double *field_ptr = (double *)fieldp_pinned; + + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + field[i][0] += field_ptr[idx]; + field[i][1] += field_ptr[idx+1]; + field[i][2] += field_ptr[idx+2]; + } + + double* fieldp_ptr = (double *)fieldp_pinned; + fieldp_ptr += 4*inum; + for (int i = 0; i < nlocal; i++) { + int idx = 4*i; + fieldp[i][0] += fieldp_ptr[idx]; + fieldp[i][1] += fieldp_ptr[idx+1]; + fieldp[i][2] += fieldp_ptr[idx+2]; + } } // accumulate timing information From e068b14969a76478058c5ed24e0ab91ad903fd4d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Jan 2023 02:56:05 -0500 Subject: [PATCH 178/181] make consistent and simplify --- src/GPU/pair_amoeba_gpu.cpp | 46 +++++++++++++-------------- src/GPU/pair_hippo_gpu.cpp | 63 ++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 4213946f38..941050cf04 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -203,7 +203,7 @@ void PairAmoebaGPU::init_style() if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - tq_single = tq_size != sizeof(double); + tq_single = (tq_size == sizeof(float)); // replace with the gpu counterpart @@ -285,10 +285,10 @@ void PairAmoebaGPU::multipole_real() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } } @@ -742,13 +742,12 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; @@ -760,13 +759,12 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -975,13 +973,12 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; @@ -993,13 +990,12 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - 
fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -1301,10 +1297,10 @@ void PairAmoebaGPU::polar_real() // reference to the tep array from GPU lib if (tq_single) { - float *tep_ptr = (float *)tq_pinned; + auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { - double *tep_ptr = (double *)tq_pinned; + auto *tep_ptr = (double *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } } diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index a12a7e1907..5956f1bc11 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,7 +26,6 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" -#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -219,13 +218,9 @@ void PairHippoGPU::init_style() screen, polar_dscale, polar_uscale, tq_size); GPU_EXTRA::check_flag(success,error,world); - if (gpu_mode == GPU_FORCE) - error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); - if (tq_size == sizeof(double)) - tq_single = false; - else - tq_single = true; + tq_single = (tq_size == sizeof(float)); // replace with the gpu counterpart @@ -302,10 +297,10 @@ void PairHippoGPU::repulsion() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } } @@ -402,10 +397,10 @@ void PairHippoGPU::multipole_real() // reference to the tep array from GPU lib if (tq_single) { - float *tq_ptr = (float *)tq_pinned; + auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { - double *tq_ptr = (double *)tq_pinned; + auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } } @@ -860,13 +855,12 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { @@ -879,13 +873,12 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - auto fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } } @@ -1080,8 +1073,8 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) hippo_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (Info::has_accelerator_feature("GPU", "precision", "single")) { - float *field_ptr = (float *)fieldp_pinned; + if (tq_single) { + auto *field_ptr = (float *)fieldp_pinned; for (int i 
= 0; i < nlocal; i++) { int idx = 4*i; @@ -1090,17 +1083,16 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - float* fieldp_ptr = (float *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } else { - double *field_ptr = (double *)fieldp_pinned; + auto *field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 4*i; @@ -1109,13 +1101,12 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) field[i][2] += field_ptr[idx+2]; } - double* fieldp_ptr = (double *)fieldp_pinned; - fieldp_ptr += 4*inum; + field_ptr += 4*inum; for (int i = 0; i < nlocal; i++) { int idx = 4*i; - fieldp[i][0] += fieldp_ptr[idx]; - fieldp[i][1] += fieldp_ptr[idx+1]; - fieldp[i][2] += fieldp_ptr[idx+2]; + fieldp[i][0] += field_ptr[idx]; + fieldp[i][1] += field_ptr[idx+1]; + fieldp[i][2] += field_ptr[idx+2]; } } @@ -1426,10 +1417,10 @@ void PairHippoGPU::polar_real() // reference to the tep array from GPU lib if (tq_single) { - float *tep_ptr = (float *)tq_pinned; + auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { - double *tep_ptr = (double *)tq_pinned; + auto *tep_ptr = (double *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } } From 722e583b591633736e2beaa3b4a29809d190efc8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Jan 2023 05:22:49 -0500 Subject: [PATCH 179/181] use available introspection API to get accumulator data type. update name of flag. 
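In outline, the pair styles now ask the GPU package itself which precision it was built with, instead of having lal_amoeba_ext.cpp / lal_hippo_ext.cpp report sizeof(ACC_PRECISION) back through an extra tep_size / tq_size argument. A condensed sketch of the resulting host-side pattern (names as they appear in this series; not a verbatim excerpt of the diff below):

    // query the build-time precision of the GPU package once, in init_style()
    acc_float = Info::has_accelerator_feature("GPU", "precision", "single");

    // later, read a pinned buffer back with the matching pointer type
    if (acc_float) {
      auto *tq_ptr = (float *) tq_pinned;    // single-precision builds accumulate in float
      compute_force_from_torque(tq_ptr, f, virpolar);
    } else {
      auto *tq_ptr = (double *) tq_pinned;   // mixed/double builds accumulate in double
      compute_force_from_torque(tq_ptr, f, virpolar);
    }

This keeps the host-side reads consistent with the segfault fix in PATCH 177/181, where a buffer written by a single-precision GPU build was being read through a double pointer.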
--- lib/gpu/lal_amoeba_ext.cpp | 5 +---- lib/gpu/lal_hippo_ext.cpp | 5 +---- src/GPU/pair_amoeba_gpu.cpp | 20 ++++++++++---------- src/GPU/pair_amoeba_gpu.h | 2 +- src/GPU/pair_hippo_gpu.cpp | 23 ++++++++++++----------- src/GPU/pair_hippo_gpu.h | 2 +- 6 files changed, 26 insertions(+), 31 deletions(-) diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index fe3d4a26d8..995dfbe95f 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -41,8 +41,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, - int& tep_size) { + const double polar_dscale, const double polar_uscale) { AMOEBAMF.clear(); gpu_mode=AMOEBAMF.device->gpu_mode(); double gpu_split=AMOEBAMF.device->particle_split(); @@ -52,8 +51,6 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas int gpu_rank=AMOEBAMF.device->gpu_rank(); int procs_per_gpu=AMOEBAMF.device->procs_per_gpu(); - tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); - AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu); bool message=false; diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp index b5ac42744a..0cb00387ca 100644 --- a/lib/gpu/lal_hippo_ext.cpp +++ b/lib/gpu/lal_hippo_ext.cpp @@ -42,8 +42,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, - int& tep_size) { + const double polar_dscale, const double polar_uscale) { HIPPOMF.clear(); gpu_mode=HIPPOMF.device->gpu_mode(); double gpu_split=HIPPOMF.device->particle_split(); @@ -53,8 +52,6 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass int gpu_rank=HIPPOMF.device->gpu_rank(); int procs_per_gpu=HIPPOMF.device->procs_per_gpu(); - tep_size=sizeof(ACC_PRECISION); // tep_size=sizeof(PRECISION); - HIPPOMF.device->init_message(screen,"HIPPO",first_gpu,last_gpu); bool message=false; diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp index 941050cf04..fd423486fd 100644 --- a/src/GPU/pair_amoeba_gpu.cpp +++ b/src/GPU/pair_amoeba_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -66,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, int& tq_size); + const double polar_dscale, const double polar_uscale); void amoeba_gpu_clear(); int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, @@ -188,7 +189,6 @@ void PairAmoebaGPU::init_style() maxspecial15=atom->maxspecial15; } - int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, pdamp, thole, dirdamp, amtype2class, special_hal, @@ -197,13 +197,13 @@ void PairAmoebaGPU::init_style() special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, 
maxspecial, maxspecial15, cell_size, gpu_mode, screen, - polar_dscale, polar_uscale, tq_size); + polar_dscale, polar_uscale); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); - tq_single = (tq_size == sizeof(float)); + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); // replace with the gpu counterpart @@ -284,7 +284,7 @@ void PairAmoebaGPU::multipole_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { @@ -732,7 +732,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -963,7 +963,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1161,7 +1161,7 @@ void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; @@ -1296,7 +1296,7 @@ void PairAmoebaGPU::polar_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { @@ -1492,7 +1492,7 @@ void PairAmoebaGPU::polar_kspace() } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); - if (tq_single) { + if (acc_float) { auto _fphi_ptr = (float *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h index c9b9b73a58..be53f7ef50 100644 --- a/src/GPU/pair_amoeba_gpu.h +++ b/src/GPU/pair_amoeba_gpu.h @@ -49,7 +49,7 @@ class PairAmoebaGPU : public PairAmoeba { double cpu_time; void *tq_pinned; void *fieldp_pinned; - bool tq_single; + bool acc_float; bool gpu_hal_ready; bool gpu_repulsion_ready; diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp index 5956f1bc11..9d286d5db7 100644 --- a/src/GPU/pair_hippo_gpu.cpp +++ b/src/GPU/pair_hippo_gpu.cpp @@ -26,6 +26,7 @@ #include "fix_store_peratom.h" #include "force.h" #include "gpu_extra.h" +#include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" @@ -67,7 +68,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, - const double polar_dscale, const double polar_uscale, int& tq_size); + const double polar_dscale, const double polar_uscale); void hippo_gpu_clear(); int** hippo_gpu_precompute(const int ago, const int inum_full, const int nall, @@ -205,7 +206,6 @@ void PairHippoGPU::init_style() maxspecial15=atom->maxspecial15; } - int tq_size; int mnf = 5e-2 * neighbor->oneatom; int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass, pdamp, thole, dirdamp, amtype2class, @@ -215,12 +215,13 @@ void PairHippoGPU::init_style() csix, adisp, pcore, palpha, 
atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, - screen, polar_dscale, polar_uscale, tq_size); + screen, polar_dscale, polar_uscale); GPU_EXTRA::check_flag(success,error,world); - if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); + if (gpu_mode == GPU_FORCE) + error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now"); - tq_single = (tq_size == sizeof(float)); + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); // replace with the gpu counterpart @@ -296,7 +297,7 @@ void PairHippoGPU::repulsion() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virrepulse); // frepulse } else { @@ -396,7 +397,7 @@ void PairHippoGPU::multipole_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { @@ -845,7 +846,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp) // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1073,7 +1074,7 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp) hippo_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; - if (tq_single) { + if (acc_float) { auto *field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { @@ -1279,7 +1280,7 @@ void PairHippoGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; - if (tq_single) { + if (acc_float) { auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; @@ -1416,7 +1417,7 @@ void PairHippoGPU::polar_real() // reference to the tep array from GPU lib - if (tq_single) { + if (acc_float) { auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h index 7955c97470..d160446d77 100644 --- a/src/GPU/pair_hippo_gpu.h +++ b/src/GPU/pair_hippo_gpu.h @@ -50,7 +50,7 @@ class PairHippoGPU : public PairAmoeba { double cpu_time; void *tq_pinned; void *fieldp_pinned; - bool tq_single; + bool acc_float; bool gpu_hal_ready; bool gpu_repulsion_ready; From 6fefd8821a96e0e24c6c05f7a82de37ea06b5222 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 10:42:55 -0600 Subject: [PATCH 180/181] Attempted to allow GPU acceleration on MacOS with neighbor builds on the device by enforcing the old neighbor list code path (will revisit) --- lib/gpu/lal_neighbor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h index 5b569f804a..482b93d9e5 100644 --- a/lib/gpu/lal_neighbor.h +++ b/lib/gpu/lal_neighbor.h @@ -33,7 +33,7 @@ #endif #endif -#if defined(USE_HIP) +#if defined(USE_HIP) || defined(__APPLE__) #define LAL_USE_OLD_NEIGHBOR #endif From 7e5e5c1b6f1704ac0834927cf20d938542ef7bdb Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Wed, 25 Jan 2023 13:30:29 -0600 Subject: [PATCH 181/181] Only added amoeba_convolution_gpu.* to the list of GPU source files when PKG_AMOEBA is on --- cmake/Modules/Packages/GPU.cmake | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git 
a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 2c766a2540..24d9538206 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -3,9 +3,7 @@ set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h ${GPU_SOURCES_DIR}/fix_gpu.cpp ${GPU_SOURCES_DIR}/fix_nh_gpu.h - ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp - ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h - ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") @@ -35,6 +33,12 @@ if(PKG_AMOEBA AND FFT_SINGLE) message(FATAL_ERROR "GPU acceleration of AMOEBA is not (yet) compatible with single precision FFT") endif() +if (PKG_AMOEBA) + list(APPEND GPU_SOURCES + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.h + ${GPU_SOURCES_DIR}/amoeba_convolution_gpu.cpp) +endif() + file(GLOB GPU_LIB_SOURCES ${CONFIGURE_DEPENDS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp) file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
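The read-back pattern that patches 177-179 converge on assumes a fixed layout of fieldp_pinned: the direct field occupies the first 4*inum entries and the "p" field the next 4*inum entries, four slots per atom (three of which are used), stored in the GPU accumulator type. The sketch below only restates that layout; add_fieldp is a hypothetical helper for illustration and is not part of the patches, while the surrounding names (acc_float, fieldp_pinned, field, fieldp) follow the series.

    // Hypothetical helper documenting the pinned-buffer layout:
    // field[] occupies entries [0, 4*inum), fieldp[] occupies [4*inum, 8*inum).
    template <class T>
    static void add_fieldp(const T *ptr, double **field, double **fieldp,
                           int nlocal, int inum)
    {
      for (int i = 0; i < nlocal; i++)
        for (int k = 0; k < 3; k++) field[i][k] += ptr[4*i + k];
      ptr += 4*inum;                          // advance to the fieldp section
      for (int i = 0; i < nlocal; i++)
        for (int k = 0; k < 3; k++) fieldp[i][k] += ptr[4*i + k];
    }

    // usage mirroring ufield0c()/udirect2b() after this series:
    //   if (acc_float) add_fieldp((const float *)  fieldp_pinned, field, fieldp, nlocal, inum);
    //   else           add_fieldp((const double *) fieldp_pinned, field, fieldp, nlocal, inum);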