diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index a9e02ee7b4..8d9af4706e 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -62,7 +62,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                             cell_size,gpu_split,_screen,amoeba,
-                            "k_amoeba_dispersion", "k_amoeba_multipole",
+                            "k_amoeba_multipole",
                             "k_amoeba_udirect2b", "k_amoeba_umutual2b",
                             "k_amoeba_polar", "k_amoeba_short_nbor");
   if (success!=0)
@@ -149,47 +149,6 @@ double AmoebaT::host_memory_usage() const {
   return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
 }
 
-// ---------------------------------------------------------------------------
-// Calculate the dispersion real-space term, returning tep
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int AmoebaT::dispersion_real(const int eflag, const int vflag) {
-  int ainum=this->ans->inum();
-  if (ainum == 0)
-    return 0;
-
-  int _nall=this->atom->nall();
-  int nbor_pitch=this->nbor->nbor_pitch();
-
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
-  this->time_pair.start();
-
-  // Build the short neighbor list for the cutoff off2_mpole,
-  //   at this point mpole is the first kernel in a time step
-  
-  this->k_short_nbor.set_size(GX,BX);
-  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
-                         &this->_nbor_data->begin(),
-                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
-                         &nbor_pitch, &this->_threads_per_atom);
-  printf("launching dispersion\n");
-  this->k_dispersion.set_size(GX,BX);
-  this->k_dispersion.run(&this->atom->x, &this->atom->extra,
-                         &coeff_amtype, &coeff_amclass, &sp_nonpolar,
-                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                         &this->dev_short_nbor,
-                         &this->ans->force, &this->ans->engv,
-                         &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                         &this->_threads_per_atom,  &this->_aewald,
-                         &this->_off2_disp);
-  this->time_pair.stop();
-
-  return GX;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate the multipole real-space term, returning tep
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h
index df556a1018..04eb6e4aa9 100644
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@@ -93,7 +93,6 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 
  protected:
   bool _allocated;
-  int dispersion_real(const int eflag, const int vflag);
   int multipole_real(const int eflag, const int vflag);
   int udirect2b(const int eflag, const int vflag);
   int umutual2b(const int eflag, const int vflag);
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 309830e1ce..565f16b627 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -116,7 +116,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
 void amoeba_gpu_clear() {
   AMOEBAMF.clear();
 }
-
+/*
 int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full,
                            const int nall, double **host_x, int *host_type,
                            int *host_amtype, int *host_amgroup, double **host_rpole,
@@ -133,7 +133,7 @@ int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full,
                           eflag, vflag, eatom, vatom, host_start, ilist, jnum,
                           cpu_time, success, aewald, off2, host_q, boxlo, prd);
 }
-
+*/
 int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
                            const int nall, double **host_x, int *host_type,
                            int *host_amtype, int *host_amgroup, double **host_rpole,
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index f252131ea7..b8e927d6ce 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -33,7 +33,6 @@ template <class numtyp, class acctyp>
 BaseAmoebaT::~BaseAmoeba() {
   delete ans;
   delete nbor;
-  k_dispersion.clear();
   k_multipole.clear();
   k_udirect2b.clear();
   k_umutual2b.clear();
@@ -55,7 +54,6 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              const int maxspecial15,
                              const double cell_size, const double gpu_split,
                              FILE *_screen, const void *pair_program,
-                             const char *k_name_dispersion,
                              const char *k_name_multipole,
                              const char *k_name_udirect2b,
                              const char *k_name_umutual2b,
@@ -92,7 +90,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
 
   _block_size=device->pair_block_size();
   _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name_dispersion,k_name_multipole,
+  compile_kernels(*ucl_device,pair_program,k_name_multipole,
                   k_name_udirect2b, k_name_umutual2b,k_name_polar,k_name_short_nbor);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
@@ -428,73 +426,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
   return nbor->host_jlist.begin()-host_start;
 }
 
-// ---------------------------------------------------------------------------
-// Reneighbor on GPU if necessary, and then compute dispersion real-space
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int** BaseAmoebaT::compute_dispersion_real(const int ago, const int inum_full,
-                                           const int nall, double **host_x,
-                                           int *host_type, int *host_amtype,
-                                           int *host_amgroup, double **host_rpole,
-                                           double *sublo, double *subhi, tagint *tag,
-                                           int **nspecial, tagint **special,
-                                           int *nspecial15, tagint **special15,
-                                           const bool eflag_in, const bool vflag_in,
-                                           const bool eatom, const bool vatom,
-                                           int &host_start, int **ilist, int **jnum,
-                                           const double cpu_time, bool &success,
-                                           const double aewald, const double off2_disp,
-                                           double *host_q, double *boxlo, double *prd) {
-  acc_timers();
-  int eflag, vflag;
-  if (eatom) eflag=2;
-  else if (eflag_in) eflag=1;
-  else eflag=0;
-  if (vatom) vflag=2;
-  else if (vflag_in) vflag=1;
-  else vflag=0;
-
-  #ifdef LAL_NO_BLOCK_REDUCE
-  if (eflag) eflag=2;
-  if (vflag) vflag=2;
-  #endif
-
-  set_kernel(eflag,vflag);
-
-  // reallocate per-atom arrays, transfer data from the host
-  //   and build the neighbor lists if needed
-  // NOTE: 
-  //   For now we invoke precompute() again here,
-  //     to be able to turn on/off the udirect2b kernel (which comes before this)
-  //   Once all the kernels are ready, precompute() is needed only once
-  //     in the first kernel in a time step.
-  //   We only need to cast uind and uinp from host to device here
-  //     if the neighbor lists are rebuilt and other per-atom arrays
-  //     (x, type, amtype, amgroup, rpole) are ready on the device.
-
-  int** firstneigh = nullptr;
-  firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
-                          host_amtype, host_amgroup, host_rpole,
-                          nullptr, nullptr, sublo, subhi, tag,
-                          nspecial, special, nspecial15, special15,
-                          eflag_in, vflag_in, eatom, vatom,
-                          host_start, ilist, jnum, cpu_time,
-                          success, host_q, boxlo, prd);
-
-  _off2_disp = off2_disp;
-  _aewald = aewald;
-  const int red_blocks=dispersion_real(eflag,vflag);
-
-  // leave the answers (forces, energies and virial) on the device,
-  //   only copy them back in the last kernel (polar_real)
-  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
-  //device->add_ans_object(ans);
-
-  hd_balancer.stop_timer();
-
-  return firstneigh; // nbor->host_jlist.begin()-host_start;
-}
-
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
@@ -885,7 +816,6 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
 
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                  const char *kname_dispersion,
                                   const char *kname_multipole,
                                   const char *kname_udirect2b,
                                   const char *kname_umutual2b,
@@ -899,7 +829,6 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   std::string oclstring = device->compile_string()+" -DEVFLAG=1";
   pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
 
-  k_dispersion.set_function(*pair_program,kname_dispersion);
   k_multipole.set_function(*pair_program,kname_multipole);
   k_udirect2b.set_function(*pair_program,kname_udirect2b);
   k_umutual2b.set_function(*pair_program,kname_umutual2b);
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index fcff3186c7..40da00f176 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -54,7 +54,7 @@ class BaseAmoeba {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15, const double cell_size,
                   const double gpu_split, FILE *screen, const void *pair_program,
-                  const char *kname_dispersion, const char *kname_multipole,
+                  const char *kname_multipole,
                   const char *kname_udirect2b, const char *kname_umutual2b,
                   const char *kname_polar, const char *kname_short_nbor);
 
@@ -142,18 +142,6 @@ class BaseAmoeba {
                 int **&ilist, int **&numj, const double cpu_time, bool &success,
                 double *charge, double *boxlo, double *prd);
 
-  /// Compute dispersion real-space with device neighboring
-  int** compute_dispersion_real(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, int *host_amtype,
-                int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
-                tagint *tag, int **nspecial, tagint **special,
-                int *nspecial15, tagint **special15,
-                const bool eflag, const bool vflag,
-                const bool eatom, const bool vatom, int &host_start,
-                int **ilist, int **numj, const double cpu_time, bool &success,
-                const double aewald, const double off2_disp, double *charge,
-                double *boxlo, double *prd);
-
   /// Compute multipole real-space with device neighboring
   int** compute_multipole_real(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, int *host_amtype,
@@ -269,7 +257,7 @@ class BaseAmoeba {
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
-  UCL_Kernel k_dispersion, k_multipole, k_udirect2b, k_umutual2b, k_polar;
+  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
   UCL_Kernel k_special15, k_short_nbor;
   inline int block_size() { return _block_size; }
   inline void set_kernel(const int eflag, const int vflag) {}
@@ -291,11 +279,10 @@ class BaseAmoeba {
   numtyp _off2_hal,_off2_repulse,_off2_disp,_off2_mpole,_off2_polar;
 
   void compile_kernels(UCL_Device &dev, const void *pair_string,
-     const char *kname_dispersion, const char *kname_multipole,
+     const char *kname_multipole,
      const char *kname_udirect2b, const char *kname_umutual2b,
      const char *kname_polar, const char *kname_short_nbor);
 
-  virtual int dispersion_real(const int eflag, const int vflag) = 0;
   virtual int multipole_real(const int eflag, const int vflag) = 0;
   virtual int udirect2b(const int eflag, const int vflag) = 0;
   virtual int umutual2b(const int eflag, const int vflag) = 0;
diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp
new file mode 100644
index 0000000000..7fa358e35a
--- /dev/null
+++ b/lib/gpu/lal_hippo.cpp
@@ -0,0 +1,430 @@
+/***************************************************************************
+                                 hippo.cpp
+                             -------------------
+                          Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the hippo pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "hippo_cl.h"
+#elif defined(USE_CUDART)
+const char *hippo=0;
+#else
+#include "hippo_cubin.h"
+#endif
+
+#include "lal_hippo.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define HippoT Hippo<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+HippoT::Hippo() : BaseAmoeba<numtyp,acctyp>(),
+  _allocated(false) {  // coefficient buffers are only allocated by init()
+}
+
+template <class numtyp, class acctyp>
+HippoT::~Hippo() {
+  clear();               // returns immediately unless init() set _allocated
+  k_dispersion.clear();  // kernel bound to "k_hippo_dispersion" in init()
+}
+
+template <class numtyp, class acctyp>
+int HippoT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);  // plain forward, no Hippo extras
+}
+
+template <class numtyp, class acctyp>
+int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
+                  const double *host_pdamp, const double *host_thole,
+                  const double *host_dirdamp, const int *host_amtype2class,
+                  const double *host_special_hal,
+                  const double *host_special_repel,
+                  const double *host_special_disp,
+                  const double *host_special_mpole,
+                  const double *host_special_polar_wscale,
+                  const double *host_special_polar_piscale,
+                  const double *host_special_polar_pscale,
+                  const double *host_csix, const double *host_adisp,
+                  const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const int maxspecial15,
+                  const double cell_size, const double gpu_split, FILE *_screen,
+                  const double polar_dscale, const double polar_uscale) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
+                            cell_size,gpu_split,_screen,hippo,
+                            "k_hippo_multipole",
+                            "k_hippo_udirect2b", "k_hippo_umutual2b",
+                            "k_hippo_polar", "k_hippo_short_nbor");
+  if (success!=0)
+    return success;  // propagate the init_atomic() status unchanged
+  // The dispersion kernel is bound here, not through init_atomic()
+  k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
+
+  // If atom type constants fit in shared memory, flag it and pad _lj_types
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Pack the per-amtype coefficients into a host staging buffer:
+  //   (pdamp, thole, dirdamp, amtype2class) -> (.x, .y, .z, .w)
+  UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
+  for (int i = 0; i < max_amtype; i++) {
+    host_write[i].x = host_pdamp[i];
+    host_write[i].y = host_thole[i];
+    host_write[i].z = host_dirdamp[i];
+    host_write[i].w = host_amtype2class[i];  // int class index stored as numtyp
+  }
+
+  coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(coeff_amtype,host_write,false);
+  // Per-amclass coefficients: (csix, adisp, 0, 0)
+  UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
+  for (int i = 0; i < max_amclass; i++) {
+    host_write2[i].x = host_csix[i];
+    host_write2[i].y = host_adisp[i];
+    host_write2[i].z = (numtyp)0;  // .z and .w are zero-filled
+    host_write2[i].w = (numtyp)0;
+  }
+
+  coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(coeff_amclass,host_write2,false);
+  // host_special_* factors: 5 entries each, staged through dview for both tables
+  UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
+  sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<5; i++) {
+    dview[i].x=host_special_polar_wscale[i];
+    dview[i].y=host_special_polar_piscale[i];
+    dview[i].z=host_special_polar_pscale[i];
+    dview[i].w=host_special_mpole[i];  // mpole factor shares the sp_polar table
+  }
+  ucl_copy(sp_polar,dview,5,false);
+
+  sp_nonpolar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<5; i++) {
+    dview[i].x=host_special_hal[i];
+    dview[i].y=host_special_repel[i];
+    dview[i].z=host_special_disp[i];
+    dview[i].w=(numtyp)0;
+  }
+  ucl_copy(sp_nonpolar,dview,5,false);
+
+  _polar_dscale = polar_dscale;
+  _polar_uscale = polar_uscale;
+
+  _allocated=true;  // from here on clear() has buffers to release
+  this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
+    + sp_polar.row_bytes() + sp_nonpolar.row_bytes() + this->_tep.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void HippoT::clear() {
+  if (!_allocated)
+    return;  // init() never completed, or clear() already ran
+  _allocated=false;
+
+  coeff_amtype.clear();
+  coeff_amclass.clear();
+  sp_polar.clear();
+  sp_nonpolar.clear();
+  // presumably releases what init_atomic() set up -- confirm in the base class
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double HippoT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(Hippo<numtyp,acctyp>);  // + this object
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on the GPU if necessary, then run the dispersion real-space kernel
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+int** HippoT::compute_dispersion_real(const int ago, const int inum_full,
+                                           const int nall, double **host_x,
+                                           int *host_type, int *host_amtype,
+                                           int *host_amgroup, double **host_rpole,
+                                           double *sublo, double *subhi, tagint *tag,
+                                           int **nspecial, tagint **special,
+                                           int *nspecial15, tagint **special15,
+                                           const bool eflag_in, const bool vflag_in,
+                                           const bool eatom, const bool vatom,
+                                           int &host_start, int **ilist, int **jnum,
+                                           const double cpu_time, bool &success,
+                                           const double aewald, const double off2_disp,
+                                           double *host_q, double *boxlo, double *prd) {
+  this->acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;          // 2: per-atom energy requested
+  else if (eflag_in) eflag=1;  // 1: energy requested, not per-atom
+  else eflag=0;
+  if (vatom) vflag=2;          // same 0/1/2 encoding for the virial
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  this->set_kernel(eflag,vflag);
+
+  // reallocate per-atom arrays, transfer data from the host
+  //   and build the neighbor lists if needed
+  // NOTE:
+  //   For now precompute() is invoked again here, so that this kernel can be
+  //     switched on/off independently of the others.
+  //   NOTE(review): the original remark named udirect2b as a kernel that
+  //     "comes before this"; that looks copied from another compute routine
+  //     -- confirm the actual call order for hippo.
+  //   The two nullptr arguments are presumably the induced dipoles (uind, uinp),
+  //     which are not passed in here -- verify against precompute().
+
+  int** firstneigh = nullptr;
+  firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type,
+                          host_amtype, host_amgroup, host_rpole,
+                          nullptr, nullptr, sublo, subhi, tag,
+                          nspecial, special, nspecial15, special15,
+                          eflag_in, vflag_in, eatom, vatom,
+                          host_start, ilist, jnum, cpu_time,
+                          success, host_q, boxlo, prd);
+
+  this->_off2_disp = off2_disp;  // read by dispersion_real() for the launch
+  this->_aewald = aewald;
+  const int red_blocks=dispersion_real(eflag,vflag);  // only used by disabled code below
+
+  // leave the answers (forces, energies and virial) on the device,
+  //   only copy them back in the last kernel (polar_real)
+  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  //device->add_ans_object(ans);
+
+  this->hd_balancer.stop_timer();
+
+  return firstneigh; // nbor->host_jlist.begin()-host_start;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the dispersion real-space term; returns the grid size (0 if idle)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int HippoT::dispersion_real(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the short neighbor list for the dispersion cutoff _off2_disp.
+  //   dev_short_nbor is also the buffer used by the polar kernels, so a cached
+  //   polar list is marked stale below to force a rebuild with _off2_polar.
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                         &this->_nbor_data->begin(),
+                         &this->dev_short_nbor, &this->_off2_disp, &ainum,
+                         &nbor_pitch, &this->_threads_per_atom);
+  this->short_nbor_polar_avail = false;
+  k_dispersion.set_size(GX,BX);
+  k_dispersion.run(&this->atom->x, &this->atom->extra,
+                         &coeff_amtype, &coeff_amclass, &sp_nonpolar,
+                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                         &this->dev_short_nbor,
+                         &this->ans->force, &this->ans->engv,
+                         &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                         &this->_threads_per_atom,  &this->_aewald,
+                         &this->_off2_disp);
+  this->time_pair.stop();
+
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the multipole real-space term (kernel is passed _tep); returns GX
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int HippoT::multipole_real(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the short neighbor list for the multipole cutoff _off2_mpole.
+  //   dev_short_nbor is also the buffer used by the polar kernels, so a cached
+  //   polar list is marked stale below to force a rebuild with _off2_polar.
+  this->k_short_nbor.set_size(GX,BX);
+  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                         &this->_nbor_data->begin(),
+                         &this->dev_short_nbor, &this->_off2_mpole, &ainum,
+                         &nbor_pitch, &this->_threads_per_atom);
+  this->short_nbor_polar_avail = false;
+  this->k_multipole.set_size(GX,BX);
+  this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->dev_short_nbor,
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,  &this->_aewald, &this->_felec,
+                    &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
+  this->time_pair.stop();
+
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the real-space permanent field (kernel is passed _fieldp)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int HippoT::udirect2b(const int eflag, const int vflag) {
+  int ainum=this->ans->inum(); 
+  if (ainum == 0)
+    return 0;  // nothing to launch; the caller gets a grid size of 0
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the _off2_polar short neighbor list unless the flag says it exists
+  if (!this->short_nbor_polar_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                           &this->_nbor_data->begin(),
+                           &this->dev_short_nbor, &this->_off2_polar, &ainum,
+                           &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_polar_avail = true;
+  }
+  // eflag/vflag are not forwarded to this kernel
+  this->k_udirect2b.set_size(GX,BX);
+  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->dev_short_nbor,
+                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
+                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
+                        &_polar_dscale, &_polar_uscale);
+
+  this->time_pair.stop();
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the real-space induced field (kernel is passed _fieldp)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int HippoT::umutual2b(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;  // nothing to launch; the caller gets a grid size of 0
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the _off2_polar short neighbor list unless the flag says it exists
+  if (!this->short_nbor_polar_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                           &this->_nbor_data->begin(), &this->dev_short_nbor,
+                           &this->_off2_polar, &ainum, &nbor_pitch,
+                           &this->_threads_per_atom);
+    this->short_nbor_polar_avail = true;
+  }
+  // eflag/vflag are not forwarded to this kernel
+  this->k_umutual2b.set_size(GX,BX);
+  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
+                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
+                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);
+
+  this->time_pair.stop();
+  return GX;
+}
+
+// ---------------------------------------------------------------------------
+// Calculate the polar real-space term (kernel is passed _tep); returns GX
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int HippoT::polar_real(const int eflag, const int vflag) {
+  int ainum=this->ans->inum();
+  if (ainum == 0)
+    return 0;  // note: this early exit also skips the flag reset at the end
+
+  int _nall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+  this->time_pair.start();
+
+  // Build the _off2_polar short neighbor list unless the flag says it exists
+  if (!this->short_nbor_polar_avail) {
+    this->k_short_nbor.set_size(GX,BX);
+    this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->dev_short_nbor, &this->_off2_polar, &ainum,
+                          &nbor_pitch, &this->_threads_per_atom);
+    this->short_nbor_polar_avail = true;
+  }
+  // Unlike udirect2b/umutual2b, this launch forwards eflag/vflag and the answers
+  this->k_polar.set_size(GX,BX);
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->dev_short_nbor,
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,  &this->_aewald, &this->_felec,
+                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
+  this->time_pair.stop();
+
+  // Mark the polar short neighbor list as unavailable for the next time step;
+  //   done here because polar_real() is the last kernel in a time step for now
+
+  this->short_nbor_polar_avail = false;
+
+  return GX;
+}
+
+template class Hippo<PRECISION,ACC_PRECISION>;
+}
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
new file mode 100644
index 0000000000..a21afe6cd8
--- /dev/null
+++ b/lib/gpu/lal_hippo.cu
@@ -0,0 +1,1892 @@
+// **************************************************************************
+//                                   hippo.cu
+//                             -------------------
+//                          Trung Dac Nguyen (Northwestern)
+//
+//  Device code for acceleration of the hippo pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : trung.nguyen@northwestern.edu
+// ***************************************************************************
+
+// Build-dependent setup: pick the integer type behind tagint and define how
+//   the position (pos_tex) and charge (q_tex) data are referenced.
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include <stdio.h>
+#include "lal_aux_fun1.h"
+// tagint follows the LAMMPS integer-size setting (int64_t only for BIGBIG)
+#ifdef LAMMPS_SMALLBIG
+#define tagint int
+#endif
+#ifdef LAMMPS_BIGBIG
+#include "inttypes.h"
+#define tagint int64_t
+#endif
+#ifdef LAMMPS_SMALLSMALL
+#define tagint int
+#endif
+// Texture declarations; the element types differ when _DOUBLE_DOUBLE is set
+#ifndef _DOUBLE_DOUBLE
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+#else
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+#endif
+
+#else
+// Without NV_KERNEL/USE_HIP (presumably the OpenCL build -- confirm) the
+//   texture names are plain aliases of the x_ and q_ identifiers
+#define pos_tex x_
+#define q_tex q_
+#ifdef LAMMPS_SMALLBIG
+#define tagint int
+#endif
+#ifdef LAMMPS_BIGBIG
+#define tagint long
+#endif
+#ifdef LAMMPS_SMALLSMALL
+#define tagint int
+#endif
+
+#endif // defined(NV_KERNEL) || defined(USE_HIP)
+
+
+#if (SHUFFLE_AVAIL == 0)
+
+// Declares the __local scratch array red_acc (6 rows of BLOCK_PAIR acctyp
+//   entries) that the no-shuffle store_answers_* macros reduce through
+#define local_allocate_store_ufld()                                         \
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+// Sum tq.x/y/z over the t_per_atom threads of an atom and store the result.
+//   With t_per_atom>1 each thread writes its components to rows 0..2 of
+//   red_acc at index tid; a halving-stride loop (simdsync() before each
+//   step) lets threads with offset<s add the entry at tid+s. The thread
+//   with offset==0 and ii<inum then writes tq to tep[i].
+//   Requires red_acc to be declared at the expansion site.
+#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
+                                tep)                                        \
+  if (t_per_atom>1) {                                                       \
+    red_acc[0][tid]=tq.x;                                                   \
+    red_acc[1][tid]=tq.y;                                                   \
+    red_acc[2][tid]=tq.z;                                                   \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<3; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    tq.x=red_acc[0][tid];                                                   \
+    tq.y=red_acc[1][tid];                                                   \
+    tq.z=red_acc[2][tid];                                                   \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    tep[i]=tq;                                                              \
+  }
+
+// Sum ufld[0..2] and dufld[0..5] over the t_per_atom threads of an atom
+//   (two passes through red_acc: 3 rows for ufld, then 6 rows for dufld),
+//   then let the thread with offset==0 and ii<inum combine them into the
+//   three components of t and store t in tep[i]; t.w is not assigned.
+//   The final expressions use dix/diy/diz and qixx..qizz, which are NOT
+//   macro parameters: they must be variables in scope at the expansion
+//   site. red_acc must also be declared there.
+#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
+                          i, tep)                                           \
+  if (t_per_atom>1) {                                                       \
+    red_acc[0][tid]=ufld[0];                                                \
+    red_acc[1][tid]=ufld[1];                                                \
+    red_acc[2][tid]=ufld[2];                                                \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<3; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    ufld[0]=red_acc[0][tid];                                                \
+    ufld[1]=red_acc[1][tid];                                                \
+    ufld[2]=red_acc[2][tid];                                                \
+    red_acc[0][tid]=dufld[0];                                               \
+    red_acc[1][tid]=dufld[1];                                               \
+    red_acc[2][tid]=dufld[2];                                               \
+    red_acc[3][tid]=dufld[3];                                               \
+    red_acc[4][tid]=dufld[4];                                               \
+    red_acc[5][tid]=dufld[5];                                               \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<6; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    dufld[0]=red_acc[0][tid];                                               \
+    dufld[1]=red_acc[1][tid];                                               \
+    dufld[2]=red_acc[2][tid];                                               \
+    dufld[3]=red_acc[3][tid];                                               \
+    dufld[4]=red_acc[4][tid];                                               \
+    dufld[5]=red_acc[5][tid];                                               \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 t;                                                              \
+    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
+      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
+    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
+      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
+    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
+      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
+    tep[i]=t;                                                               \
+  }
+
+// Sum the six entries of _fieldp over the t_per_atom threads of an atom
+//   through red_acc, then let the thread with offset==0 and ii<inum store
+//   entries 0..2 in fieldp[ii] and entries 3..5 in fieldp[ii+inum].
+//   The .w members of both stored values are not assigned, and the
+//   parameter i is not used. red_acc must be declared at the expansion
+//   site.
+#define store_answers_fieldp(_fieldp, ii, inum,tid, t_per_atom, offset, i,  \
+                              fieldp)                                       \
+  if (t_per_atom>1) {                                                       \
+    red_acc[0][tid]=_fieldp[0];                                             \
+    red_acc[1][tid]=_fieldp[1];                                             \
+    red_acc[2][tid]=_fieldp[2];                                             \
+    red_acc[3][tid]=_fieldp[3];                                             \
+    red_acc[4][tid]=_fieldp[4];                                             \
+    red_acc[5][tid]=_fieldp[5];                                             \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<6; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    _fieldp[0]=red_acc[0][tid];                                             \
+    _fieldp[1]=red_acc[1][tid];                                             \
+    _fieldp[2]=red_acc[2][tid];                                             \
+    _fieldp[3]=red_acc[3][tid];                                             \
+    _fieldp[4]=red_acc[4][tid];                                             \
+    _fieldp[5]=red_acc[5][tid];                                             \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 f, fp;                                                          \
+    f.x = _fieldp[0];                                                       \
+    f.y = _fieldp[1];                                                       \
+    f.z = _fieldp[2];                                                       \
+    fieldp[ii] = f;                                                         \
+    fp.x = _fieldp[3];                                                      \
+    fp.y = _fieldp[4];                                                      \
+    fp.z = _fieldp[5];                                                      \
+    fieldp[ii+inum] = fp;                                                   \
+  }
+
+// Reduce and accumulate force, energy and virial (no-shuffle path).
+//   - With t_per_atom>1, f is summed over the threads of an atom; energy,
+//     e_coul and virial are summed the same way only when eflag==2 or
+//     vflag==2.
+//   - The thread with offset==0 and ii<inum ADDS f to the existing ans[ii]
+//     (read-modify-write rather than overwrite).
+//   - When neither flag equals 2, energy/e_coul/virial are reduced over the
+//     block and thread 0 adds half of each to engv, starting at index
+//     BLOCK_ID_X and stepping by ev_stride.
+//   - Otherwise the offset==0 thread adds half of each to engv, starting at
+//     index ii and stepping by inum.
+//   red_acc must be declared at the expansion site.
+//   The parameter list now has the comma between t_per_atom and offset
+//   that was missing, so it matches the shuffle variants of this macro.
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
+                        offset, eflag, vflag, ans, engv, ev_stride)         \
+  if (t_per_atom>1) {                                                       \
+    simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
+    if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
+      }                                                                     \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial);       \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+  }                                                                         \
+  if (EVFLAG && (eflag || vflag)) {                                         \
+    int ei=BLOCK_ID_X;                                                      \
+    if (eflag!=2 && vflag!=2) {                                             \
+      if (eflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
+        if (vflag) __syncthreads();                                         \
+        if (tid==0) {                                                       \
+          engv[ei]+=energy*(acctyp)0.5;                                     \
+          ei+=ev_stride;                                                    \
+          engv[ei]+=e_coul*(acctyp)0.5;                                     \
+          ei+=ev_stride;                                                    \
+        }                                                                   \
+      }                                                                     \
+      if (vflag) {                                                          \
+        simdsync();                                                         \
+        block_reduce_arr(6, simd_size(), red_acc, tid, virial);             \
+        if (tid==0) {                                                       \
+          for (int r=0; r<6; r++) {                                         \
+            engv[ei]+=virial[r]*(acctyp)0.5;                                \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) {                                      \
+      int ei=ii;                                                            \
+      if (EVFLAG && eflag) {                                                \
+        engv[ei]+=energy*(acctyp)0.5;                                       \
+        ei+=inum;                                                           \
+        engv[ei]+=e_coul*(acctyp)0.5;                                       \
+        ei+=inum;                                                           \
+      }                                                                     \
+      if (EVFLAG && vflag) {                                                \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]+=virial[i]*(acctyp)0.5;                                  \
+          ei+=inum;                                                         \
+        }                                                                   \
+      }                                                                     \
+    }                                                                       \
+  }
+
+#else // SHUFFLE_AVAIL == 1
+
+// Shuffle path: expands to nothing, so no red_acc array is declared here
+#define local_allocate_store_ufld()
+
+// Shuffle variant: sum tq.x/y/z over the t_per_atom threads of an atom with
+//   shfl_down at halving strides, then let the thread with offset==0 and
+//   ii<inum write tq to tep[i]. tid is accepted but not used.
+#define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
+                          tep)                                              \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      tq.x += shfl_down(tq.x, s, t_per_atom);                               \
+      tq.y += shfl_down(tq.y, s, t_per_atom);                               \
+      tq.z += shfl_down(tq.z, s, t_per_atom);                               \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    tep[i]=tq;                                                              \
+  }
+
+// Shuffle variant: sum ufld[0..2] and dufld[0..5] over the t_per_atom
+//   threads of an atom with shfl_down, then let the thread with offset==0
+//   and ii<inum combine them into t.x/t.y/t.z and store t in tep[i]
+//   (t.w is not assigned).
+//   dix/diy/diz and qixx..qizz are NOT macro parameters: they must be
+//   variables in scope at the expansion site.
+#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
+                          i, tep)                                           \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      ufld[0] += shfl_down(ufld[0], s, t_per_atom);                         \
+      ufld[1] += shfl_down(ufld[1], s, t_per_atom);                         \
+      ufld[2] += shfl_down(ufld[2], s, t_per_atom);                         \
+      dufld[0] += shfl_down(dufld[0], s, t_per_atom);                       \
+      dufld[1] += shfl_down(dufld[1], s, t_per_atom);                       \
+      dufld[2] += shfl_down(dufld[2], s, t_per_atom);                       \
+      dufld[3] += shfl_down(dufld[3], s, t_per_atom);                       \
+      dufld[4] += shfl_down(dufld[4], s, t_per_atom);                       \
+      dufld[5] += shfl_down(dufld[5], s, t_per_atom);                       \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 t;                                                              \
+    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
+      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
+    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
+      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
+    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
+      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
+    tep[i]=t;                                                               \
+  }
+
+// Shuffle variant: sum the six entries of _fieldp over the t_per_atom
+//   threads of an atom with shfl_down, then let the thread with offset==0
+//   and ii<inum store entries 0..2 in fieldp[ii] and entries 3..5 in
+//   fieldp[ii+inum]. The .w members are not assigned; tid and i are not
+//   used.
+#define store_answers_fieldp(_fieldp, ii, inum, tid, t_per_atom, offset, i, \
+                             fieldp)                                        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      _fieldp[0] += shfl_down(_fieldp[0], s, t_per_atom);                   \
+      _fieldp[1] += shfl_down(_fieldp[1], s, t_per_atom);                   \
+      _fieldp[2] += shfl_down(_fieldp[2], s, t_per_atom);                   \
+      _fieldp[3] += shfl_down(_fieldp[3], s, t_per_atom);                   \
+      _fieldp[4] += shfl_down(_fieldp[4], s, t_per_atom);                   \
+      _fieldp[5] += shfl_down(_fieldp[5], s, t_per_atom);                   \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 f, fp;                                                          \
+    f.x = _fieldp[0];                                                       \
+    f.y = _fieldp[1];                                                       \
+    f.z = _fieldp[2];                                                       \
+    fieldp[ii] = f;                                                         \
+    fp.x = _fieldp[3];                                                      \
+    fp.y = _fieldp[4];                                                      \
+    fp.z = _fieldp[5];                                                      \
+    fieldp[ii+inum] = fp;                                                   \
+  }
+
+#if (EVFLAG == 1)
+
+// Shuffle variant with energy/virial support (EVFLAG == 1).
+//   - f is summed over the t_per_atom threads of an atom; energy, e_coul
+//     and virial are summed the same way only when eflag==2 or vflag==2.
+//   - The thread with offset==0 and ii<inum ADDS f to the existing ans[ii].
+//   - When neither flag equals 2, values are reduced within each subgroup
+//     of simd_size() threads, staged in red_acc (rows 0..5 virial, rows 6
+//     and 7 energy and e_coul) and reduced again until one subgroup is
+//     left; thread 0 then adds half of each total to engv, starting at
+//     BLOCK_ID_X and stepping by ev_stride.
+//   - Otherwise the offset==0 thread adds half of each value to engv,
+//     starting at index ii and stepping by inum.
+//   NOTE(review): local_allocate_store_ufld() is empty in this branch, so
+//   red_acc (with at least 8 rows) has to come from another declaration at
+//   the expansion site -- confirm which one provides it.
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
+                        offset, eflag, vflag, ans, engv, ev_stride)         \
+  if (t_per_atom>1) {                                                       \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+    if (vflag==2 || eflag==2) {                                             \
+      if (eflag)                                                            \
+        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
+      if (vflag)                                                            \
+        simd_reduce_arr(6, t_per_atom,virial);                              \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+  }                                                                         \
+  if (eflag || vflag) {                                                     \
+    if (eflag!=2 && vflag!=2) {                                             \
+      const int vwidth = simd_size();                                       \
+      const int voffset = tid & (simd_size() - 1);                          \
+      const int bnum = tid/simd_size();                                     \
+      int active_subgs = BLOCK_SIZE_X/simd_size();                          \
+      for ( ; active_subgs > 1; active_subgs /= vwidth) {                   \
+        if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads();       \
+        if (bnum < active_subgs) {                                          \
+          if (eflag) {                                                      \
+            simd_reduce_add2(vwidth, energy, e_coul);                       \
+            if (voffset==0) {                                               \
+              red_acc[6][bnum] = energy;                                    \
+              red_acc[7][bnum] = e_coul;                                    \
+            }                                                               \
+          }                                                                 \
+          if (vflag) {                                                      \
+            simd_reduce_arr(6, vwidth, virial);                             \
+            if (voffset==0)                                                 \
+              for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r];           \
+          }                                                                 \
+        }                                                                   \
+                                                                            \
+        __syncthreads();                                                    \
+        if (tid < active_subgs) {                                           \
+          if (eflag) {                                                      \
+            energy = red_acc[6][tid];                                       \
+            e_coul = red_acc[7][tid];                                       \
+          }                                                                 \
+          if (vflag)                                                        \
+            for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid];        \
+        } else {                                                            \
+          if (eflag) energy = e_coul = (acctyp)0;                           \
+          if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0;     \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      if (bnum == 0) {                                                      \
+        int ei=BLOCK_ID_X;                                                  \
+        if (eflag) {                                                        \
+          simd_reduce_add2(vwidth, energy, e_coul);                         \
+          if (tid==0) {                                                     \
+            engv[ei]+=energy*(acctyp)0.5;                                   \
+            ei+=ev_stride;                                                  \
+            engv[ei]+=e_coul*(acctyp)0.5;                                   \
+            ei+=ev_stride;                                                  \
+          }                                                                 \
+        }                                                                   \
+        if (vflag) {                                                        \
+          simd_reduce_arr(6, vwidth, virial);                               \
+          if (tid==0) {                                                     \
+            for (int r=0; r<6; r++) {                                       \
+              engv[ei]+=virial[r]*(acctyp)0.5;                              \
+              ei+=ev_stride;                                                \
+            }                                                               \
+          }                                                                 \
+        }                                                                   \
+      }                                                                     \
+    } else if (offset==0 && ii<inum) {                                      \
+      int ei=ii;                                                            \
+      if (eflag) {                                                          \
+        engv[ei]+=energy*(acctyp)0.5;                                       \
+        ei+=inum;                                                           \
+        engv[ei]+=e_coul*(acctyp)0.5;                                       \
+        ei+=inum;                                                           \
+      }                                                                     \
+      if (vflag) {                                                          \
+        for (int i=0; i<6; i++) {                                           \
+          engv[ei]+=virial[i]*(acctyp)0.5;                                  \
+          ei+=inum;                                                         \
+        }                                                                   \
+      }                                                                     \
+    }                                                                       \
+  }
+
+// EVFLAG == 0
+#else
+
+// Shuffle variant without energy/virial support (EVFLAG == 0): only the
+//   force is handled. f is summed over the t_per_atom threads of an atom
+//   and the thread with offset==0 and ii<inum ADDS it to the existing
+//   ans[ii]. The energy, virial, flag and engv arguments are ignored.
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
+                        offset, eflag, vflag, ans, engv, ev_stride)         \
+  if (t_per_atom>1)                                                         \
+    simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
+  if (offset==0 && ii<inum) {                                               \
+    acctyp4 old=ans[ii];                                                    \
+    old.x+=f.x;                                                             \
+    old.y+=f.y;                                                             \
+    old.z+=f.z;                                                             \
+    ans[ii]=old;                                                            \
+  }
+
+#endif // EVFLAG
+#endif // SHUFFLE_AVAIL
+
+// Smaller of A and B; note that each argument may be evaluated twice
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+// The literal 1.77245385090551602729 (numerically sqrt(pi)) cast to acctyp
+#define MY_PIS (acctyp)1.77245385090551602729
+
+/* ----------------------------------------------------------------------
+   dispersion = real-space portion of Ewald dispersion
+   adapted from Tinker edreal1d() routine
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff_amtype,
+                                 const __global numtyp4 *restrict coeff_amclass,
+                                 const __global numtyp4 *restrict sp_nonpolar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp off2)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  if (ii<inum) {
+    int itype,iclass;
+    numtyp ci,ai;
+
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    itype  = polar3[i].z;            // amtype[i];
+    iclass = coeff_amtype[itype].w;  // amtype2class[itype];
+    ci = coeff_amclass[iclass].x;    // csix[iclass];
+    ai = coeff_amclass[iclass].y;    // adisp[iclass];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = ix.x - jx.x;
+      numtyp yr = ix.y - jx.y;
+      numtyp zr = ix.z - jx.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+  
+      int jtype =   polar3[j].z; // amtype[j];
+      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];
+      numtyp ck = coeff_amclass[jclass].x;    // csix[jclass];
+      numtyp ak = coeff_amclass[jclass].y;    // adisp[jclass];
+
+      numtyp r6 = r2*r2*r2;
+      numtyp ralpha2 = r2 * aewald*aewald;
+      numtyp term = (numtyp)1.0 + ralpha2 + (numtyp)0.5*ralpha2*ralpha2;
+      numtyp expterm = ucl_exp(-ralpha2);
+      numtyp expa = expterm * term;
+
+      // find the damping factor for the dispersion interaction
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp r7 = r6 * r;
+      numtyp di = ai * r;
+      numtyp di2 = di * di;
+      numtyp di3 = di * di2;
+      numtyp dk = ak * r;
+      numtyp expi = ucl_exp(-di);
+      numtyp expk = ucl_exp(-dk);
+     
+      numtyp ai2,ak2;
+      numtyp di4,di5;
+      numtyp dk2,dk3;
+      numtyp ti,ti2;
+      numtyp tk,tk2;
+      numtyp damp3,damp5;
+      numtyp ddamp;
+      numtyp factor_disp = (numtyp)1.0; // factor_disp = special_disp[sbmask15(j)];
+
+      if (ai != ak) {
+        ai2 = ai * ai;
+        ak2 = ak * ak;
+        dk2 = dk * dk;
+        dk3 = dk * dk2;
+        ti = ak2 / (ak2-ai2);
+        ti2 = ti * ti;
+        tk = ai2 / (ai2-ak2);
+        tk2 = tk * tk;
+        damp3 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2) * expi
+          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2) * expk
+          - (numtyp)2.0*ti2*tk * ((numtyp)1.0 + di)* expi
+          - (numtyp)2.0*tk2*ti * ((numtyp)1.0 + dk) *expk;
+        damp5 = (numtyp)1.0 - ti2*((numtyp)1.0 + di + (numtyp)0.5*di2 + di3/(numtyp)6.0) * expi
+          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk
+          - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi
+          - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk;
+        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + 
+          (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0);
+
+      } else {
+        di4 = di2 * di2;
+        di5 = di2 * di3;
+        damp3 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + (numtyp)7.0*di3/(numtyp)48.0+di4/(numtyp)48.0)*expi;
+        damp5 = (numtyp)1.0 - ((numtyp)1.0+di+(numtyp)0.5*di2 + di3/(numtyp)6.0+di4/(numtyp)24.0+di5/(numtyp)144.0)*expi;
+        ddamp = ai * expi * (di5-(numtyp)3.0*di3-(numtyp)3.0*di2) / (numtyp)96.0;
+      }
+
+      numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3;
+      
+      // apply damping and scaling factors for this interaction
+
+      numtyp scale = factor_disp * damp*damp;
+      scale = scale - (numtyp )1.0;
+      numtyp e = -ci * ck * (expa+scale) / r6;
+      numtyp rterm = -ucl_powr(ralpha2,(numtyp)3.0) * expterm / r;
+      numtyp de = (numtyp)-6.0*e/r2 - ci*ck*rterm/r7 - (numtyp)2.0*ci*ck*factor_disp*damp*ddamp/r7;
+
+      energy+= e;
+
+      // increment the damped dispersion derivative components
+
+      numtyp dedx = de * xr;
+      numtyp dedy = de * yr;
+      numtyp dedz = de * zr;
+      f.x += dedx;
+      f.y += dedy;
+      f.z += dedz;
+      
+      // increment the internal virial tensor components
+
+      numtyp vxx = xr * dedx;
+      numtyp vyx = yr * dedx;
+      numtyp vzx = zr * dedx;
+      numtyp vyy = yr * dedy;
+      numtyp vzy = zr * dedy;
+      numtyp vzz = zr * dedz;
+
+      virial[0] += vxx;
+      virial[1] += vyy;
+      virial[2] += vzz;
+      virial[3] += vyx;
+      virial[4] += vzx;
+      virial[5] += vzy;
+    } // nbor
+    
+  } // ii<inum
+
+  // accumulate force, energy and virial
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv);
+}
+
+/* ----------------------------------------------------------------------
+   multipole_real = real-space portion of multipole
+   adapted from Tinker emreal1d() routine
+
+   For each local atom i the kernel walks its neighbors j and accumulates
+   the real-space Ewald permanent multipole (charge, dipole, quadrupole)
+   energy, force and virial, plus the torque acting on site i.
+
+   extra     packed per-atom data, read as numtyp4 records:
+               extra[0]       -> rpole[0..3]
+               extra[4*nall]  -> rpole[4], rpole[5], rpole[6], rpole[8]
+               extra[8*nall]  -> rpole[9], rpole[12], amtype, amgroup
+   sp_polar  special-neighbor scale factors indexed by sbmask15(); only
+             the .w component (multipole scaling) is used here
+   ans,engv  force / energy / virial output through store_answers_q()
+   tep       torque output through store_answers_amoeba_tq()
+   aewald    multiplies r in the erfc and exp terms of the Ewald recursion
+   felec     prefactor applied to every reciprocal-distance term
+
+   coeff, off2, polar_dscale and polar_uscale are not read directly by
+   this kernel body.
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 __global acctyp4 *restrict tep,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp felec, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  // per-thread accumulators; energy and virial are only initialized when
+  // EVFLAG is set, so they must not be read or updated otherwise
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  acctyp4 tq;
+  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
+
+  // views into the packed per-atom multipole data
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  if (ii<inum) {
+    int m;
+    numtyp bfac;
+    numtyp term1,term2,term3;
+    numtyp term4,term5,term6;
+    numtyp bn[6];
+    numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    ci  = polar1[i].x;    // rpole[i][0];
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+
+    // starting values of the Ewald recursion depend only on aewald,
+    // so compute them once per atom instead of once per neighbor
+    const numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
+    numtyp alsq2n0 = (numtyp)0.0;
+    if (aewald > (numtyp)0.0) alsq2n0 = (numtyp)1.0 / (MY_PIS*aewald);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      // neighbor entry carries the atom index plus special-bond bits
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+
+      numtyp dik = dix*dkx + diy*dky + diz*dkz;
+      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
+      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
+      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
+        qixx*qkxx + qiyy*qkyy + qizz*qkzz;
+
+      // additional intermediates involving moments and distance
+
+      numtyp dirx = diy*zr - diz*yr;
+      numtyp diry = diz*xr - dix*zr;
+      numtyp dirz = dix*yr - diy*xr;
+      numtyp dkrx = dky*zr - dkz*yr;
+      numtyp dkry = dkz*xr - dkx*zr;
+      numtyp dkrz = dkx*yr - dky*xr;
+      numtyp dikx = diy*dkz - diz*dky;
+      numtyp diky = diz*dkx - dix*dkz;
+      numtyp dikz = dix*dky - diy*dkx;
+      numtyp qirx = qiz*yr - qiy*zr;
+      numtyp qiry = qix*zr - qiz*xr;
+      numtyp qirz = qiy*xr - qix*yr;
+      numtyp qkrx = qkz*yr - qky*zr;
+      numtyp qkry = qkx*zr - qkz*xr;
+      numtyp qkrz = qky*xr - qkx*yr;
+      numtyp qikx = qky*qiz - qkz*qiy;
+      numtyp qiky = qkz*qix - qkx*qiz;
+      numtyp qikz = qkx*qiy - qky*qix;
+      numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
+      numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
+      numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
+      numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
+      numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
+      numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
+      numtyp qikrx = qizk*yr - qiyk*zr;
+      numtyp qikry = qixk*zr - qizk*xr;
+      numtyp qikrz = qiyk*xr - qixk*yr;
+      numtyp qkirx = qkzi*yr - qkyi*zr;
+      numtyp qkiry = qkxi*zr - qkzi*xr;
+      numtyp qkirz = qkyi*xr - qkxi*yr;
+      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
+      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
+      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
+      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
+      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
+      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
+      numtyp diqkrx = diqkz*yr - diqky*zr;
+      numtyp diqkry = diqkx*zr - diqkz*xr;
+      numtyp diqkrz = diqky*xr - diqkx*yr;
+      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
+      numtyp dkqiry = dkqix*zr - dkqiz*xr;
+      numtyp dkqirz = dkqiy*xr - dkqix*yr;
+      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
+        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
+      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
+        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
+      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
+        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);
+
+      // get reciprocal distance terms for this interaction
+
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = felec * rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
+      numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+
+      // restart the recursion factor from its per-atom starting value
+      numtyp alsq2n = alsq2n0;
+      for (m = 1; m < 6; m++) {
+        bfac = (numtyp) (m+m-1);
+        alsq2n = alsq2 * alsq2n;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
+      }
+      for (m = 0; m < 6; m++) bn[m] *= felec;
+
+      term1 = ci*ck;
+      term2 = ck*dir - ci*dkr + dik;
+      term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
+      term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik;
+      term5 = qir*qkr;
+
+      // subtract the (1 - factor_mpole) fraction of the plain 1/r^n terms
+      // from the Ewald-damped ones
+      numtyp scalek = (numtyp)1.0 - factor_mpole;
+      rr1 = bn[0] - scalek*rr1;
+      rr3 = bn[1] - scalek*rr3;
+      rr5 = bn[2] - scalek*rr5;
+      rr7 = bn[3] - scalek*rr7;
+      rr9 = bn[4] - scalek*rr9;
+      rr11 = bn[5] - scalek*rr11;
+      numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9;
+
+      // find standard multipole intermediates for force and torque
+
+      numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11;
+      term1 = -ck*rr3 + dkr*rr5 - qkr*rr7;
+      term2 = ci*rr3 + dir*rr5 + qir*rr7;
+      term3 = (numtyp)2.0 * rr5;
+      term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
+      term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9);
+      term6 = (numtyp)4.0 * rr7;
+
+      // energy is only initialized under EVFLAG; accumulate it only when
+      // an energy tally was requested, mirroring the virial guard below
+      if (EVFLAG && eflag) energy += e;
+
+      // compute the force components for this interaction
+
+      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
+        term4*qix + term5*qkx + term6*(qixk+qkxi);
+      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
+        term4*qiy + term5*qky + term6*(qiyk+qkyi);
+      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
+        term4*qiz + term5*qkz + term6*(qizk+qkzi);
+
+      // compute the torque components for this interaction
+
+      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
+        term4*qirx - term6*(qikrx+qikx);
+      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
+        term4*qiry - term6*(qikry+qiky);
+      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
+        term4*qirz - term6*(qikrz+qikz);
+
+      // increment force-based gradient and torque on first site
+
+      f.x += frcx;
+      f.y += frcy;
+      f.z += frcz;
+      tq.x += ttmix;
+      tq.y += ttmiy;
+      tq.z += ttmiz;
+
+      // symmetrized virial tensor contribution of this pair
+      if (EVFLAG && vflag) {
+        numtyp vxx = -xr * frcx;
+        numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
+        numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
+        numtyp vyy = -yr * frcy;
+        numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
+        numtyp vzz = -zr * frcz;
+
+        virial[0] += vxx;
+        virial[1] += vyy;
+        virial[2] += vzz;
+        virial[3] += vxy;
+        virial[4] += vxz;
+        virial[5] += vyz;
+      }
+    } // nbor
+
+  } // ii<inum
+
+  // accumulate tq
+  store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
+
+  // accumulate force, energy and virial: use _acc if not the first kernel
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv);
+  //store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+  //   offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
+}
+
+/* ----------------------------------------------------------------------
+  udirect2b = Ewald real direct field via list
+  udirect2b computes the real space contribution of the permanent
+   atomic multipole moments to the field via a neighbor list
+
+   For each local atom i the kernel sums, over its neighbors j, the field
+   produced at i by the permanent charge, dipole and quadrupole of j.
+   Two fields are accumulated with different scale factors:
+     _fieldp[0..2]  uses factor_dscale
+     _fieldp[3..5]  uses factor_pscale
+   and both are written out through store_answers_fieldp().
+
+   extra     packed per-atom data, read as numtyp4 records:
+               extra[0]       -> rpole[0..3]
+               extra[4*nall]  -> rpole[4], rpole[5], rpole[6], rpole[8]
+               extra[8*nall]  -> rpole[9], rpole[12], amtype, amgroup
+   coeff     per-type damping parameters: .x pdamp, .y thole, .z dirdamp
+   sp_polar  special-neighbor scale factors indexed by sbmask15():
+             .y is used when i and j share a group, .z otherwise
+   aewald    multiplies r in the erfc and exp terms of the Ewald recursion
+
+   off2 and polar_uscale are not read directly by this kernel body.
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  // [0..2] accumulate the d-scaled field, [3..5] the p-scaled field
+  acctyp _fieldp[6];
+  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
+
+  // views into the packed per-atom multipole data
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    numtyp bn[4],bcn[3];
+    numtyp fid[3],fip[3];
+
+    // only the type and group of site i are needed: the field at i is
+    // built from the moments of j, so the multipoles of i are not loaded
+    const int itype  = polar3[i].z; // amtype[i];
+    const int igroup = polar3[i].w; // amgroup[i];
+
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
+    numtyp ddi = coeff[itype].z;
+
+    // starting values of the Ewald recursion, identical for all neighbors
+    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
+    numtyp aesq2n = (numtyp)0.0;
+    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      // neighbor entry carries the atom index plus special-bond bits
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr3 = rinv * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+
+      // pick the scale factors depending on whether i and j share a group
+      numtyp factor_dscale, factor_pscale;
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      if (igroup == jgroup) {
+        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
+        factor_dscale = polar_dscale;
+      } else {
+        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
+        factor_dscale = (numtyp)1.0;
+      }
+
+      // intermediates involving moments of j and separation distance
+
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+
+      numtyp aefac = aesq2n;
+      for (int m = 1; m <= 3; m++) {
+        numtyp bfac = (numtyp) (m+m-1);
+        aefac = aesq2 * aefac;
+        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
+      }
+
+      // find the field components for Thole polarization damping;
+      // the scale factors stay at 1 (no damping) when the product of the
+      // pdamp values is zero or the damping exponent reaches 50
+
+      numtyp scale3 = (numtyp)1.0;
+      numtyp scale5 = (numtyp)1.0;
+      numtyp scale7 = (numtyp)1.0;
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype]
+        if (pgamma != (numtyp)0.0) {
+          // direct damping: exponent uses (r/damp)^1.5
+          damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp) ;
+            scale3 = (numtyp)1.0 - expdamp ;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.5*damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
+          }
+        } else {
+          // fall back to the thole parameter: exponent uses (r/damp)^3
+          pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
+          damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+          if (damp < (numtyp)50.0) {
+            numtyp expdamp = ucl_exp(-damp);
+            scale3 = (numtyp)1.0 - expdamp;
+            scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+            scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp + (numtyp)0.6*damp*damp);
+          }
+        }
+      }
+
+      // d-scaled field: the radial bracket is shared by all three components
+      numtyp scalek = factor_dscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      numtyp radial = bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr;
+      fid[0] = -xr*radial - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fid[1] = -yr*radial - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fid[2] = -zr*radial - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+
+      // p-scaled field: same expression with factor_pscale
+      scalek = factor_pscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+      bcn[2] = bn[3] - ((numtyp)1.0-scalek*scale7)*rr7;
+      radial = bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr;
+      fip[0] = -xr*radial - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
+      fip[1] = -yr*radial - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
+      fip[2] = -zr*radial - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
+
+      _fieldp[0] += fid[0];
+      _fieldp[1] += fid[1];
+      _fieldp[2] += fid[2];
+      _fieldp[3] += fip[0];
+      _fieldp[4] += fip[1];
+      _fieldp[5] += fip[2];
+    }  // nbor
+
+  } // ii<inum
+
+  // accumulate field and fieldp
+
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
+}
+
+/* ----------------------------------------------------------------------
+  umutual2b = Ewald real mutual field via list
+   umutual2b computes the real space contribution of the induced
+   atomic dipole moments to the field via a neighbor list
+
+   For each local atom i the kernel sums, over its neighbors j, the
+   dipole-dipole interaction tensor applied to the induced dipoles of j:
+     _fieldp[0..2]  field from uind[j]
+     _fieldp[3..5]  field from uinp[j]
+   and writes both through store_answers_fieldp().
+
+   extra     packed per-atom data, read as numtyp4 records:
+               extra[8*nall]   -> (.z) amtype, (.w) amgroup
+               extra[12*nall]  -> uind
+               extra[16*nall]  -> uinp
+   coeff     per-type damping parameters: .x pdamp, .y thole
+   aewald    multiplies r in the erfc and exp terms of the Ewald recursion
+   polar_uscale  scale factor applied when i and j share a group
+
+   sp_polar, off2 and polar_dscale are not read directly by this kernel body.
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict fieldp,
+                                 const int inum,  const int nall,
+                                 const int nbor_pitch, const int t_per_atom,
+                                 const numtyp aewald, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  // [0..2] accumulate the field from uind, [3..5] the field from uinp
+  acctyp _fieldp[6];
+  for (int l=0; l<6; l++) _fieldp[l]=(acctyp)0;
+
+  // views into the packed per-atom data
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
+  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    // only bn[1] and bn[2] enter the dipole-dipole tensor below
+    numtyp bn[3],bcn[2];
+    numtyp fid[3],fip[3];
+
+    const int itype  = polar3[i].z; // amtype[i];
+    const int igroup = polar3[i].w; // amgroup[i];
+
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
+
+    // starting values of the Ewald recursion, identical for all neighbors
+    numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
+    numtyp aesq2n = (numtyp)0.0;
+    if (aewald > (numtyp)0.0) aesq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      // neighbor entry carries the atom index plus special-bond bits
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr3 = rinv * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+      numtyp ukx = polar4[j].x;  // uind[j][0];
+      numtyp uky = polar4[j].y;  // uind[j][1];
+      numtyp ukz = polar4[j].z;  // uind[j][2];
+      numtyp ukxp = polar5[j].x; // uinp[j][0];
+      numtyp ukyp = polar5[j].y; // uinp[j][1];
+      numtyp ukzp = polar5[j].z; // uinp[j][2];
+
+      // polar_uscale applies only to pairs within the same group
+      numtyp factor_uscale;
+      if (igroup == jgroup) factor_uscale = polar_uscale;
+      else factor_uscale = (numtyp)1.0;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+
+      // the recursion stops at m = 2: higher orders are never used here
+      numtyp aefac = aesq2n;
+      for (int m = 1; m <= 2; m++) {
+        numtyp bfac = (numtyp) (m+m-1);
+        aefac = aesq2 * aefac;
+        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * r2inv;
+      }
+
+      // find terms needed later to compute mutual polarization
+      // if (poltyp != DIRECT)
+      // the scale factors stay at 1 (no damping) when the product of the
+      // pdamp values is zero or the damping exponent reaches 50
+      numtyp scale3 = (numtyp)1.0;
+      numtyp scale5 = (numtyp)1.0;
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
+        damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+        if (damp < (numtyp)50.0) {
+          numtyp expdamp = ucl_exp(-damp);
+          scale3 = (numtyp)1.0 - expdamp;
+          scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
+        }
+      }
+
+      numtyp scalek = factor_uscale;
+      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
+      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
+
+      // symmetric tensor stored as xx, xy, xz, yy, yz, zz
+      numtyp tdipdip[6]; // the following tdipdip is incorrect!! needs work to store tdipdip
+      tdipdip[0] = -bcn[0] + bcn[1]*xr*xr;
+      tdipdip[1] = bcn[1]*xr*yr;
+      tdipdip[2] = bcn[1]*xr*zr;
+      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
+      tdipdip[4] = bcn[1]*yr*zr;
+      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;
+
+      // field at i from the induced dipoles uind[j]
+      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
+      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
+      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
+
+      // field at i from the induced dipoles uinp[j]
+      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
+      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
+      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
+
+      _fieldp[0] += fid[0];
+      _fieldp[1] += fid[1];
+      _fieldp[2] += fid[2];
+      _fieldp[3] += fip[0];
+      _fieldp[4] += fip[1];
+      _fieldp[5] += fip[2];
+    }  // nbor
+
+  } // ii<inum
+
+  // accumulate field and fieldp
+
+  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
+}
+
+/* ----------------------------------------------------------------------
+   polar_real = real-space portion of induced dipole polarization
+   adapted from Tinker epreal1d() routine
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
+                             const __global numtyp *restrict extra,
+                             const __global numtyp4 *restrict coeff,
+                             const __global numtyp4 *restrict sp_polar,
+                             const __global int *dev_nbor,
+                             const __global int *dev_packed,
+                             const __global int *dev_short_nbor,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv,
+                             __global acctyp4 *restrict tep,
+                             const int eflag, const int vflag, const int inum,
+                             const int nall, const int nbor_pitch, const int t_per_atom,
+                             const numtyp aewald, const numtyp felec,
+                             const numtyp off2, const numtyp polar_dscale,
+                             const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_ufld();
+  local_allocate_store_charge();
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  acctyp ufld[3];
+  ufld[0] = (acctyp)0; ufld[1]=(acctyp)0; ufld[2]=(acctyp)0;
+  acctyp dufld[6];
+  for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
+
+  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
+  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
+
+  //numtyp4 xi__;
+
+  if (ii<inum) {
+    int k,m,itype,igroup;
+    numtyp bfac;
+    numtyp psc3,psc5,psc7;
+    numtyp dsc3,dsc5,dsc7;
+    numtyp usc3,usc5;
+    numtyp psr3,psr5,psr7;
+    numtyp dsr3,dsr5,dsr7;
+    numtyp usr5;
+    numtyp term1,term2,term3;
+    numtyp term4,term5;
+    numtyp term6,term7;
+    numtyp rc3[3],rc5[3],rc7[3];
+    numtyp prc3[3],prc5[3],prc7[3];
+    numtyp drc3[3],drc5[3],drc7[3];
+    numtyp urc3[3],urc5[3];
+    numtyp bn[5];
+    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
+
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    ci  = polar1[i].x;    // rpole[i][0];
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+    uix = polar4[i].x;    // uind[i][0];
+    uiy = polar4[i].y;    // uind[i][1];
+    uiz = polar4[i].z;    // uind[i][2];
+    uixp = polar5[i].x;   // uinp[i][0];
+    uiyp = polar5[i].y;   // uinp[i][1];
+    uizp = polar5[i].z;   // uinp[i][2];
+
+    // debug:
+    // xi__ = ix; xi__.w = itype;
+
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+  
+      numtyp r = ucl_sqrt(r2);
+      
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+      numtyp ukx = polar4[j].x;  // uind[j][0];
+      numtyp uky = polar4[j].y;  // uind[j][1];
+      numtyp ukz = polar4[j].z;  // uind[j][2];
+      numtyp ukxp = polar5[j].x; // uinp[j][0];
+      numtyp ukyp = polar5[j].y; // uinp[j][1];
+      numtyp ukzp = polar5[j].z; // uinp[j][2];
+
+      numtyp factor_dscale, factor_pscale, factor_uscale;
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      if (igroup == jgroup) {
+        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
+        factor_dscale = polar_dscale;
+        factor_uscale = polar_uscale;
+      } else {
+        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
+        factor_dscale = factor_uscale = (numtyp)1.0;
+      }
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+      numtyp uir = uix*xr + uiy*yr + uiz*zr;
+      numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
+      numtyp ukr = ukx*xr + uky*yr + ukz*zr;
+      numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
+
+      // get reciprocal distance terms for this interaction
+
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = felec * rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
+      numtyp alsq2n = (numtyp)0.0;
+      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+      for (m = 1; m <= 4; m++) {
+        bfac = (numtyp) (m+m-1);
+        alsq2n = alsq2 * alsq2n;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
+      }
+      for (m = 0; m < 5; m++) bn[m] *= felec;
+
+      // apply Thole polarization damping to scale factors
+
+      numtyp sc3 = (numtyp)1.0;
+      numtyp sc5 = (numtyp)1.0;
+      numtyp sc7 = (numtyp)1.0;
+      for (k = 0; k < 3; k++) {
+        rc3[k] = (numtyp)0.0;
+        rc5[k] = (numtyp)0.0;
+        rc7[k] = (numtyp)0.0;
+      }
+
+      // apply Thole polarization damping to scale factors
+
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
+        damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+        if (damp < (numtyp)50.0) {
+          numtyp expdamp = ucl_exp(-damp);
+          sc3 = (numtyp)1.0 - expdamp;
+          sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp;
+          sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp;
+          numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv;
+          numtyp temp5 = damp;
+          numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp;
+          rc3[0] = xr * temp3;
+          rc3[1] = yr * temp3;
+          rc3[2] = zr * temp3;
+          rc5[0] = rc3[0] * temp5;
+          rc5[1] = rc3[1] * temp5;
+          rc5[2] = rc3[2] * temp5;
+          rc7[0] = rc5[0] * temp7;
+          rc7[1] = rc5[1] * temp7;
+          rc7[2] = rc5[2] * temp7;
+        }
+
+        psc3 = (numtyp)1.0 - sc3*factor_pscale;
+        psc5 = (numtyp)1.0 - sc5*factor_pscale;
+        psc7 = (numtyp)1.0 - sc7*factor_pscale;
+        dsc3 = (numtyp)1.0 - sc3*factor_dscale;
+        dsc5 = (numtyp)1.0 - sc5*factor_dscale;
+        dsc7 = (numtyp)1.0 - sc7*factor_dscale;
+        usc3 = (numtyp)1.0 - sc3*factor_uscale;
+        usc5 = (numtyp)1.0 - sc5*factor_uscale;
+        psr3 = bn[1] - psc3*rr3;
+        psr5 = bn[2] - psc5*rr5;
+        psr7 = bn[3] - psc7*rr7;
+        dsr3 = bn[1] - dsc3*rr3;
+        dsr5 = bn[2] - dsc5*rr5;
+        dsr7 = bn[3] - dsc7*rr7;
+        usr5 = bn[2] - usc5*rr5;
+        for (k = 0; k < 3; k++) {
+          prc3[k] = rc3[k] * factor_pscale;
+          prc5[k] = rc5[k] * factor_pscale;
+          prc7[k] = rc7[k] * factor_pscale;
+          drc3[k] = rc3[k] * factor_dscale;
+          drc5[k] = rc5[k] * factor_dscale;
+          drc7[k] = rc7[k] * factor_dscale;
+          urc3[k] = rc3[k] * factor_uscale;
+          urc5[k] = rc5[k] * factor_uscale;
+        }
+      } else { // damp == 0: ???
+      }
+
+      // get the induced dipole field used for dipole torques
+
+      numtyp tix3 = psr3*ukx + dsr3*ukxp;
+      numtyp tiy3 = psr3*uky + dsr3*ukyp;
+      numtyp tiz3 = psr3*ukz + dsr3*ukzp;
+      numtyp tuir = -psr5*ukr - dsr5*ukrp;
+      
+      ufld[0] += tix3 + xr*tuir;
+      ufld[1] += tiy3 + yr*tuir;
+      ufld[2] += tiz3 + zr*tuir;
+
+      // get induced dipole field gradient used for quadrupole torques
+
+      numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp);
+      numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
+      numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
+      tuir = -psr7*ukr - dsr7*ukrp;
+      
+      dufld[0] += xr*tix5 + xr*xr*tuir;
+      dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
+      dufld[2] += yr*tiy5 + yr*yr*tuir;
+      dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
+      dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
+      dufld[5] += zr*tiz5 + zr*zr*tuir;
+      
+      // get the dEd/dR terms used for direct polarization force
+
+      term1 = bn[2] - dsc3*rr5;
+      term2 = bn[3] - dsc5*rr7;
+      term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0];
+      term4 = rr3*drc3[0] - term1*xr - dsr5*xr;
+      term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
+      term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
+      numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
+      numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
+
+      term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1];
+      term4 = rr3*drc3[1] - term1*yr - dsr5*yr;
+      term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1];
+      term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1];
+      term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr;
+      numtyp tiyy = ci*term3 + diy*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
+      numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2];
+      term4 = rr3*drc3[2] - term1*zr - dsr5*zr;
+      term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2];
+      term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2];
+      term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr;
+      numtyp tizz = ci*term3 + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      term3 = term1*xr*yr - rr3*yr*drc3[0];
+      term4 = rr3*drc3[0] - term1*xr;
+      term5 = term2*xr*yr - rr5*yr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0];
+      term7 = rr5*drc5[0] - term2*xr;
+      numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
+      numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6;
+
+      term3 = term1*xr*zr - rr3*zr*drc3[0];
+      term5 = term2*xr*zr - rr5*zr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0];
+      numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      term3 = term1*yr*zr - rr3*zr*drc3[1];
+      term4 = rr3*drc3[1] - term1*yr;
+      term5 = term2*yr*zr - rr5*zr*drc5[1];
+      term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1];
+      term7 = rr5*drc5[1] - term2*yr;
+      numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp;
+      numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp;
+      numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp;
+
+      numtyp frcx = depx;
+      numtyp frcy = depy;
+      numtyp frcz = depz;
+
+      // get the dEp/dR terms used for direct polarization force
+      
+      // tixx and tkxx
+      term1 = bn[2] - psc3*rr5;
+      term2 = bn[3] - psc5*rr7;
+      term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0];
+      term4 = rr3*prc3[0] - term1*xr - psr5*xr;
+      term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0];
+      term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr;
+      tixx = ci*term3 + dix*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
+      tkxx = ck*term3 - dkx*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
+
+      // tiyy and tkyy
+      term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1];
+      term4 = rr3*prc3[1] - term1*yr - psr5*yr;
+      term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1];
+      term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1];
+      term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr;
+      tiyy = ci*term3 + diy*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
+      tkyy = ck*term3 - dky*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      // tizz and tkzz
+      term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2];
+      term4 = rr3*prc3[2] - term1*zr - psr5*zr;
+      term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2];
+      term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2];
+      term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr;
+      tizz = ci*term3 + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkzz = ck*term3 - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      // tixy and tkxy
+      term3 = term1*xr*yr - rr3*yr*prc3[0];
+      term4 = rr3*prc3[0] - term1*xr;
+      term5 = term2*xr*yr - rr5*yr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0];
+      term7 = rr5*prc5[0] - term2*xr;
+      tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
+      tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      // tixz and tkxz
+      term3 = term1*xr*zr - rr3*zr*prc3[0];
+      term5 = term2*xr*zr - rr5*zr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0];
+      tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      // tiyz and tkyz
+      term3 = term1*yr*zr - rr3*zr*prc3[1];
+      term4 = rr3*prc3[1] - term1*yr;
+      term5 = term2*yr*zr - rr5*zr*prc5[1];
+      term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1];
+      term7 = rr5*prc5[1] - term2*yr;
+      tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz;
+      depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
+      depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;
+
+      frcx = frcx + depx;
+      frcy = frcy + depy;
+      frcz = frcz + depz;
+
+      // get the dtau/dr terms used for mutual polarization force
+      // poltyp == MUTUAL  && amoeba
+          
+      term1 = bn[2] - usc3*rr5;
+      term2 = bn[3] - usc5*rr7;
+      term3 = usr5 + term1;
+      term4 = rr3 * factor_uscale;
+      term5 = -xr*term3 + rc3[0]*term4;
+      term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
+      tixx = uix*term5 + uir*term6;
+      tkxx = ukx*term5 + ukr*term6;
+
+      term5 = -yr*term3 + rc3[1]*term4;
+      term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
+      tiyy = uiy*term5 + uir*term6;
+      tkyy = uky*term5 + ukr*term6;
+
+      term5 = -zr*term3 + rc3[2]*term4;
+      term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
+      tizz = uiz*term5 + uir*term6;
+      tkzz = ukz*term5 + ukr*term6;
+
+      term4 = -usr5 * yr;
+      term5 = -xr*term1 + rr3*urc3[0];
+      term6 = xr*yr*term2 - rr5*yr*urc5[0];
+      tixy = uix*term4 + uiy*term5 + uir*term6;
+      tkxy = ukx*term4 + uky*term5 + ukr*term6;
+
+      term4 = -usr5 * zr;
+      term6 = xr*zr*term2 - rr5*zr*urc5[0];
+      tixz = uix*term4 + uiz*term5 + uir*term6;
+      tkxz = ukx*term4 + ukz*term5 + ukr*term6;
+
+      term5 = -yr*term1 + rr3*urc3[1];
+      term6 = yr*zr*term2 - rr5*zr*urc5[1];
+      tiyz = uiy*term4 + uiz*term5 + uir*term6;
+      tkyz = uky*term4 + ukz*term5 + ukr*term6;
+
+      depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
+        + tkxx*uixp + tkxy*uiyp + tkxz*uizp;
+      depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
+        + tkxy*uixp + tkyy*uiyp + tkyz*uizp;
+      depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
+        + tkxz*uixp + tkyz*uiyp + tkzz*uizp;
+
+      frcx = frcx + depx;
+      frcy = frcy + depy;
+      frcz = frcz + depz;
+
+      f.x -= frcx;
+      f.y -= frcy;
+      f.z -= frcz;
+
+      if (EVFLAG && vflag) {
+        numtyp vxx = xr * frcx;
+        numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy);
+        numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz);
+        numtyp vyy = yr * frcy;
+        numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz);
+        numtyp vzz = zr * frcz;
+
+        virial[0] += vxx;
+        virial[1] += vyy;
+        virial[2] += vzz;
+        virial[3] += vxy;
+        virial[4] += vxz;
+        virial[5] += vyz;
+      }
+    } // nbor
+    
+  } // ii<inum
+
+  // accumulate ufld and dufld to compute tep
+  store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep);
+
+  // accumulate force, energy and virial
+  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+  //     offset,eflag,vflag,ans,engv);
+  store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
+}
+
+/* ----------------------------------------------------------------------
+   scan the standard neighbor list and re-encode its special-bond flags so
+   that 1-5 neighbors can be marked: a 1-2,1-3,1-4 entry keeps its flag,
+   moved to SBBITS15; an unflagged entry whose tag is found in special15
+   gets flag 4 at SBBITS15; any other IJ entry is left untouched
+------------------------------------------------------------------------- */
+
+__kernel void k_special15(__global int * dev_nbor,
+                          const __global int * dev_packed,
+                          const __global tagint *restrict tag,
+                          const __global int *restrict nspecial15,
+                          const __global tagint *restrict special15,
+                          const int inum, const int nall, const int nbor_pitch,
+                          const int t_per_atom) {
+  int tid, ii, offset, n_stride, i;
+  atom_info(t_per_atom,ii,tid,offset);
+  // only threads with ii < inum have a neighbor row to process
+  if (ii<inum) {
+    // nbor_info is expected to set i, numj, n_stride, nbor and nbor_end
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+    // NOTE(review): indexed with ii (as is special15 below), not i - confirm
+    int n15 = nspecial15[ii];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      // split the packed entry: 2-bit special flag at SBBITS, masked index j
+      int sj=dev_packed[nbor];
+      int which = sj >> SBBITS & 3;
+      int j = sj & NEIGHMASK;
+      tagint jtag = tag[j];
+      // unflagged: look for jtag in up to n15 special15 slots, nall apart
+      if (!which) {
+        int offset=ii; // shadows the outer offset; first slot is special15[ii]
+        for (int k=0; k<n15; k++) {
+          if (special15[offset] == jtag) {
+            which = 4;
+            break;
+          }
+          offset += nall;
+        }
+      }
+      // flagged entries are read from dev_packed but rewritten into dev_nbor
+      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
+    } // for nbor
+
+  } // if ii
+}
+// Copy the neighbors with rsq < off2 into dev_short_nbor, preceded by their count
+__kernel void k_hippo_short_nbor(const __global numtyp4 *restrict x_,
+                                  const __global int * dev_nbor,
+                                  const __global int * dev_packed,
+                                  __global int * dev_short_nbor,
+                                  const numtyp off2,
+                                  const int inum, const int nbor_pitch,
+                                  const int t_per_atom) {
+  __local int n_stride; // NOTE(review): a plain int in k_special15 - confirm
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    // slot m (first entry of this thread) holds the count; kept entries follow
+    int ncount = 0;
+    int m = nbor;
+    dev_short_nbor[m] = 0;
+    int nbor_short = nbor+n_stride;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      // nj keeps the whole packed entry; j is the index left by NEIGHMASK15
+      int j=dev_packed[nbor];
+      int nj = j;
+      j &= NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+
+      // squared distance between the positions ix and jx
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+      // off2 is compared against rsq, so it is treated as a squared cutoff
+      if (rsq<off2) {
+        dev_short_nbor[nbor_short] = nj;
+        nbor_short += n_stride;
+        ncount++;
+      }
+    } // for nbor
+
+    // replace the placeholder with the number of entries this thread kept
+    dev_short_nbor[m] = ncount;
+
+  } // if ii
+}
diff --git a/lib/gpu/lal_hippo.h b/lib/gpu/lal_hippo.h
new file mode 100644
index 0000000000..9fcb11c164
--- /dev/null
+++ b/lib/gpu/lal_hippo.h
@@ -0,0 +1,120 @@
+/***************************************************************************
+                                  hippo.h
+                             -------------------
+                          Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the hippo pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#ifndef LAL_HIPPO_H
+#define LAL_HIPPO_H
+
+#include "lal_base_amoeba.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class Hippo : public BaseAmoeba<numtyp, acctyp> {
+ public:
+  Hippo();
+  ~Hippo();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * NOTE(review): hippo_gpu_init passes hal/repel/disp before mpole - confirm
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const int max_amtype, const int max_amclass,
+           const double *host_pdamp, const double *host_thole,
+           const double *host_dirdamp, const int *host_amtype2class,
+           const double *host_special_mpole,
+           const double *host_special_hal,
+           const double *host_special_repel,
+           const double *host_special_disp,
+           const double *host_special_polar_wscale,
+           const double *host_special_polar_piscale,
+           const double *host_special_polar_pscale,
+           const double *host_csix, const double *host_adisp,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const int maxspecial15, const double cell_size,
+           const double gpu_split, FILE *_screen,
+           const double polar_dscale, const double polar_uscale);
+
+  /// Compute the real-space dispersion term with device neighboring
+  int** compute_dispersion_real(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, int *host_amtype,
+                int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
+                tagint *tag, int **nspecial, tagint **special,
+                int *nspecial15, tagint **special15,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                const double aewald, const double off2_disp, double *charge,
+                double *boxlo, double *prd);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
+  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
+  UCL_D_Vec<numtyp4> coeff_amtype;
+  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
+  UCL_D_Vec<numtyp4> coeff_amclass;
+  /// Special polar values [0-4] (y/z as read by the polar kernel; confirm fill):
+  ///   sp_polar.x = special_polar_wscale
+  ///   sp_polar.y = special_polar_piscale
+  ///   sp_polar.z = special_polar_pscale
+  ///   sp_polar.w = special_mpole
+  UCL_D_Vec<numtyp4> sp_polar;
+  /// Special nonpolar values [0-4]:
+  ///   sp_nonpolar.x = special_hal
+  ///   sp_nonpolar.y = special_repel
+  ///   sp_nonpolar.z = special_disp
+  UCL_D_Vec<numtyp4> sp_nonpolar;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+  /// presumably copies of the polar_dscale/polar_uscale passed to init()
+  numtyp _polar_dscale, _polar_uscale;
+  numtyp _qqrd2e;
+  /// Kernel object for the dispersion term (judging by its name)
+  UCL_Kernel k_dispersion;
+
+ protected:
+  bool _allocated;
+  int dispersion_real(const int eflag, const int vflag);
+  int multipole_real(const int eflag, const int vflag);
+  int udirect2b(const int eflag, const int vflag);
+  int umutual2b(const int eflag, const int vflag);
+  int polar_real(const int eflag, const int vflag);
+
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp
new file mode 100644
index 0000000000..b9e31e7b20
--- /dev/null
+++ b/lib/gpu/lal_hippo_ext.cpp
@@ -0,0 +1,210 @@
+/***************************************************************************
+                                 hippo_ext.cpp
+                             -------------------
+                           Trung Dac Nguyen (Northwestern)
+
+  Functions for LAMMPS access to hippo acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_hippo.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static Hippo<PRECISION,ACC_PRECISION> HIPPOMF;
+
+// ---------------------------------------------------------------------------
+// Initialize the HIPPO accelerator object. World rank 0 runs init() first
+// (its progress text mentions compiling); the other processes then run init()
+// one gpu_rank at a time. gpu_mode receives the device mode and tep_size the
+// size of ACC_PRECISION. Returns the init() status; 0 means success.
+// ---------------------------------------------------------------------------
+int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
+                    const double *host_pdamp, const double *host_thole,
+                    const double *host_dirdamp, const int *host_amtype2class,
+                    const double *host_special_hal,
+                    const double *host_special_repel,
+                    const double *host_special_disp,
+                    const double *host_special_mpole,
+                    const double *host_special_polar_wscale,
+                    const double *host_special_polar_piscale,
+                    const double *host_special_polar_pscale,
+                    const double *host_csix, const double *host_adisp,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const int maxspecial15,
+                    const double cell_size, int &gpu_mode, FILE *screen,
+                    const double polar_dscale, const double polar_uscale,
+                    int& tep_size) {
+  HIPPOMF.clear();
+  gpu_mode=HIPPOMF.device->gpu_mode();
+  const double split=HIPPOMF.device->particle_split();
+  const int gpu_lo=HIPPOMF.device->first_device();
+  const int gpu_hi=HIPPOMF.device->last_device();
+  const int my_world_rank=HIPPOMF.device->world_me();
+  const int my_gpu_rank=HIPPOMF.device->gpu_rank();
+  const int ranks_per_gpu=HIPPOMF.device->procs_per_gpu();
+
+  // the caller learns the byte size of one accumulation-precision value
+  tep_size=sizeof(ACC_PRECISION);
+
+  HIPPOMF.device->init_message(screen,"HIPPO",gpu_lo,gpu_hi);
+
+  // progress text is written only by replica 0 and only when screen is set
+  const bool verbose=(HIPPOMF.device->replica_me()==0 && screen);
+  if (verbose) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  // stage 1: world rank 0 alone, then every process meets at the barrier
+  int status=0;
+  if (my_world_rank==0)
+    status=HIPPOMF.init(ntypes, max_amtype, max_amclass, host_pdamp,
+                        host_thole, host_dirdamp, host_amtype2class,
+                        host_special_hal, host_special_repel,
+                        host_special_disp, host_special_mpole,
+                        host_special_polar_wscale, host_special_polar_piscale,
+                        host_special_polar_pscale, host_csix, host_adisp,
+                        nlocal, nall, max_nbors, maxspecial, maxspecial15,
+                        cell_size, split, screen, polar_dscale, polar_uscale);
+
+  HIPPOMF.device->world_barrier();
+  if (verbose) fprintf(screen,"Done.\n");
+
+  // stage 2: one turn per gpu_rank, with a gpu_barrier closing each turn
+  for (int turn=0; turn<ranks_per_gpu; ++turn) {
+    if (verbose) {
+      if (gpu_lo!=gpu_hi)
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",gpu_lo,
+                gpu_hi,turn);
+      else
+        fprintf(screen,"Initializing GPU %d on core %d...",gpu_lo,turn);
+      fflush(screen);
+    }
+    if (my_world_rank!=0 && my_gpu_rank==turn)
+      status=HIPPOMF.init(ntypes, max_amtype, max_amclass, host_pdamp,
+                          host_thole, host_dirdamp, host_amtype2class,
+                          host_special_hal, host_special_repel,
+                          host_special_disp, host_special_mpole,
+                          host_special_polar_wscale, host_special_polar_piscale,
+                          host_special_polar_pscale, host_csix, host_adisp,
+                          nlocal, nall, max_nbors, maxspecial, maxspecial15,
+                          cell_size, split, screen, polar_dscale, polar_uscale);
+
+    HIPPOMF.device->gpu_barrier();
+    if (verbose) fprintf(screen,"Done.\n");
+  }
+  if (verbose) fprintf(screen,"\n");
+
+  // the overhead estimate is only taken after a successful initialization
+  if (status!=0) return status;
+  HIPPOMF.estimate_gpu_overhead();
+  return 0;
+}
+
+// Clear all host and device data held by the static HIPPO accelerator object
+// by forwarding to its clear() member.
+void hippo_gpu_clear() { HIPPOMF.clear(); }
+
+// Real-space dispersion entry point for LAMMPS: hands every argument, in the
+// same order, to the static HIPPO object and returns whatever it returns.
+int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full,
+    const int nall, double **host_x, int *host_type, int *host_amtype,
+    int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
+    tagint *tag, int **nspecial, tagint **special, int *nspecial15,
+    tagint** special15, const bool eflag, const bool vflag, const bool eatom,
+    const bool vatom, int &host_start, int **ilist, int **jnum,
+    const double cpu_time, bool &success, const double aewald,
+    const double off2, double *host_q, double *boxlo, double *prd) {
+  return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x,
+      host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag,
+      nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom,
+      host_start, ilist, jnum, cpu_time, success, aewald, off2, host_q,
+      boxlo, prd);
+}
+
+// Real-space multipole entry point: forwards all arguments to the HIPPO object.
+int** hippo_gpu_compute_multipole_real(const int ago, const int inum_full,
+    const int nall, double **host_x, int *host_type, int *host_amtype,
+    int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
+    tagint *tag, int **nspecial, tagint **special, int *nspecial15,
+    tagint** special15, const bool eflag, const bool vflag, const bool eatom,
+    const bool vatom, int &host_start, int **ilist, int **jnum,
+    const double cpu_time, bool &success, const double aewald,
+    const double felec, const double off2, double *host_q, double *boxlo,
+    double *prd, void **tep_ptr) {
+  return HIPPOMF.compute_multipole_real(ago, inum_full, nall, host_x,
+      host_type, host_amtype, host_amgroup, host_rpole, sublo, subhi, tag,
+      nspecial, special, nspecial15, special15, eflag, vflag, eatom, vatom,
+      host_start, ilist, jnum, cpu_time, success, aewald, felec, off2,
+      host_q, boxlo, prd, tep_ptr);
+}
+
+// Library entry point: forwards every argument unchanged to HIPPOMF.compute_udirect2b
+int** hippo_gpu_compute_udirect2b(
+    const int ago, const int inum_full, const int nall, double **host_x,
+    int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole,
+    double **host_uind, double **host_uinp, double *sublo, double *subhi,
+    tagint *tag, int **nspecial, tagint **special, int *nspecial15,
+    tagint** special15, const bool eflag, const bool vflag, const bool eatom,
+    const bool vatom, int &host_start, int **ilist, int **jnum,
+    const double cpu_time, bool &success, const double aewald, const double off2,
+    double *host_q, double *boxlo, double *prd, void **fieldp_ptr) {
+  int **result = HIPPOMF.compute_udirect2b(
+      ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup,
+      host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special,
+      nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+      cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
+  return result;
+}
+
+// Library entry point: forwards every argument unchanged to HIPPOMF.compute_umutual2b
+int** hippo_gpu_compute_umutual2b(
+    const int ago, const int inum_full, const int nall, double **host_x,
+    int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole,
+    double **host_uind, double **host_uinp, double *sublo, double *subhi,
+    tagint *tag, int **nspecial, tagint **special, int *nspecial15,
+    tagint** special15, const bool eflag, const bool vflag, const bool eatom,
+    const bool vatom, int &host_start, int **ilist, int **jnum,
+    const double cpu_time, bool &success, const double aewald, const double off2,
+    double *host_q, double *boxlo, double *prd, void **fieldp_ptr) {
+  int **result = HIPPOMF.compute_umutual2b(
+      ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup,
+      host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special,
+      nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+      cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
+  return result;
+}
+
+// Library entry point: forwards every argument unchanged to HIPPOMF.compute_polar_real
+int** hippo_gpu_compute_polar_real(
+    const int ago, const int inum_full, const int nall, double **host_x,
+    int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole,
+    double **host_uind, double **host_uinp, double *sublo, double *subhi,
+    tagint *tag, int **nspecial, tagint **special, int *nspecial15,
+    tagint** special15, const bool eflag, const bool vflag, const bool eatom,
+    const bool vatom, int &host_start, int **ilist, int **jnum,
+    const double cpu_time, bool &success, const double aewald, const double felec,
+    const double off2, double *host_q, double *boxlo, double *prd, void **tep_ptr) {
+  int **result = HIPPOMF.compute_polar_real(
+      ago, inum_full, nall, host_x, host_type, host_amtype, host_amgroup,
+      host_rpole, host_uind, host_uinp, sublo, subhi, tag, nspecial, special,
+      nspecial15, special15, eflag, vflag, eatom, vatom, host_start, ilist, jnum,
+      cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
+  return result;
+}
+
+double hippo_gpu_bytes() {   // library entry point: returns HIPPOMF.host_memory_usage()
+  return HIPPOMF.host_memory_usage();
+}
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index 4894ac6203..91bc679447 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -65,17 +65,6 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclas
                     const double polar_dscale, const double polar_uscale, int& tq_size);
 void amoeba_gpu_clear();
 
-int** amoeba_gpu_compute_dispersion_real(const int ago, const int inum_full,
-                           const int nall, double **host_x, int *host_type,
-                           int *host_amtype, int *host_amgroup, double **host_rpole,
-                           double *sublo, double *subhi, tagint *tag, int **nspecial,
-                           tagint **special, int *nspecial15, tagint** special15,
-                           const bool eflag, const bool vflag, const bool eatom,
-                           const bool vatom, int &host_start,
-                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double aewald, const double off2,
-                           double *host_q, double *boxlo, double *prd);
-
 int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
               double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
               double **host_rpole, double *sublo, double *subhi, tagint *tag,
@@ -128,9 +117,9 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   fieldp_pinned = nullptr;
   tq_pinned = nullptr;
 
-  gpu_hal_ready = false;
-  gpu_repulsion_ready = false;         // true for HIPPO
-  gpu_dispersion_real_ready = false;   // true for HIPPO
+  gpu_hal_ready = false;               // true for AMOEBA when ready
+  gpu_repulsion_ready = false;         // always false for AMOEBA
+  gpu_dispersion_real_ready = false;   // always false for AMOEBA
   gpu_multipole_real_ready = true;
   gpu_udirect2b_ready = true;
   gpu_umutual2b_ready = true;
@@ -205,54 +194,6 @@ void PairAmoebaGPU::init_style()
 
 /* ---------------------------------------------------------------------- */
 
-void PairAmoebaGPU::dispersion_real()
-{
-  if (!gpu_dispersion_real_ready) {
-    PairAmoeba::dispersion_real();
-    return;
-  }
-
-  int eflag=1, vflag=1;
-  int nall = atom->nlocal + atom->nghost;
-  int inum, host_start;
-
-  bool success = true;
-  int *ilist, *numneigh, **firstneigh;
-  
-  double sublo[3],subhi[3];
-  if (domain->triclinic == 0) {
-    sublo[0] = domain->sublo[0];
-    sublo[1] = domain->sublo[1];
-    sublo[2] = domain->sublo[2];
-    subhi[0] = domain->subhi[0];
-    subhi[1] = domain->subhi[1];
-    subhi[2] = domain->subhi[2];
-  } else {
-    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
-  }
-  inum = atom->nlocal;
-
-  // select the correct cutoff for the term
-
-  if (use_dewald) choose(DISP_LONG);
-  else choose(DISP);
-
-  firstneigh = amoeba_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x,
-                                                 atom->type, amtype, amgroup, rpole,
-                                                 sublo, subhi, atom->tag,
-                                                 atom->nspecial, atom->special,
-                                                 atom->nspecial15, atom->special15,
-                                                 eflag, vflag, eflag_atom, vflag_atom,
-                                                 host_start, &ilist, &numneigh, cpu_time,
-                                                 success, aewald, off2, atom->q,
-                                                 domain->boxlo, domain->prd);
-  
-  if (!success)
-    error->one(FLERR,"Insufficient memory on accelerator");
-}
-
-/* ---------------------------------------------------------------------- */
-
 void PairAmoebaGPU::multipole_real()
 {
   if (!gpu_multipole_real_ready) {
diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h
index de17703dc7..e0210faa68 100644
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@@ -35,7 +35,7 @@ class PairAmoebaGPU : public PairAmoeba {
 
   virtual void induce();
 
-  virtual void dispersion_real();
+  // dispersion_real() is not overridden here, so the PairAmoeba version is used
   virtual void multipole_real();
   virtual void udirect2b(double **, double **);
   virtual void umutual2b(double **, double **);
diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp
new file mode 100644
index 0000000000..ce0051962b
--- /dev/null
+++ b/src/GPU/pair_hippo_gpu.cpp
@@ -0,0 +1,1175 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (Northwestern)
+------------------------------------------------------------------------- */
+
+#include "pair_hippo_gpu.h"
+
+#include "amoeba_convolution.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "fix_store.h"
+#include "force.h"
+#include "gpu_extra.h"
+#include "math_const.h"
+#include "memory.h"
+#include "my_page.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "suffix.h"
+#include <cmath>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+enum{INDUCE,RSD,SETUP_hippo,SETUP_HIPPO,KMPOLE,AMGROUP};   // forward comm (cfstyle); NOTE(review): SETUP_hippo looks like a search/replace artifact - confirm
+enum{FIELD,ZRSD,TORQUE,UFLD};                               // reverse comm (crstyle)
+enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};   // arguments to choose()
+enum{MUTUAL,OPT,TCG,DIRECT};                                // values compared against poltyp
+enum{GEAR,ASPC,LSQR};
+enum{BUILD,APPLY};                                          // modes passed to uscale0b()
+enum{GORDON1,GORDON2};
+
+#define DEBYE 4.80321    // conversion factor from q-Angs (real units) to Debye
+
+// External functions from the GPU library (lib/gpu) for atom decomposition; keep in sync with the library's definitions
+
+int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
+                    const double *host_pdamp, const double *host_thole,
+                    const double *host_dirdamp, const int* host_amtype2class,
+                    const double *host_special_hal, const double *host_special_repel,
+                    const double *host_special_disp, const double *host_special_mpole,
+                    const double *host_special_polar_wscale,
+                    const double *host_special_polar_piscale,
+                    const double *host_special_polar_pscale,
+                    const double *host_csix, const double *host_adisp,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const int maxspecial15,
+                    const double cell_size, int &gpu_mode, FILE *screen,
+                    const double polar_dscale, const double polar_uscale, int& tq_size);
+void hippo_gpu_clear();
+
+int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           int *host_amtype, int *host_amgroup, double **host_rpole,
+                           double *sublo, double *subhi, tagint *tag, int **nspecial,
+                           tagint **special, int *nspecial15, tagint** special15,
+                           const bool eflag, const bool vflag, const bool eatom,
+                           const bool vatom, int &host_start,
+                           int **ilist, int **jnum, const double cpu_time,
+                           bool &success, const double aewald, const double off2,
+                           double *host_q, double *boxlo, double *prd);
+
+int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
+              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+              double **host_rpole, double *sublo, double *subhi, tagint *tag,
+              int **nspecial, tagint **special, int* nspecial15, tagint** special15,
+              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
+              int &host_start, int **ilist, int **jnum, const double cpu_time,
+              bool &success, const double aewald, const double felec, const double off2,
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);
+
+int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
+              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+              double **host_rpole, double **host_uind, double **host_uinp, 
+              double *sublo, double *subhi, tagint *tag, int **nspecial,
+              tagint **special, int* nspecial15, tagint** special15,
+              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
+              int &host_start, int **ilist, int **jnum, const double cpu_time,
+              bool &success, const double aewald, const double off2, double *host_q,
+              double *boxlo, double *prd, void **fieldp_ptr);
+
+int ** hippo_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
+              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+              double **host_rpole, double **host_uind, double **host_uinp, 
+              double *sublo, double *subhi, tagint *tag, int **nspecial,
+              tagint **special, int* nspecial15, tagint** special15,
+              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
+              int &host_start, int **ilist, int **jnum, const double cpu_time,
+              bool &success, const double aewald, const double off2, double *host_q,
+              double *boxlo, double *prd, void **fieldp_ptr);
+
+int ** hippo_gpu_compute_polar_real(const int ago, const int inum, const int nall,
+              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+              double **host_rpole, double **host_uind, double **host_uinp,
+              double *sublo, double *subhi, tagint *tag, int **nspecial,
+              tagint **special, int* nspecial15, tagint** special15,
+              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
+              int &host_start, int **ilist, int **jnum, const double cpu_time,
+              bool &success, const double aewald, const double felec, const double off2,
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);
+
+double hippo_gpu_bytes();
+
+/* ---------------------------------------------------------------------- */
+
+PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
+{
+  suffix_flag |= Suffix::GPU;
+  reinitflag = 0;
+  respa_enable = 0;
+  cpu_time = 0.0;
+  tq_pinned = nullptr;
+  fieldp_pinned = nullptr;
+
+  // per-term GPU switches (false => PairAmoeba fallback, see multipole_real())
+  gpu_multipole_real_ready = true;
+  gpu_udirect2b_ready = true;
+  gpu_umutual2b_ready = true;
+  gpu_polar_real_ready = true;
+  gpu_hal_ready = false;               // stays false for HIPPO
+  gpu_repulsion_ready = false;         // becomes true for HIPPO once ready
+  gpu_dispersion_real_ready = false;   // becomes true for HIPPO once ready
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairHippoGPU::~PairHippoGPU()
+{
+  hippo_gpu_clear();   // presumably releases the GPU library's data for this style - confirm in lib/gpu
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::init_style()
+{
+  PairAmoeba::init_style();
+
+  // cutsq is normally filled in after init_style() returns, so compute it
+  // here and keep track of the largest squared cutoff
+
+  const int ntypes = atom->ntypes;
+  double maxcut = -1.0;
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      const bool pair_is_set = (setflag[i][j] != 0) ||
+        (setflag[i][i] != 0 && setflag[j][j] != 0);
+      double cut = 0.0;
+      if (pair_is_set) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut) maxcut = cut;
+      }
+      cutsq[i][j] = cutsq[j][i] = cut;
+    }
+  }
+
+  // cell size handed to the GPU library: largest cutoff plus the skin
+  const double cell_size = sqrt(maxcut) + neighbor->skin;
+
+  // special-neighbor limits are only taken from Atom for molecular systems
+
+  const bool molecular = (atom->molecular != Atom::ATOMIC);
+  const int maxspecial = molecular ? atom->maxspecial : 0;
+  const int maxspecial15 = molecular ? atom->maxspecial15 : 0;
+
+  int tq_size;
+  const int mnf = 5e-2 * neighbor->oneatom;
+  int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
+                                pdamp, thole, dirdamp, amtype2class, special_hal,
+                                special_repel, special_disp, special_mpole,
+                                special_polar_wscale, special_polar_piscale,
+                                special_polar_pscale, csix, adisp, atom->nlocal,
+                                atom->nlocal+atom->nghost, mnf, maxspecial,
+                                maxspecial15, cell_size, gpu_mode, screen,
+                                polar_dscale, polar_uscale, tq_size);
+  GPU_EXTRA::check_flag(success,error,world);
+
+  if (gpu_mode == GPU_FORCE)
+    error->all(FLERR,"Pair style hippo/gpu does not support neigh no for now");
+
+  // any tq_size other than sizeof(double) is treated as single precision
+  tq_single = (tq_size != sizeof(double));
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHippoGPU::dispersion_real()
+{
+  // run the PairAmoeba implementation unless the GPU version of this
+  // term has been flagged as ready
+
+  if (!gpu_dispersion_real_ready) {
+    PairAmoeba::dispersion_real();
+    return;
+  }
+
+  // sub-domain bounds; for triclinic boxes they come from bbox()
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic) {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  } else {
+    for (int d = 0; d < 3; d++) {
+      sublo[d] = domain->sublo[d];
+      subhi[d] = domain->subhi[d];
+    }
+  }
+
+  // select the correct cutoff for the term (long-range set with use_dewald)
+
+  choose(use_dewald ? DISP_LONG : DISP);
+
+  // eflag and vflag are hard-wired to 1 for the library call
+
+  const int eflag = 1, vflag = 1;
+  const int inum = atom->nlocal;
+  const int nall = atom->nlocal + atom->nghost;
+  int host_start;
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  firstneigh = hippo_gpu_compute_dispersion_real(neighbor->ago, inum, nall, atom->x,
+                   atom->type, amtype, amgroup, rpole, sublo, subhi, atom->tag,
+                   atom->nspecial, atom->special, atom->nspecial15, atom->special15,
+                   eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist,
+                   &numneigh, cpu_time, success, aewald, off2, atom->q,
+                   domain->boxlo, domain->prd);
+
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHippoGPU::multipole_real()
+{
+  // run the PairAmoeba implementation when the GPU term is not flagged as ready
+
+  if (!gpu_multipole_real_ready) {
+    PairAmoeba::multipole_real();
+    return;
+  }
+
+  // sub-domain bounds; for triclinic boxes they come from bbox()
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic) {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  } else {
+    for (int d = 0; d < 3; d++) {
+      sublo[d] = domain->sublo[d];
+      subhi[d] = domain->subhi[d];
+    }
+  }
+
+  // select the correct cutoff for the term (long-range set with use_ewald)
+
+  choose(use_ewald ? MPOLE_LONG : MPOLE);
+
+  // energy unit conversion factor for the multipolar real-space calculation
+
+  const double felec = electric / am_dielectric;
+
+  // eflag and vflag are hard-wired to 1 for the library call
+
+  const int eflag = 1, vflag = 1;
+  const int inum = atom->nlocal;
+  const int nall = atom->nlocal + atom->nghost;
+  int host_start;
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  firstneigh = hippo_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x,
+                   atom->type, amtype, amgroup, rpole, sublo, subhi, atom->tag,
+                   atom->nspecial, atom->special, atom->nspecial15, atom->special15,
+                   eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist,
+                   &numneigh, cpu_time, success, aewald, felec, off2, atom->q,
+                   domain->boxlo, domain->prd, &tq_pinned);
+
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // tq_pinned now references the tep array from the GPU lib; read it with
+  // the precision selected by tq_single
+
+  if (tq_single) {
+    float *tep = static_cast<float *>(tq_pinned);
+    compute_force_from_torque<float>(tep, fmpole, virmpole);
+  } else {
+    double *tep = static_cast<double *>(tq_pinned);
+    compute_force_from_torque<double>(tep, fmpole, virmpole);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   induce = induced dipole moments (uind,uinp) via pre-conditioned CG solver
+   adapted from Tinker induce0a() routine; FIELD reverse comm only if GPU terms are off
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::induce()
+{
+  bool done;
+  int i,j,m,ii,itype;
+  int iter,maxiter;
+  double polmin;
+  double eps,epsold;
+  double epsd,epsp;
+  double udsum,upsum;
+  double a,ap,b,bp;
+  double sum,sump,term;
+  double reduce[4],allreduce[4];
+
+  double *poli;
+  double **conj,**conjp;
+  double **vec,**vecp;
+  double **udir,**usum,**usump;
+
+  int debug = 1;   // NOTE(review): not referenced anywhere in this function
+
+  // set cutoffs, taper coeffs, and PME params
+  // create qfac here, free at end of polar()
+  
+  if (use_ewald) {
+    choose(POLAR_LONG);
+    int nmine = p_kspace->nfft_owned;
+    memory->create(qfac,nmine,"ameoba/induce:qfac");
+  } else choose(POLAR);
+
+  // owned atoms
+
+  double **x = atom->x;   // NOTE(review): x and f are not referenced below
+  double **f = atom->f;
+  int nlocal = atom->nlocal;
+
+  // zero out the induced dipoles at each site
+
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < 3; j++) {
+      uind[i][j] = 0.0;
+      uinp[i][j] = 0.0;
+    }
+  }
+
+  // allocation of arrays
+  // NOTE: not all are used by all methods
+  // NOTE: could be re-allocated dynamically
+
+  memory->create(poli,nlocal,"ameoba/induce:poli");
+  memory->create(conj,nlocal,3,"ameoba/induce:conj");
+  memory->create(conjp,nlocal,3,"ameoba/induce:conjp");
+  memory->create(vec,nlocal,3,"ameoba/induce:vec");
+  memory->create(vecp,nlocal,3,"ameoba/induce:vecp");
+  memory->create(udir,nlocal,3,"ameoba/induce:udir");
+  memory->create(usum,nlocal,3,"ameoba/induce:usum");
+  memory->create(usump,nlocal,3,"ameoba/induce:usump");
+
+  // get the electrostatic field due to permanent multipoles
+  
+  dfield0c(field,fieldp);
+
+  // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
+
+  if (!gpu_udirect2b_ready) {
+    crstyle = FIELD;
+    comm->reverse_comm_pair(this);
+  }
+
+  // set induced dipoles to polarizability times direct field
+
+  for (i = 0; i < nlocal; i++) {
+    itype = amtype[i];
+    for (j = 0; j < 3; j++) {
+      udir[i][j] = polarity[itype] * field[i][j];
+      udirp[i][j] = polarity[itype] * fieldp[i][j];
+      if (pcgguess) {
+        uind[i][j] = udir[i][j];
+        uinp[i][j] = udirp[i][j];
+      }
+    }
+  }
+/*
+  printf("GPU: cutghost = %f\n", comm->cutghost[0]);
+  for (i = 0; i < 10; i++) {
+    printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n",
+      i, udir[i][0], udir[i][1], udir[i][2],
+      udirp[i][0], udirp[i][1], udirp[i][2]); 
+  }
+*/
+  // get induced dipoles via the OPT extrapolation method
+  // NOTE: any way to rewrite these loops to avoid allocating
+  //       uopt,uoptp with a optorder+1 dimension, just optorder ??
+  //       since no need to store optorder+1 values after these loops
+
+  if (poltyp == OPT) { 
+    for (i = 0; i < nlocal; i++) {
+      for (j = 0; j < 3; j++) {
+        uopt[i][0][j] = udir[i][j];
+        uoptp[i][0][j] = udirp[i][j];
+      }
+    }
+
+    for (m = 1; m <= optorder; m++) {
+      optlevel = m - 1;     // used in umutual1() for fopt,foptp
+
+      cfstyle = INDUCE;
+      comm->forward_comm_pair(this);
+
+      ufield0c(field,fieldp);
+
+      if (!gpu_umutual2b_ready) {
+        crstyle = FIELD;
+        comm->reverse_comm_pair(this);
+      }
+
+      for (i = 0; i < nlocal; i++) {
+	      itype = amtype[i];
+        for (j = 0; j < 3; j++) {
+          uopt[i][m][j] = polarity[itype] * field[i][j];
+          uoptp[i][m][j] = polarity[itype] * fieldp[i][j];
+          uind[i][j] = uopt[i][m][j];
+          uinp[i][j] = uoptp[i][m][j];
+        }
+      }
+    }
+
+    for (i = 0; i < nlocal; i++) {
+      for (j = 0; j < 3; j++) {
+        uind[i][j] = 0.0;
+        uinp[i][j] = 0.0;
+        usum[i][j] = 0.0;
+        usump[i][j] = 0.0;
+        for (m = 0; m <= optorder; m++) {
+          usum[i][j] += uopt[i][m][j];
+          usump[i][j] += uoptp[i][m][j];
+          uind[i][j] += copt[m]*usum[i][j];
+          uinp[i][j] += copt[m]*usump[i][j];
+        }
+      }
+    }
+  }
+
+  // set tolerances for computation of mutual induced dipoles
+
+  if (poltyp == MUTUAL) {
+    done = false;
+    maxiter = 100;   // only tested by the warning after the loop; the loop itself stops at politer
+    iter = 0;
+    polmin = 0.00000001;
+    eps = 100.0;
+
+    // estimate induced dipoles using a polynomial predictor
+
+    if (use_pred && nualt == maxualt) {
+      ulspred();
+
+      double ***udalt = fixudalt->tstore;
+      double ***upalt = fixupalt->tstore;
+
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          udsum = 0.0;
+          upsum = 0.0;
+          for (m = 0; m < nualt; m++) {
+            udsum += bpred[m]*udalt[i][m][j];
+            upsum += bpredp[m]*upalt[i][m][j];
+          }
+          uind[i][j] = udsum;
+          uinp[i][j] = upsum;
+        }
+      }
+    }
+
+    // estimate induced dipoles via inertial extended Lagrangian
+    // not supported for now
+    // requires uaux,upaux to persist with each atom
+    // also requires a velocity vector(s) to persist
+    // also requires updating uaux,upaux in the Verlet integration
+
+    /*
+    if (use_ielscf) {
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          uind[i][j] = uaux[i][j];
+          uinp[i][j] = upaux[i][j];
+        }
+      }
+    }
+    */
+
+    // get the electrostatic field due to induced dipoles
+
+    cfstyle = INDUCE;
+    comm->forward_comm_pair(this);
+
+    ufield0c(field,fieldp);
+
+    if (!gpu_umutual2b_ready) {
+      crstyle = FIELD;
+      comm->reverse_comm_pair(this);
+    }
+    
+    //error->all(FLERR,"STOP GPU");
+
+    // set initial conjugate gradient residual and conjugate vector
+
+    for (i = 0; i < nlocal; i++) {
+      itype = amtype[i];
+
+      poli[i] = MAX(polmin,polarity[itype]);   // floor at polmin so the divisions below are safe
+      for (j = 0; j < 3; j++) {
+        if (pcgguess) {
+          rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j];
+          rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j];
+        } else {
+          rsd[i][j] = udir[i][j] / poli[i];
+          rsdp[i][j] = udirp[i][j] / poli[i];
+        }
+        zrsd[i][j] = rsd[i][j];
+        zrsdp[i][j] = rsdp[i][j];
+      }
+    }
+
+    if (pcgprec) {
+      cfstyle = RSD;
+      comm->forward_comm_pair(this);
+      uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp);
+      uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+      crstyle = ZRSD;
+      comm->reverse_comm_pair(this);
+   }
+
+    for (i = 0; i < nlocal; i++) {
+      for (j = 0; j < 3; j++) {
+        conj[i][j] = zrsd[i][j];
+        conjp[i][j] = zrsdp[i][j];
+      }
+    }
+
+    // conjugate gradient iteration of the mutual induced dipoles
+
+    while (!done) {
+      iter++;
+
+      // stash uind,uinp in vec,vecp and load conj,conjp so ufield0c() acts on the search direction
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          vec[i][j] = uind[i][j];
+          vecp[i][j] = uinp[i][j];
+          uind[i][j] = conj[i][j];
+          uinp[i][j] = conjp[i][j];
+        }
+      }
+
+      cfstyle = INDUCE;
+      comm->forward_comm_pair(this);
+
+      ufield0c(field,fieldp);
+
+      if (!gpu_umutual2b_ready) {
+        crstyle = FIELD;
+        comm->reverse_comm_pair(this);
+      }
+
+      //error->all(FLERR,"STOP");
+
+      // restore uind,uinp; vec,vecp now hold conj/poli minus the field
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          uind[i][j] = vec[i][j];
+          uinp[i][j] = vecp[i][j];
+          vec[i][j] = conj[i][j]/poli[i] - field[i][j];
+          vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j];
+        }
+      }
+
+      a = 0.0;
+      ap = 0.0;
+      sum = 0.0;
+      sump = 0.0;
+
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          a += conj[i][j]*vec[i][j];
+          ap += conjp[i][j]*vecp[i][j];
+          sum += rsd[i][j]*zrsd[i][j];
+          sump += rsdp[i][j]*zrsdp[i][j];
+        }
+      }
+
+      reduce[0] = a;
+      reduce[1] = ap;
+      reduce[2] = sum;
+      reduce[3] = sump;
+      MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world);
+      a = allreduce[0];
+      ap = allreduce[1];
+      sum = allreduce[2];
+      sump = allreduce[3];
+
+      if (a != 0.0) a = sum / a;      // a,ap stay 0 when the denominator is exactly 0
+      if (ap != 0.0) ap = sump / ap;
+
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          uind[i][j] = uind[i][j] + a*conj[i][j];
+          uinp[i][j] = uinp[i][j] + ap*conjp[i][j];
+          rsd[i][j] = rsd[i][j] - a*vec[i][j];
+          rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j];
+          zrsd[i][j] = rsd[i][j];
+          zrsdp[i][j] = rsdp[i][j];
+        }
+      }
+
+      if (pcgprec) {
+        cfstyle = RSD;
+        comm->forward_comm_pair(this);
+        uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+        crstyle = ZRSD;
+        comm->reverse_comm_pair(this);
+      }
+
+      b = 0.0;
+      bp = 0.0;
+
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          b += rsd[i][j]*zrsd[i][j];
+          bp += rsdp[i][j]*zrsdp[i][j];
+        }
+      }
+
+      // NOTE: comp of b,bp and allreduce only needed if pcgprec ?
+
+      reduce[0] = b;
+      reduce[1] = bp;
+      MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world);   // count is 4, but only entries 0,1 are read
+      b = allreduce[0];
+      bp = allreduce[1];
+
+      if (sum != 0.0) b /= sum;
+      if (sump != 0.0) bp /= sump;
+
+      epsd = 0.0;
+      epsp = 0.0;
+
+      for (i = 0; i < nlocal; i++) {
+        for (j = 0; j < 3; j++) {
+          conj[i][j] = zrsd[i][j] + b*conj[i][j];
+          conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j];
+          epsd += rsd[i][j]*rsd[i][j];
+          epsp += rsdp[i][j]*rsdp[i][j];
+        }
+      }
+
+      reduce[0] = epsd;
+      reduce[1] = epsp;
+      MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world);   // count is 4, but only entries 0,1 are read
+      epsd = allreduce[0];
+      epsp = allreduce[1];
+
+      // check the convergence of the mutual induced dipoles
+
+      epsold = eps;
+      eps = MAX(epsd,epsp);
+      eps = DEBYE * sqrt(eps/atom->natoms);
+
+      if (eps < poleps) done = true;
+      if (eps > epsold) done = true;     // residual measure grew since the previous iteration
+      if (iter >= politer) done = true;
+
+      //  apply a "peek" iteration to the mutual induced dipoles
+     
+      if (done) {
+        for (i = 0; i < nlocal; i++) {
+          term = pcgpeek * poli[i];
+          for (j = 0; j < 3; j++) {
+            uind[i][j] += term*rsd[i][j];
+            uinp[i][j] += term*rsdp[i][j];
+          }
+        }
+      }
+
+    }
+
+    // terminate the calculation if dipoles failed to converge
+    // NOTE: could make this an error
+    
+    if (iter >= maxiter || eps > epsold)
+      if (me == 0)
+	      error->warning(FLERR,"hippo induced dipoles did not converge");
+  }
+
+  // DEBUG output to dump file
+
+  if (uind_flag) 
+    dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp);
+
+  // deallocation of arrays
+
+  memory->destroy(poli);
+  memory->destroy(conj);
+  memory->destroy(conjp);
+  memory->destroy(vec);
+  memory->destroy(vecp);
+  memory->destroy(udir);
+  memory->destroy(usum);
+  memory->destroy(usump);
+
+  // update the lists of previous induced dipole values
+  // shift previous m values up to m+1, add new values at m = 0
+  // only when the predictor (use_pred) is used
+
+  if (use_pred) {
+    double ***udalt = fixudalt->tstore;
+    double ***upalt = fixupalt->tstore;
+
+    nualt = MIN(nualt+1,maxualt);
+    for (i = 0; i < nlocal; i++) {
+      for (j = 0; j < 3; j++) {
+        for (m = nualt-1; m > 0; m--) {
+          udalt[i][m][j] = udalt[i][m-1][j];
+          upalt[i][m][j] = upalt[i][m-1][j];
+        }
+        udalt[i][0][j] = uind[i][j];
+        upalt[i][0][j] = uinp[i][j];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   udirect2b = Ewald real direct field via list
+   udirect2b computes the real space contribution of the permanent
+   atomic multipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::udirect2b(double **field, double **fieldp)
+{
+  // the CPU implementation in the base class handles this term
+  //   whenever the GPU kernel has not been flagged as ready
+
+  if (!gpu_udirect2b_ready) {
+    PairAmoeba::udirect2b(field, fieldp);
+    return;
+  }
+
+  // energy and virial flags are always switched on for the GPU library call
+
+  int eflag = 1, vflag = 1;
+  const int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int inum = nlocal;
+  int host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  // bounds of this sub-domain: copied directly for orthogonal boxes,
+  //   taken from the bounding box of the lamda bounds for triclinic boxes
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic) {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  } else {
+    for (int d = 0; d < 3; d++) {
+      sublo[d] = domain->sublo[d];
+      subhi[d] = domain->subhi[d];
+    }
+  }
+
+  // select the correct cutoff (off2) for the term
+
+  if (use_ewald) choose(POLAR_LONG);
+  else choose(POLAR);
+
+  firstneigh =
+    hippo_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x, atom->type,
+                                amtype, amgroup, rpole, uind, uinp, sublo, subhi,
+                                atom->tag, atom->nspecial, atom->special,
+                                atom->nspecial15, atom->special15,
+                                eflag, vflag, eflag_atom, vflag_atom,
+                                host_start, &ilist, &numneigh, cpu_time,
+                                success, aewald, off2, atom->q,
+                                domain->boxlo, domain->prd, &fieldp_pinned);
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // NOTE: the dipole-dipole pair list is not rebuilt here by udirect2b_cpu(),
+  //   for the moment the tdipdip values are computed just in time in umutual2b()
+
+  // accumulate the field and fieldp values from the GPU lib
+  //   field and fieldp may already have some nonzero values from kspace (udirect1)
+  // the pinned buffer holds 4 values per atom (3 are used): the inum
+  //   entries for field are followed by the inum entries for fieldp
+  // NOTE(review): always read as double, unlike tq_pinned - confirm precision
+
+  const double *gpu_field = (double *) fieldp_pinned;
+  const double *gpu_fieldp = gpu_field + 4*inum;
+
+  for (int i = 0; i < nlocal; i++) {
+    const double *fsrc = gpu_field + 4*i;
+    const double *fpsrc = gpu_fieldp + 4*i;
+    for (int d = 0; d < 3; d++) {
+      field[i][d] += fsrc[d];
+      fieldp[i][d] += fpsrc[d];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   udirect2b_cpu = CPU rebuild of the dipole-dipole pair list
+   udirect2b_cpu stores, per atom, the neighbors within the cutoff and the
+     pairwise dipole-dipole terms needed later for mutual polarization
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::udirect2b_cpu()
+{
+  // rebuilds, for every atom in the neighbor list, its list of neighbors
+  //   within the off2 cutoff plus 6 tdipdip values (xx,xy,xz,yy,yz,zz)
+  //   per stored pair; pairs are only stored if poltyp != DIRECT
+  // results go to firstneigh_dipole, firstneigh_dipdip and numneigh_dipole
+  // no field values are accumulated here
+
+  int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup;
+  double xr,yr,zr,r,r2;
+  double rr1,rr2,rr3,rr5;
+  double bfac,exp2a;
+  double ralpha,aefac;
+  double aesq2,aesq2n;
+  double pdi,pti,pgamma;
+  double damp,expdamp;
+  double scale3,scale5,scalek;
+  double bn[4],bcn[3];
+  double factor_uscale;
+
+  int inum,jnum;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+
+  // neigh list
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // NOTE: doesn't this have a problem if aewald is tiny ??
+
+  aesq2 = 2.0 * aewald * aewald;
+  aesq2n = 0.0;
+  if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald);
+
+  // rebuild dipole-dipole pair list and store pairwise dipole matrices
+  // done one atom at a time in real-space double loop over atoms & neighs
+
+  int *neighptr;
+  double *tdipdip;
+
+  // compute the real space portion of the Ewald summation
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    itype = amtype[i];
+    igroup = amgroup[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // storage for the neighbor indices and tdipdip values of atom i
+
+    n = ndip = 0;
+    neighptr = ipage_dipole->vget();
+    tdipdip = dpage_dipdip->vget();
+
+    pdi = pdamp[itype];
+    pti = thole[itype];
+
+    // evaluate all sites within the cutoff distance
+
+    for (jj = 0; jj < jnum; jj++) {
+      jextra = jlist[jj];
+      j = jextra & NEIGHMASK15;
+
+      xr = x[j][0] - x[i][0];
+      yr = x[j][1] - x[i][1];
+      zr = x[j][2] - x[i][2];
+      r2 = xr*xr + yr*yr + zr*zr;
+      if (r2 > off2) continue;
+
+      jtype = amtype[j];
+      jgroup = amgroup[j];
+
+      // only the u-scale factor enters the stored terms: polar_uscale for
+      //   atoms with the same amgroup value, no scaling otherwise
+
+      factor_uscale = (igroup == jgroup) ? polar_uscale : 1.0;
+
+      r = sqrt(r2);
+      rr1 = 1.0 / r;
+      rr2 = rr1 * rr1;
+      rr3 = rr2 * rr1;
+      rr5 = 3.0 * rr2 * rr3;
+
+      // calculate the real space Ewald error function terms
+
+      ralpha = aewald * r;
+      bn[0] = erfc(ralpha) * rr1;
+      exp2a = exp(-ralpha*ralpha);
+      aefac = aesq2n;
+      for (m = 1; m <= 3; m++) {
+        bfac = m+m-1;
+        aefac = aesq2 * aefac;
+        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2;
+      }
+
+      // find terms needed later to compute mutual polarization
+
+      if (poltyp != DIRECT) {
+        scale3 = 1.0;
+        scale5 = 1.0;
+        damp = pdi * pdamp[jtype];
+        if (damp != 0.0) {
+          pgamma = MIN(pti,thole[jtype]);
+          damp = pgamma * pow(r/damp,3.0);
+          if (damp < 50.0) {
+            expdamp = exp(-damp);
+            scale3 = 1.0 - expdamp;
+            scale5 = 1.0 - expdamp*(1.0+damp);
+          }
+        }
+        scalek = factor_uscale;
+        bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3;
+        bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5;
+
+        neighptr[n++] = j;
+        tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr;
+        tdipdip[ndip++] = bcn[1]*xr*yr;
+        tdipdip[ndip++] = bcn[1]*xr*zr;
+        tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr;
+        tdipdip[ndip++] = bcn[1]*yr*zr;
+        tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr;
+      } else {   // NOTE: debug output, one line per pair in cutoff on rank 0
+        if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j);
+      }
+
+    } // jj
+
+    firstneigh_dipole[i] = neighptr;
+    firstneigh_dipdip[i] = tdipdip;
+    numneigh_dipole[i] = n;
+    ipage_dipole->vgot(n);
+    dpage_dipdip->vgot(ndip);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   umutual2b = Ewald real mutual field via list
+   umutual2b computes the real space contribution of the induced
+   atomic dipole moments to the field via a neighbor list
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::umutual2b(double **field, double **fieldp)
+{
+  // fall back to the base class CPU version until the GPU kernel is ready
+
+  if (!gpu_umutual2b_ready) {
+    PairAmoeba::umutual2b(field, fieldp);
+    return;
+  }
+
+  int eflag = 1, vflag = 1;   // always on for the GPU library call
+  const int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  int inum = nlocal;
+  int host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  // bounds of this sub-domain: copied directly for orthogonal boxes,
+  //   taken from the bounding box of the lamda bounds for triclinic boxes
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic) {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  } else {
+    for (int d = 0; d < 3; d++) {
+      sublo[d] = domain->sublo[d];
+      subhi[d] = domain->subhi[d];
+    }
+  }
+
+  // select the correct cutoff (off2) for the term
+
+  if (use_ewald) choose(POLAR_LONG);
+  else choose(POLAR);
+
+  firstneigh =
+    hippo_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x, atom->type,
+                                amtype, amgroup, rpole, uind, uinp, sublo, subhi,
+                                atom->tag, atom->nspecial, atom->special,
+                                atom->nspecial15, atom->special15,
+                                eflag, vflag, eflag_atom, vflag_atom,
+                                host_start, &ilist, &numneigh, cpu_time,
+                                success, aewald, off2, atom->q,
+                                domain->boxlo, domain->prd, &fieldp_pinned);
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // accumulate the field and fieldp values from the GPU lib
+  //   field and fieldp may already have some nonzero values from kspace (umutual1)
+  // the pinned buffer holds 4 values per atom (3 are used): the inum
+  //   entries for field are followed by the inum entries for fieldp
+  // NOTE(review): always read as double, unlike tq_pinned - confirm precision
+
+  const double *gpu_field = (double *) fieldp_pinned;
+  const double *gpu_fieldp = gpu_field + 4*inum;
+
+  for (int i = 0; i < nlocal; i++) {
+    const double *fsrc = gpu_field + 4*i;
+    const double *fpsrc = gpu_fieldp + 4*i;
+    for (int d = 0; d < 3; d++) {
+      field[i][d] += fsrc[d];
+      fieldp[i][d] += fpsrc[d];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHippoGPU::polar_real()
+{
+  // fall back to the base class CPU version until the GPU kernel is ready
+
+  if (!gpu_polar_real_ready) {
+    PairAmoeba::polar_real();
+    return;
+  }
+
+  int eflag = 1, vflag = 1;   // always on for the GPU library call
+  int inum = atom->nlocal;
+  int nall = inum + atom->nghost;
+  int host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+
+  // sub-domain bounds, taken from the lamda bounding box if triclinic
+
+  double sublo[3],subhi[3];
+  if (domain->triclinic) {
+    domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+  } else {
+    for (int d = 0; d < 3; d++) {
+      sublo[d] = domain->sublo[d];
+      subhi[d] = domain->subhi[d];
+    }
+  }
+
+  // select the correct cutoff and aewald for the term
+
+  if (use_ewald) choose(POLAR_LONG);
+  else choose(POLAR);
+
+  // set the energy unit conversion factor for polar real-space calculation
+
+  double felec = 0.5 * electric / am_dielectric;
+
+  firstneigh =
+    hippo_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x, atom->type,
+                                 amtype, amgroup, rpole, uind, uinp, sublo, subhi,
+                                 atom->tag, atom->nspecial, atom->special,
+                                 atom->nspecial15, atom->special15,
+                                 eflag, vflag, eflag_atom, vflag_atom,
+                                 host_start, &ilist, &numneigh, cpu_time,
+                                 success, aewald, felec, off2, atom->q,
+                                 domain->boxlo, domain->prd, &tq_pinned);
+
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // turn the torques (tep) returned by the GPU lib into forces and virial,
+  //   reading the pinned buffer in the precision selected by tq_single
+
+  if (tq_single)
+    compute_force_from_torque<float>((float *) tq_pinned, fpolar, virpolar);
+  else
+    compute_force_from_torque<double>((double *) tq_pinned, fpolar, virpolar);
+}
+
+/* ----------------------------------------------------------------------
+   compute atom forces from torques
+------------------------------------------------------------------------- */
+
+template <class numtyp>
+void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr,
+                                              double** force_comp,
+                                              double* virial_comp)
+{
+  // tq_ptr      = torque values, 4 per local atom of which 3 are used
+  // force_comp  = force array handed on to torque2force()
+  // virial_comp = 6 virial components that are accumulated into
+
+  double fix[3],fiy[3],fiz[3],_tq[4];
+
+  double **x = atom->x;
+  const int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+
+    // torque2force() takes the torque on atom i and returns fix,fiy,fiz
+
+    const numtyp *tqi = tq_ptr + 4*i;
+    for (int d = 0; d < 3; d++) _tq[d] = tqi[d];
+    torque2force(i,_tq,fix,fiy,fiz,force_comp);
+
+    // displacements from atom i to its x/y/zaxis2local atoms
+
+    const int ix = xaxis2local[i];
+    const int iy = yaxis2local[i];
+    const int iz = zaxis2local[i];
+
+    const double xix = x[ix][0] - x[i][0];
+    const double yix = x[ix][1] - x[i][1];
+    const double zix = x[ix][2] - x[i][2];
+    const double xiy = x[iy][0] - x[i][0];
+    const double yiy = x[iy][1] - x[i][1];
+    const double ziy = x[iy][2] - x[i][2];
+    const double xiz = x[iz][0] - x[i][0];
+    const double yiz = x[iz][1] - x[i][1];
+    const double ziz = x[iz][2] - x[i][2];
+
+    // diagonal virial terms: xx, yy, zz
+
+    virial_comp[0] += xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
+    virial_comp[1] += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
+    virial_comp[2] += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
+
+    // off-diagonal virial terms: xy, xz, yz, averaged over both index orders
+
+    virial_comp[3] += 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
+                             xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
+    virial_comp[4] += 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
+                             xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
+    virial_comp[5] += 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
+                             yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairHippoGPU::memory_usage()
+{
+  const double pair_bytes = Pair::memory_usage();   // usage reported by Pair
+  return pair_bytes + hippo_gpu_bytes();            // plus the GPU library's
+}
diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h
new file mode 100644
index 0000000000..9e961045eb
--- /dev/null
+++ b/src/GPU/pair_hippo_gpu.h
@@ -0,0 +1,80 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(hippo/gpu,PairHippoGPU);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_HIPPO_GPU_H
+#define LMP_PAIR_HIPPO_GPU_H
+
+#include "pair_amoeba.h"
+
+namespace LAMMPS_NS {
+
+class PairHippoGPU : public PairAmoeba {    // registered as pair style hippo/gpu
+ public:
+  PairHippoGPU(LAMMPS *lmp);
+  ~PairHippoGPU();
+  void init_style();
+  double memory_usage();    // Pair::memory_usage() plus hippo_gpu_bytes()
+
+  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
+
+  virtual void induce();
+  // udirect2b(), umutual2b(), polar_real() defer to PairAmoeba if not GPU ready
+  virtual void dispersion_real();
+  virtual void multipole_real();
+  virtual void udirect2b(double **, double **);    // adds to field, fieldp
+  virtual void umutual2b(double **, double **);    // adds to field, fieldp
+  virtual void polar_real();
+
+ private:
+  int gpu_mode;
+  double cpu_time;         // passed to the hippo_gpu_compute_*() calls
+  void *tq_pinned;         // torque buffer set by hippo_gpu_compute_polar_real()
+  void *fieldp_pinned;     // field/fieldp buffer set by udirect2b/umutual2b calls
+  bool tq_single;          // if true tq_pinned is read as float, else as double
+  // *_ready flags: false sends udirect2b/umutual2b/polar_real to PairAmoeba
+  bool gpu_hal_ready;
+  bool gpu_repulsion_ready;
+  bool gpu_dispersion_real_ready;
+  bool gpu_multipole_real_ready;
+  bool gpu_udirect2b_ready;
+  bool gpu_umutual2b_ready;
+  bool gpu_polar_real_ready;
+
+  void udirect2b_cpu();    // CPU build of the dipole pair list and tdipdip
+  // converts per-atom torques (float or double) into forces and virial terms
+  template<class numtyp>
+  void compute_force_from_torque(const numtyp*, double**, double*);
+};
+
+}    // namespace LAMMPS_NS
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Insufficient memory on accelerator
+
+There is insufficient memory on one of the devices specified for the gpu
+package
+
+E: Pair style hippo/gpu requires atom attribute q
+
+The atom style defined does not have this attribute.
+
+*/