Working hippo multipole real-space term, added helper functions in a separate file

2021-09-24 16:44:43 -05:00
parent ad8164dfc0
commit e77df80ce2
7 changed files with 797 additions and 34 deletions
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -143,7 +143,7 @@ class BaseAmoeba {
                double *charge, double *boxlo, double *prd);

  /// Compute multipole real-space with device neighboring
-  int** compute_multipole_real(const int ago, const int inum_full, const int nall,
+  virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
                tagint *tag, int **nspecial, tagint **special,
@ -155,7 +155,7 @@ class BaseAmoeba {
                double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
-  int** compute_udirect2b(const int ago, const int inum_full, const int nall,
+  virtual int** compute_udirect2b(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole,
                double **host_uind, double **host_uinp,
@ -169,7 +169,7 @@ class BaseAmoeba {
                double *boxlo, double *prd, void **fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
-  int** compute_umutual2b(const int ago, const int inum_full, const int nall,
+  virtual int** compute_umutual2b(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole,
                double **host_uind, double **host_uinp,
@ -183,7 +183,7 @@ class BaseAmoeba {
                double *boxlo, double *prd, void **fieldp_ptr);

  /// Compute polar real-space with device neighboring
-  int** compute_polar_real(const int ago, const int inum_full, const int nall,
+  virtual int** compute_polar_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double **host_uind,
                double **host_uinp, double *sublo, double *subhi,
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -56,6 +56,7 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
                  const double *host_special_polar_piscale,
                  const double *host_special_polar_pscale,
                  const double *host_csix, const double *host_adisp,
+                  const double *host_pcore, const double *host_palpha,
                  const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15,
                  const double cell_size, const double gpu_split, FILE *_screen,
@ -69,7 +70,9 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
  if (success!=0)
    return success;

+  // specific to HIPPO
  k_dispersion.set_function(*(this->pair_program),"k_hippo_dispersion");
+  _pval.alloc(this->_max_tep_size,*(this->ucl_device),UCL_READ_ONLY,UCL_READ_ONLY);

  // If atom type constants fit in shared memory use fast kernel
  int lj_types=ntypes;
@ -98,8 +101,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
  for (int i = 0; i < max_amclass; i++) {
    host_write2[i].x = host_csix[i];
    host_write2[i].y = host_adisp[i];
-    host_write2[i].z = (numtyp)0;
-    host_write2[i].w = (numtyp)0;
+    host_write2[i].z = host_pcore[i];
+    host_write2[i].w = host_palpha[i];
  }

  coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
@ -262,6 +265,93 @@ int HippoT::dispersion_real(const int eflag, const int vflag) {
  return GX;
 }

+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary, and then compute multipole real-space
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** HippoT::compute_multipole_real(const int ago, const int inum_full,
+                                          const int nall, double **host_x,
+                                          int *host_type, int *host_amtype,
+                                          int *host_amgroup, double **host_rpole,
+                                          double *sublo, double *subhi, tagint *tag,
+                                          int **nspecial, tagint **special,
+                                          int *nspecial15, tagint **special15,
+                                          const bool eflag_in, const bool vflag_in,
+                                          const bool eatom, const bool vatom,
+                                          int &host_start, int **ilist, int **jnum,
+                                          const double cpu_time, bool &success,
+                                          const double aewald, const double felec,
+                                          const double off2_mpole, double *host_q,
+                                          double *boxlo, double *prd, void **tep_ptr) {
+  this->acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  this->set_kernel(eflag,vflag);
+
+  // reallocate per-atom arrays, transfer data from the host
+  //   and build the neighbor lists if needed
+  // NOTE: 
+  //   For now we invoke precompute() again here,
+  //     to be able to turn on/off the udirect2b kernel (which comes before this)
+  //   Once all the kernels are ready, precompute() is needed only once
+  //     in the first kernel in a time step.
+  //   We only need to cast uind and uinp from host to device here
+  //     if the neighbor lists are rebuilt and other per-atom arrays
+  //     (x, type, amtype, amgroup, rpole) are ready on the device.
+
+  int** firstneigh = nullptr;
+  firstneigh = this->precompute(ago, inum_full, nall, host_x, host_type,
+                          host_amtype, host_amgroup, host_rpole,
+                          nullptr, nullptr, sublo, subhi, tag,
+                          nspecial, special, nspecial15, special15,
+                          eflag_in, vflag_in, eatom, vatom,
+                          host_start, ilist, jnum, cpu_time,
+                          success, host_q, boxlo, prd);
+
+  // ------------------- Resize _tep array ------------------------
+
+  if (inum_full>this->_max_tep_size) {
+    this->_max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
+    this->_tep.resize(this->_max_tep_size*4);
+  }
+  *tep_ptr=this->_tep.host.begin();
+
+  this->_off2_mpole = off2_mpole;
+  this->_felec = felec;
+  this->_aewald = aewald;
+  const int red_blocks=multipole_real(eflag,vflag);
+
+  // leave the answers (forces, energies and virial) on the device,
+  //   only copy them back in the last kernel (polar_real)
+  //ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  //device->add_ans_object(ans);
+
+  this->hd_balancer.stop_timer();
+
+  // copy tep from device to host
+
+  this->_tep.update_host(this->_max_tep_size*4,false);
+/*
+  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
+    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/  
+  return firstneigh; // nbor->host_jlist.begin()-host_start;
+}
+
 // ---------------------------------------------------------------------------
 // Calculate the multipole real-space term, returning tep
 // ---------------------------------------------------------------------------
@ -290,13 +380,14 @@ int HippoT::multipole_real(const int eflag, const int vflag) {
                         &nbor_pitch, &this->_threads_per_atom);

  this->k_multipole.set_size(GX,BX);
-  this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
-                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
-                    &this->dev_short_nbor,
-                    &this->ans->force, &this->ans->engv, &this->_tep,
-                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,  &this->_aewald, &this->_felec,
-                    &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
+  this->k_multipole.run(&this->atom->x, &this->atom->extra, &_pval,
+                        &coeff_amtype, &coeff_amclass, &sp_polar,
+                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                        &this->dev_short_nbor,
+                        &this->ans->force, &this->ans->engv, &this->_tep,
+                        &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                        &this->_threads_per_atom,  &this->_aewald, &this->_felec,
+                        &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

  return GX;
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -15,7 +15,8 @@

 #if defined(NV_KERNEL) || defined(USE_HIP)
 #include <stdio.h>
-#include "lal_aux_fun1.h"
+#include "lal_hippo_extra.h"
+//#include "lal_aux_fun1.h"
 #ifdef LAMMPS_SMALLBIG
 #define tagint int
 #endif
@ -404,6 +405,318 @@ _texture( q_tex,int2);
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MY_PIS (acctyp)1.77245385090551602729

+/* ----------------------------------------------------------------------
+   repulsion = Pauli repulsion interactions
+   adapted from Tinker erepel1b() routine
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict sp_nonpolar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 __global acctyp4 *restrict tep,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp off2, const numtyp cut2,
+                                 const numtyp c0, const numtyp c1, const numtyp c2,
+                                 const numtyp c3, const numtyp c4, const numtyp c5)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  acctyp4 tq;
+  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
+
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  if (ii<inum) {
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    const numtyp4 pol1i = polar1[i];
+    numtyp ci  = pol1i.x;    // rpole[i][0];
+    numtyp dix = pol1i.y;    // rpole[i][1];
+    numtyp diy = pol1i.z;    // rpole[i][2];
+    numtyp diz = pol1i.w;    // rpole[i][3];
+    const numtyp4 pol2i = polar2[i];
+    numtyp qixx = pol2i.x;   // rpole[i][4];
+    numtyp qixy = pol2i.y;   // rpole[i][5];
+    numtyp qixz = pol2i.z;   // rpole[i][6];
+    numtyp qiyy = pol2i.w;   // rpole[i][8];
+    const numtyp4 pol3i = polar3[i];
+    numtyp qiyz = pol3i.x;   // rpole[i][9];
+    numtyp qizz = pol3i.y;   // rpole[i][12];
+    int itype = pol3i.z; // amtype[i];
+    numtyp sizi = coeff[itype].x; // sizpr[itype];
+    numtyp dmpi = coeff[itype].y; // dmppr[itype];
+    numtyp vali = coeff[itype].z; // elepr[itype];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = ix.x - jx.x;
+      numtyp yr = ix.y - jx.y;
+      numtyp zr = ix.z - jx.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      if (r2>off2) continue;
+  
+      const numtyp4 pol1j = polar1[j];
+      numtyp ck  = pol1j.x;  // rpole[j][0];
+      numtyp dkx = pol1j.y;  // rpole[j][1];
+      numtyp dky = pol1j.z;  // rpole[j][2];
+      numtyp dkz = pol1j.w;  // rpole[j][3];
+      const numtyp4 pol2j = polar2[j];
+      numtyp qkxx = pol2j.x; // rpole[j][4];
+      numtyp qkxy = pol2j.y; // rpole[j][5];
+      numtyp qkxz = pol2j.z; // rpole[j][6];
+      numtyp qkyy = pol2j.w; // rpole[j][8];
+      const numtyp4 pol3j = polar3[j];
+      numtyp qkyz = pol3j.x; // rpole[j][9];
+      numtyp qkzz = pol3j.y; // rpole[j][12];
+      int jtype = pol3j.z; // amtype[j];
+      
+      numtyp sizk = coeff[jtype].x; // sizpr[jtype];
+      numtyp dmpk = coeff[jtype].y; // dmppr[jtype];
+      numtyp valk = coeff[jtype].z; // elepr[jtype];
+
+      const numtyp4 sp_nonpol = sp_nonpolar[sbmask15(jextra)];
+      numtyp factor_repel = sp_nonpol.y; // factor_repel = special_repel[sbmask15(j)];
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+      
+      numtyp dik = dix*dkx + diy*dky + diz*dkz;
+      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
+      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
+      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
+        qixx*qkxx + qiyy*qkyy + qizz*qkzz;
+
+      // additional intermediates involving moments and distance
+
+      numtyp dirx = diy*zr - diz*yr;
+      numtyp diry = diz*xr - dix*zr;
+      numtyp dirz = dix*yr - diy*xr;
+      numtyp dkrx = dky*zr - dkz*yr;
+      numtyp dkry = dkz*xr - dkx*zr;
+      numtyp dkrz = dkx*yr - dky*xr;
+      numtyp dikx = diy*dkz - diz*dky;
+      numtyp diky = diz*dkx - dix*dkz;
+      numtyp dikz = dix*dky - diy*dkx;
+      numtyp qirx = qiz*yr - qiy*zr;
+      numtyp qiry = qix*zr - qiz*xr;
+      numtyp qirz = qiy*xr - qix*yr;
+      numtyp qkrx = qkz*yr - qky*zr;
+      numtyp qkry = qkx*zr - qkz*xr;
+      numtyp qkrz = qky*xr - qkx*yr;
+      numtyp qikx = qky*qiz - qkz*qiy;
+      numtyp qiky = qkz*qix - qkx*qiz;
+      numtyp qikz = qkx*qiy - qky*qix;
+      numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
+      numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
+      numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
+      numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
+      numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
+      numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
+      numtyp qikrx = qizk*yr - qiyk*zr;
+      numtyp qikry = qixk*zr - qizk*xr;
+      numtyp qikrz = qiyk*xr - qixk*yr;
+      numtyp qkirx = qkzi*yr - qkyi*zr;
+      numtyp qkiry = qkxi*zr - qkzi*xr;
+      numtyp qkirz = qkyi*xr - qkxi*yr;
+      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
+      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
+      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
+      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
+      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
+      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
+      numtyp diqkrx = diqkz*yr - diqky*zr;
+      numtyp diqkry = diqkx*zr - diqkz*xr;
+      numtyp diqkrz = diqky*xr - diqkx*yr;
+      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
+      numtyp dkqiry = dkqix*zr - dkqiz*xr;
+      numtyp dkqirz = dkqiy*xr - dkqix*yr;
+      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - 
+        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
+      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - 
+        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
+      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - 
+        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);
+
+      // get reciprocal distance terms for this interaction
+
+      numtyp r = ucl_sqrt(r2);
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
+      numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;
+
+      // get damping coefficients for the Pauli repulsion energy
+      numtyp dmpik[11];
+      damprep(r,r2,rr1,rr3,rr5,rr7,rr9,rr11,11,dmpi,dmpk,dmpik);
+
+      // calculate intermediate terms needed for the energy
+
+      numtyp term1 = vali*valk;
+      numtyp term2 = valk*dir - vali*dkr + dik;
+      numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
+      numtyp term4 = dir*qkr - dkr*qir - 4.0*qik;
+      numtyp term5 = qir*qkr;
+      numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + 
+        term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8];
+
+      // compute the Pauli repulsion energy for this interaction
+
+      numtyp sizik = sizi * sizk * factor_repel;
+      numtyp e = sizik * eterm * rr1;
+
+      // calculate intermediate terms for force and torque
+
+      numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + 
+        term4*dmpik[8] + term5*dmpik[10];
+      term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6];
+      term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6];
+      term3 = (numtyp)2.0 * dmpik[4];
+      term4 = (numtyp)2.0 * (-valk*dmpik[4] + dkr*dmpik[6] - qkr*dmpik[8]);
+      term5 = (numtyp)2.0 * (-vali*dmpik[4] - dir*dmpik[6] - qir*dmpik[8]);
+      numtyp term6 = (numtyp)4.0 * dmpik[6];
+
+      // compute the force components for this interaction
+
+      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + 
+        term4*qix + term5*qkx + term6*(qixk+qkxi);
+      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + 
+        term4*qiy + term5*qky + term6*(qiyk+qkyi);
+      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + 
+        term4*qiz + term5*qkz + term6*(qizk+qkzi);
+      frcx = frcx*rr1 + eterm*rr3*xr;
+      frcy = frcy*rr1 + eterm*rr3*yr;
+      frcz = frcz*rr1 + eterm*rr3*zr;
+
+      // compute the torque components for this interaction
+      
+      numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
+        term4*qirx - term6*(qikrx+qikx);
+      numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - 
+        term4*qiry - term6*(qikry+qiky);
+      numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
+        term4*qirz - term6*(qikrz+qikz);
+      ttmix = sizik * ttmix * rr1;
+      ttmiy = sizik * ttmiy * rr1;
+      ttmiz = sizik * ttmiz * rr1;
+
+      // use energy switching if near the cutoff distance
+
+      if (r2 > cut2) {
+        numtyp r3 = r2 * r;
+        numtyp r4 = r2 * r2;
+        numtyp r5 = r2 * r3;
+        numtyp taper = c5*r5 + c4*r4 + c3*r3 + c2*r2 + c1*r + c0;
+        numtyp dtaper = (numtyp)5.0*c5*r4 + (numtyp).0*c4*r3 +
+          (numtyp)3.0*c3*r2 + (numtyp)2.0*c2*r + c1;
+        dtaper *= e * rr1;
+        e *= taper;
+        frcx = frcx*taper - dtaper*xr;
+        frcy = frcy*taper - dtaper*yr;
+        frcz = frcz*taper - dtaper*zr;
+        ttmix *= taper;
+        ttmiy *= taper;
+        ttmiz *= taper;
+      }
+
+      energy += e;
+
+      // increment force-based gradient and torque on atom I
+
+      f.x += frcx;
+      f.y += frcy;
+      f.z += frcz;
+      tq.x += ttmix;
+      tq.y += ttmiy;
+      tq.z += ttmiz;
+
+      // increment the internal virial tensor components
+      if (EVFLAG && vflag) {
+        numtyp vxx = -xr * frcx;
+        numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
+        numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
+        numtyp vyy = -yr * frcy;
+        numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
+        numtyp vzz = -zr * frcz;
+
+        virial[0] += vxx;
+        virial[1] += vyy;
+        virial[2] += vzz;
+        virial[3] += vxy;
+        virial[4] += vxz;
+        virial[5] += vyz;
+      }
+    } // nbor
+    
+  } // ii<inum
+
+  // accumulate tq
+  store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
+  // accumate force, energy and virial
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv);
+}
+
 /* ----------------------------------------------------------------------
   dispersion = real-space portion of Ewald dispersion
   adapted from Tinker edreal1d() routine
@ -594,20 +907,22 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
 ------------------------------------------------------------------------- */

 __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
-                                 const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict coeff,
-                                 const __global numtyp4 *restrict sp_polar,
-                                 const __global int *dev_nbor,
-                                 const __global int *dev_packed,
-                                 const __global int *dev_short_nbor,
-                                 __global acctyp4 *restrict ans,
-                                 __global acctyp *restrict engv,
-                                 __global acctyp4 *restrict tep,
-                                 const int eflag, const int vflag, const int inum,
-                                 const int nall, const int nbor_pitch,
-                                 const int t_per_atom, const numtyp aewald,
-                                 const numtyp felec, const numtyp off2,
-                                 const numtyp polar_dscale, const numtyp polar_uscale)
+                                const __global numtyp *restrict extra,
+                                const __global numtyp *restrict pval,
+                                const __global numtyp4 *restrict coeff_amtype,
+                                const __global numtyp4 *restrict coeff_amclass,
+                                const __global numtyp4 *restrict sp_polar,
+                                const __global int *dev_nbor,
+                                const __global int *dev_packed,
+                                const __global int *dev_short_nbor,
+                                __global acctyp4 *restrict ans,
+                                __global acctyp *restrict engv,
+                                __global acctyp4 *restrict tep,
+                                const int eflag, const int vflag, const int inum,
+                                const int nall, const int nbor_pitch,
+                                const int t_per_atom, const numtyp aewald,
+                                const numtyp felec, const numtyp off2,
+                                const numtyp polar_dscale, const numtyp polar_uscale)
 {
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);
@ -633,6 +948,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,

  if (ii<inum) {
    int m;
+    int itype,iclass;
    numtyp bfac;
    numtyp term1,term2,term3;
    numtyp term4,term5,term6;
@ -668,6 +984,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
    const numtyp4 pol3i = polar3[i];
    numtyp qiyz = pol3i.x;   // rpole[i][9];
    numtyp qizz = pol3i.y;   // rpole[i][12];
+    itype  = pol3i.z;        // amtype[i];
+    iclass = coeff_amtype[itype].w;  // amtype2class[itype];
+
+    numtyp corei = coeff_amclass[itype].z;  // pcore[iclass];
+    numtyp alphai = coeff_amclass[itype].w; // palpha[iclass];
+    numtyp vali = pval[i];

    for ( ; nbor<nbor_end; nbor+=n_stride) {

@ -701,10 +1023,15 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype = pol3j.z; // amtype[j];
      int jgroup =  pol3j.w; // amgroup[j];
+      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];

      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];

+      numtyp corek = coeff_amclass[jtype].z;  // pcore[jclass];
+      numtyp alphak = coeff_amclass[jtype].w; // palpha[jclass];
+      numtyp valk = pval[j];
+
      // intermediates involving moments and separation distance

      numtyp dir = dix*xr + diy*yr + diz*zr;
--- a/lib/gpu/lal_hippo.h
+++ b/lib/gpu/lal_hippo.h
@ -48,6 +48,7 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
           const double *host_csix, const double *host_adisp,
+           const double *host_pcore, const double *host_palpha,
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
@ -65,6 +66,18 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
                const double aewald, const double off2_disp, double *charge,
                double *boxlo, double *prd);

+  /// Compute multipole real-space with device neighboring
+  virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, int *host_amtype,
+                int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
+                tagint *tag, int **nspecial, tagint **special,
+                int *nspecial15, tagint **special15,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                const double aewald, const double felec, const double off2_mpole, double *charge,
+                double *boxlo, double *prd, void **tep_ptr);
+
  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
  void clear();
@ -105,6 +118,8 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {

  UCL_Kernel k_dispersion;

+  UCL_Vector<acctyp,acctyp> _pval;
+
 protected:
  bool _allocated;
  int dispersion_real(const int eflag, const int vflag);
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@ -38,6 +38,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
                    const double *host_csix, const double *host_adisp,
+                    const double *host_pcore, const double *host_palpha,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
@ -73,7 +74,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
                          host_special_repel, host_special_disp,
                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
-                          host_csix, host_adisp, nlocal, nall, max_nbors,
+                          host_csix, host_adisp, host_pcore, host_palpha,
+                          nlocal, nall, max_nbors,
                          maxspecial, maxspecial15, cell_size, gpu_split,
                          screen, polar_dscale, polar_uscale);

@ -97,7 +99,8 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
                            host_special_repel, host_special_disp,
                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
-                            host_csix, host_adisp, nlocal, nall, max_nbors,
+                            host_csix, host_adisp, host_pcore, host_palpha,
+                            nlocal, nall, max_nbors,
                            maxspecial, maxspecial15, cell_size, gpu_split,
                            screen, polar_dscale, polar_uscale);

--- a/lib/gpu/lal_hippo_extra.h
+++ b/lib/gpu/lal_hippo_extra.h
@ -0,0 +1,326 @@
+/// **************************************************************************
+//                              hippo_extra.h
+//                             -------------------
+//                              Trung Dac Nguyen
+//
+//  Device code for hippo math routines
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : ndactrung@gmail.com
+// ***************************************************************************/*
+
+#ifndef LAL_HIPPO_EXTRA_H
+#define LAL_HIPPO_EXTRA_H
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include "lal_aux_fun1.h"
+#else
+#endif
+
+#define MY_PI2 (numtyp)1.57079632679489661923
+#define MY_PI4 (numtyp)0.78539816339744830962
+
+/* ----------------------------------------------------------------------
+   damprep generates coefficients for the Pauli repulsion
+   damping function for powers of the interatomic distance
+
+   literature reference:
+
+   J. A. Rackers and J. W. Ponder, "Classical Pauli Repulsion: An
+   Anisotropic, Atomic Multipole Model", Journal of Chemical Physics,
+   150, 084104 (2019)
+------------------------------------------------------------------------- */
+
+ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
+                        const numtyp rr3, const numtyp rr5, const numtyp rr7,
+                        const numtyp rr9, const numtyp rr11, const int rorder,
+                        const numtyp dmpi, const numtyp dmpk, numtyp dmpik[11])
+{
+  numtyp r3,r4;
+  numtyp r5,r6,r7,r8;
+  numtyp s,ds,d2s;
+  numtyp d3s,d4s,d5s;
+  numtyp dmpi2,dmpk2;
+  numtyp dmpi22,dmpi23;
+  numtyp dmpi24,dmpi25;
+  numtyp dmpi26,dmpi27;
+  numtyp dmpk22,dmpk23;
+  numtyp dmpk24,dmpk25;
+  numtyp dmpk26;
+  numtyp eps,diff;
+  numtyp expi,expk;
+  numtyp dampi,dampk;
+  numtyp pre,term,tmp;
+
+  // compute tolerance value for damping exponents
+
+  eps = (numtyp)0.001;
+  diff = dmpi-dmpk; 
+  if (diff < (numtyp)0) diff = -diff;
+
+  // treat the case where alpha damping exponents are equal
+
+  if (diff < eps) {
+    r3 = r2 * r;
+    r4 = r3 * r;
+    r5 = r4 * r;
+    r6 = r5 * r;
+    r7 = r6 * r;
+    dmpi2 = (numtyp)0.5 * dmpi;
+    dampi = dmpi2 * r;
+    expi = ucl_exp(-dampi);
+    dmpi22 = dmpi2 * dmpi2;
+    dmpi23 = dmpi22 * dmpi2;
+    dmpi24 = dmpi23 * dmpi2;
+    dmpi25 = dmpi24 * dmpi2;
+    dmpi26 = dmpi25 * dmpi2;
+    pre = (numtyp)128.0;
+    s = (r + dmpi2*r2 + dmpi22*r3/(numtyp)3.0) * expi;
+
+    ds = (dmpi22*r3 + dmpi23*r4) * expi / 3.0;
+    d2s = dmpi24 * expi * r5 / 9.0;
+    d3s = dmpi25 * expi * r6 / 45.0;
+    d4s = (dmpi25*r6 + dmpi26*r7) * expi / 315.0;
+    if (rorder >= 11) {
+      r8 = r7 * r;
+      dmpi27 = dmpi2 * dmpi26;
+      d5s = (dmpi25*r6 + dmpi26*r7 + dmpi27*r8/3.0) * expi / 945.0;
+    }
+
+  // treat the case where alpha damping exponents are unequal
+
+  } else {
+    r3 = r2 * r;
+    r4 = r3 * r;
+    r5 = r4 * r;
+    dmpi2 = 0.5 * dmpi;
+    dmpk2 = 0.5 * dmpk;
+    dampi = dmpi2 * r;
+    dampk = dmpk2 * r;
+    expi = exp(-dampi);
+    expk = exp(-dampk);
+    dmpi22 = dmpi2 * dmpi2;
+    dmpi23 = dmpi22 * dmpi2;
+    dmpi24 = dmpi23 * dmpi2;
+    dmpi25 = dmpi24 * dmpi2;
+    dmpk22 = dmpk2 * dmpk2;
+    dmpk23 = dmpk22 * dmpk2;
+    dmpk24 = dmpk23 * dmpk2;
+    dmpk25 = dmpk24 * dmpk2;
+    term = dmpi22 - dmpk22;
+    pre = 8192.0 * dmpi23 * dmpk23 / pow(term,4.0);
+    tmp = 4.0 * dmpi2 * dmpk2 / term;
+    s = (dampi-tmp)*expk + (dampk+tmp)*expi;
+
+    ds = (dmpi2*dmpk2*r2 - 4.0*dmpi2*dmpk22*r/term - 
+          4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2 + 4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
+    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/3.0 - 
+           (4.0/3.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
+           4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi2*dmpk2*r2/3.0 + dmpi22*dmpk2*r3/3.0 + 
+       (4.0/3.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
+       4.0*dmpi2*dmpk2/term) * expi;
+    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/5.0 + dmpi2*dmpk2*r2/5.0 - 
+           (4.0/15.0)*dmpi2*dmpk24*r3/term - (8.0/5.0)*dmpi2*dmpk23*r2/term - 
+           4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
+      (dmpi23*dmpk2*r4/15.0 + dmpi22*dmpk2*r3/5.0 + dmpi2*dmpk2*r2/5.0 + 
+       (4.0/15.0)*dmpi24*dmpk2*r3/term + (8.0/5.0)*dmpi23*dmpk2*r2/term + 
+       4.0*dmpi22*dmpk2*r/term + 4.0/term*dmpi2*dmpk2) * expi;
+    d4s = (dmpi2*dmpk24*r5/105.0 + (2.0/35.0)*dmpi2*dmpk23*r4 + 
+           dmpi2*dmpk22*r3/7.0 + dmpi2*dmpk2*r2/7.0 - 
+           (4.0/105.0)*dmpi2*dmpk25*r4/term - (8.0/21.0)*dmpi2*dmpk24*r3/term - 
+           (12.0/7.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
+           4.0*dmpi2*dmpk2/term) * expk + 
+      (dmpi24*dmpk2*r5/105.0 + (2.0/35.0)*dmpi23*dmpk2*r4 + 
+       dmpi22*dmpk2*r3/7.0 + dmpi2*dmpk2*r2/7.0 + 
+       (4.0/105.0)*dmpi25*dmpk2*r4/term + (8.0/21.0)*dmpi24*dmpk2*r3/term + 
+       (12.0/7.0)*dmpi23*dmpk2*r2/term + 4.0*dmpi22*dmpk2*r/term + 
+       4.0*dmpi2*dmpk2/term) * expi;
+    
+    if (rorder >= 11) {
+      r6 = r5 * r;
+      dmpi26 = dmpi25 * dmpi2;
+      dmpk26 = dmpk25 * dmpk2;
+      d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + 
+             dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - 
+             (4.0/945.0)*dmpi2*dmpk26*r5/term - 
+             (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - 
+             (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
+             4.0*dmpi2*dmpk2/term) * expk + 
+        (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + 
+         dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + 
+         (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + 
+         (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + 
+         4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
+    }
+  }
+
+  // convert partial derivatives into full derivatives
+
+  s = s * rr1;
+  ds = ds * rr3;
+  d2s = d2s * rr5;
+  d3s = d3s * rr7;
+  d4s = d4s * rr9;
+  d5s = d5s * rr11;
+  dmpik[0] = 0.5 * pre * s * s;
+  dmpik[2] = pre * s * ds;
+  dmpik[4] = pre * (s*d2s + ds*ds);
+  dmpik[6] = pre * (s*d3s + 3.0*ds*d2s);
+  dmpik[8] = pre * (s*d4s + 4.0*ds*d3s + 3.0*d2s*d2s);
+  if (rorder >= 11) dmpik[10] = pre * (s*d5s + 5.0*ds*d4s + 10.0*d2s*d3s);
+}
+
+/* ----------------------------------------------------------------------
+   damppole generates coefficients for the charge penetration
+   damping function for powers of the interatomic distance
+
+   literature references:
+
+   L. V. Slipchenko and M. S. Gordon, "Electrostatic Energy in the
+   Effective Fragment Potential Method: Theory and Application to
+   the Benzene Dimer", Journal of Computational Chemistry, 28,
+   276-291 (2007)  [Gordon f1 and f2 models]
+
+   J. A. Rackers, Q. Wang, C. Liu, J.-P. Piquemal, P. Ren and
+   J. W. Ponder, "An Optimized Charge Penetration Model for Use with
+   the AMOEBA Force Field", Physical Chemistry Chemical Physics, 19,
+   276-291 (2017)
+------------------------------------------------------------------------- */
+
+ucl_inline void damppole(const numtyp r, const int rorder,
+                         const numtyp alphai, const numtyp alphak,
+                         numtyp dmpi[9], numtyp dmpk[9], numtyp dmpik[11])
+{
+  numtyp termi,termk;
+  numtyp termi2,termk2;
+  numtyp alphai2,alphak2;
+  numtyp eps,diff;
+  numtyp expi,expk;
+  numtyp dampi,dampk;
+  numtyp dampi2,dampi3;
+  numtyp dampi4,dampi5;
+  numtyp dampi6,dampi7;
+  numtyp dampi8;
+  numtyp dampk2,dampk3;
+  numtyp dampk4,dampk5;
+  numtyp dampk6;
+
+  // compute tolerance and exponential damping factors
+
+  eps = 0.001;
+  diff = fabs(alphai-alphak);
+  dampi = alphai * r;
+  dampk = alphak * r;
+  expi = exp(-dampi);
+  expk = exp(-dampk);
+
+  // core-valence charge penetration damping for Gordon f1
+
+  dampi2 = dampi * dampi;
+  dampi3 = dampi * dampi2;
+  dampi4 = dampi2 * dampi2;
+  dampi5 = dampi2 * dampi3;
+  dmpi[0] = 1.0 - (1.0 + 0.5*dampi)*expi;
+  dmpi[2] = 1.0 - (1.0 + dampi + 0.5*dampi2)*expi;
+  dmpi[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi;
+  dmpi[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + dampi4/30.0)*expi;
+  dmpi[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                   4.0*dampi4/105.0 + dampi5/210.0)*expi;
+  if (diff < eps) {
+    dmpk[0] = dmpi[0];
+    dmpk[2] = dmpi[2];
+    dmpk[4] = dmpi[4];
+    dmpk[6] = dmpi[6];
+    dmpk[8] = dmpi[8];
+  } else {
+    dampk2 = dampk * dampk;
+    dampk3 = dampk * dampk2;
+    dampk4 = dampk2 * dampk2;
+    dampk5 = dampk2 * dampk3;
+    dmpk[0] = 1.0 - (1.0 + 0.5*dampk)*expk;
+    dmpk[2] = 1.0 - (1.0 + dampk + 0.5*dampk2)*expk;
+    dmpk[4] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk;
+    dmpk[6] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk;
+    dmpk[8] = 1.0 - (1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 
+                     4.0*dampk4/105.0 + dampk5/210.0)*expk;
+  }
+
+  // valence-valence charge penetration damping for Gordon f1
+
+  if (diff < eps) {
+    dampi6 = dampi3 * dampi3;
+    dampi7 = dampi3 * dampi4;
+    dmpik[0] = 1.0 - (1.0 + 11.0*dampi/16.0 + 3.0*dampi2/16.0 + 
+                      dampi3/48.0)*expi;
+    dmpik[2] = 1.0 - (1.0 + dampi + 0.5*dampi2 + 
+                      7.0*dampi3/48.0 + dampi4/48.0)*expi;
+    dmpik[4] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                      dampi4/24.0 + dampi5/144.0)*expi;
+    dmpik[6] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                      dampi4/24.0 + dampi5/120.0 + dampi6/720.0)*expi;
+    dmpik[8] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                      dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + 
+                      dampi7/5040.0)*expi;
+    if (rorder >= 11) {
+      dampi8 = dampi4 * dampi4;
+      dmpik[10] = 1.0 - (1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                         dampi4/24.0 + dampi5/120.0 + dampi6/720.0 + 
+                         dampi7/5040.0 + dampi8/45360.0)*expi;
+    }
+
+  } else {
+    alphai2 = alphai * alphai;
+    alphak2 = alphak * alphak;
+    termi = alphak2 / (alphak2-alphai2);
+    termk = alphai2 / (alphai2-alphak2);
+    termi2 = termi * termi;
+    termk2 = termk * termk;
+    dmpik[0] = 1.0 - termi2*(1.0 + 2.0*termk + 0.5*dampi)*expi - 
+      termk2*(1.0 + 2.0*termi + 0.5*dampk)*expk;
+    dmpik[2] = 1.0 - termi2*(1.0+dampi+0.5*dampi2)*expi -
+      termk2*(1.0+dampk+0.5*dampk2)*expk -
+      2.0*termi2*termk*(1.0+dampi)*expi -
+      2.0*termk2*termi*(1.0+dampk)*expk;
+    dmpik[4] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0)*expi - 
+      termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0)*expk - 
+      2.0*termi2*termk*(1.0 + dampi + dampi2/3.0)*expi - 
+      2.0*termk2*termi*(1.0 + dampk + dampk2/3.0)*expk;
+    dmpik[6] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + 
+                             dampi3/6.0 + dampi4/30.0)*expi - 
+      termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + dampk4/30.0)*expk - 
+      2.0*termi2*termk*(1.0 + dampi + 2.0*dampi2/5.0 + dampi3/15.0)*expi - 
+      2.0*termk2*termi*(1.0 + dampk + 2.0*dampk2/5.0 + dampk3/15.0)*expk;
+    dmpik[8] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                             4.0*dampi4/105.0 + dampi5/210.0)*expi - 
+      termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 
+              4.0*dampk4/105.0 + dampk5/210.0)*expk - 
+      2.0*termi2*termk*(1.0 + dampi + 3.0*dampi2/7.0 + 
+                        2.0*dampi3/21.0 + dampi4/105.0)*expi - 
+      2.0*termk2*termi*(1.0 + dampk + 3.0*dampk2/7.0 + 
+                        2.0*dampk3/21.0 + dampk4/105.0)*expk;
+    
+    if (rorder >= 11) {
+      dampi6 = dampi3 * dampi3;
+      dampk6 = dampk3 * dampk3;
+      dmpik[10] = 1.0 - termi2*(1.0 + dampi + 0.5*dampi2 + dampi3/6.0 + 
+                                5.0*dampi4/126.0 + 2.0*dampi5/315.0 + 
+                                dampi6/1890.0)*expi - 
+        termk2*(1.0 + dampk + 0.5*dampk2 + dampk3/6.0 + 5.0*dampk4/126.0 + 
+                2.0*dampk5/315.0 + dampk6/1890.0)*expk - 
+        2.0*termi2*termk*(1.0 + dampi + 4.0*dampi2/9.0 + dampi3/9.0 + 
+                          dampi4/63.0 + dampi5/945.0)*expi - 
+        2.0*termk2*termi*(1.0 + dampk + 4.0*dampk2/9.0 + dampk3/9.0 + 
+                          dampk4/63.0 + dampk5/945.0)*expk;
+    }
+  }
+}
+
+
+
+#endif
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -59,6 +59,7 @@ int hippo_gpu_init(const int ntypes, const int max_amtype, const int max_amclass
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
                    const double *host_csix, const double *host_adisp,
+                    const double *host_pcore, const double *host_palpha,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
@ -191,10 +192,10 @@ void PairHippoGPU::init_style()
                                pdamp, thole, dirdamp, amtype2class, special_hal,
                                special_repel, special_disp, special_mpole,
                                special_polar_wscale, special_polar_piscale,
-                                special_polar_pscale, csix, adisp, atom->nlocal,
-                                atom->nlocal+atom->nghost, mnf, maxspecial,
-                                maxspecial15, cell_size, gpu_mode, screen,
-                                polar_dscale, polar_uscale, tq_size);
+                                special_polar_pscale, csix, adisp, pcore, palpha,
+                                atom->nlocal, atom->nlocal+atom->nghost, mnf,
+                                maxspecial, maxspecial15, cell_size, gpu_mode,
+                                screen, polar_dscale, polar_uscale, tq_size);
  GPU_EXTRA::check_flag(success,error,world);

  if (gpu_mode == GPU_FORCE)