From 1166845fcf025292ac37646ed37e4b62d3bcc85b Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Sat, 18 Sep 2021 10:22:22 -0500
Subject: [PATCH] Prepared data structure for the dispersion real-space term

---
 lib/gpu/lal_amoeba.cpp      |  38 ++++++---
 lib/gpu/lal_amoeba.cu       | 166 ++++++++++++++++++++++++++++++------
 lib/gpu/lal_amoeba.h        |  14 +--
 lib/gpu/lal_amoeba_ext.cpp  |  24 +++---
 src/GPU/pair_amoeba_gpu.cpp |  12 +--
 5 files changed, 197 insertions(+), 57 deletions(-)
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index d2f2b1bf79..28ed02b480 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -44,12 +44,14 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const {
 }
 
 template <class numtyp, class acctyp>
-int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp,
-                  const double *host_thole, const double *host_dirdamp,
+int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
+                  const double *host_pdamp, const double *host_thole,
+                  const double *host_dirdamp, const int *host_amtype2class,
                   const double *host_special_mpole,
                   const double *host_special_polar_wscale,
                   const double *host_special_polar_piscale,
                   const double *host_special_polar_pscale,
+                  const double *host_csix, const double *host_adisp,
                   const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15,
                   const double cell_size, const double gpu_split, FILE *_screen,
@@ -80,11 +82,22 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
     host_write[i].x = host_pdamp[i];
     host_write[i].y = host_thole[i];
     host_write[i].z = host_dirdamp[i];
-    host_write[i].w = (numtyp)0;
+    host_write[i].w = host_amtype2class[i];
   }
 
-  damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
-  ucl_copy(damping,host_write,false);
+  coeff_amtype.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(coeff_amtype,host_write,false);
+
+  UCL_H_Vec<numtyp4> host_write2(max_amclass, *(this->ucl_device), UCL_WRITE_ONLY);
+  for (int i = 0; i < max_amclass; i++) {
+    host_write2[i].x = host_csix[i];
+    host_write2[i].y = host_adisp[i];
+    host_write2[i].z = (numtyp)0;
+    host_write2[i].w = (numtyp)0;
+  }
+
+  coeff_amclass.alloc(max_amclass,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(coeff_amclass,host_write2,false);
 
   UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
   sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
@@ -100,9 +113,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
   _polar_uscale = polar_uscale;
 
   _allocated=true;
-  this->_max_bytes=damping.row_bytes()
-    + sp_polar.row_bytes()
-    + this->_tep.row_bytes();
+  this->_max_bytes=coeff_amtype.row_bytes() + coeff_amclass.row_bytes()
+    + sp_polar.row_bytes() + this->_tep.row_bytes();
   return 0;
 }
 
@@ -112,7 +124,7 @@ void AmoebaT::clear() {
     return;
   _allocated=false;
 
-  damping.clear();
+  coeff_amtype.clear();
   sp_polar.clear();
   
   this->clear_atomic();
@@ -151,7 +163,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {
                          &nbor_pitch, &this->_threads_per_atom);
 
   this->k_multipole.set_size(GX,BX);
-  this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+  this->k_multipole.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->dev_short_nbor,
                     &this->ans->force, &this->ans->engv, &this->_tep,
@@ -192,7 +204,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
   }
   
   this->k_udirect2b.set_size(GX,BX);
-  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                         &this->dev_short_nbor,
                         &this->_fieldp, &ainum, &_nall, &nbor_pitch,
@@ -232,7 +244,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
   }
 
   this->k_umutual2b.set_size(GX,BX);
-  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
                         &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
                         &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
@@ -271,7 +283,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
   }
 
   this->k_polar.set_size(GX,BX);
-  this->k_polar.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
+  this->k_polar.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
                     &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                     &this->dev_short_nbor,
                     &this->ans->force, &this->ans->engv, &this->_tep,
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 41185f30e3..5a1151f610 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -147,7 +147,7 @@ _texture( q_tex,int2);
     fieldp[ii+inum] = fp;                                                   \
   }
 
-#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom  \
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom  \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1) {                                                       \
     simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
@@ -210,8 +210,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }
 
-// SHUFFLE_AVAIL == 1
-#else
+#else // SHUFFLE_AVAIL == 1
 
 #define local_allocate_store_ufld()
 
@@ -280,7 +279,7 @@ _texture( q_tex,int2);
 
 #if (EVFLAG == 1)
 
-#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1) {                                                       \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
@@ -376,7 +375,7 @@ _texture( q_tex,int2);
 // EVFLAG == 0
 #else
 
-#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
+#define store_answers_acc(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
@@ -394,6 +393,125 @@ _texture( q_tex,int2);
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 #define MY_PIS (acctyp)1.77245385090551602729
 
+/* ----------------------------------------------------------------------
+   dispersion = real-space portion of Ewald dispersion
+   adapted from Tinker edreal1d() routine
+------------------------------------------------------------------------- */
+
+__kernel void k_amoeba_dispersion(const __global numtyp4 *restrict x_,
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict coeff,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp felec, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_charge();
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  acctyp4 tq;
+  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;
+
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+
+  if (ii<inum) {
+    int m,itype,igroup;
+    numtyp bfac;
+    numtyp term1,term2,term3;
+    numtyp term4,term5,term6;
+    numtyp bn[6];
+    numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+
+    int numj, nbor, nbor_end;
+    const __global int* nbor_mem=dev_packed;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    // recalculate numj and nbor_end for use of the short nbor list
+    if (dev_packed==dev_nbor) {
+      numj = dev_short_nbor[nbor];
+      nbor += n_stride;
+      nbor_end = nbor+fast_mul(numj,n_stride);
+      nbor_mem = dev_short_nbor;
+    }
+
+    ci  = polar1[i].x;    // rpole[i][0];
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=nbor_mem[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      //if (r2>off2) continue;
+  
+      numtyp r = ucl_sqrt(r2);
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+
+    } // nbor
+    
+  } // ii<inum
+
+  // accumate force, energy and virial
+  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+  //   offset,eflag,vflag,ans,engv);
+}
+
 /* ----------------------------------------------------------------------
    multipole_real = real-space portion of multipole
    adapted from Tinker emreal1d() routine
@@ -401,7 +519,7 @@ _texture( q_tex,int2);
 
 __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict coeff,
                                  const __global numtyp4 *restrict sp_polar,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
@@ -697,7 +815,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
 
 __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict coeff,
                                  const __global numtyp4 *restrict sp_polar,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
@@ -759,9 +877,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
     // debug:
     // xi__ = ix; xi__.w = itype;
 
-    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
-    numtyp ddi = damping[itype].z;
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
+    numtyp ddi = coeff[itype].z;
 
     numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
     numtyp aesq2n = (numtyp)0.0;
@@ -848,9 +966,9 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
       numtyp scale3 = (numtyp)1.0;
       numtyp scale5 = (numtyp)1.0;
       numtyp scale7 = (numtyp)1.0;
-      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
       if (damp != (numtyp)0.0) {
-        numtyp pgamma = MIN(ddi,damping[jtype].z); // dirdamp[jtype]
+        numtyp pgamma = MIN(ddi,coeff[jtype].z); // dirdamp[jtype]
         if (pgamma != (numtyp)0.0) {
           damp = pgamma * ucl_powr(r/damp,(numtyp)1.5);
           if (damp < (numtyp)50.0) {
@@ -860,7 +978,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
             scale7 = (numtyp)1.0 - expdamp*((numtyp)1.0+(numtyp)0.65*damp + (numtyp)0.15*damp*damp);
           }
         } else {
-          pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+          pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
           damp = pgamma * ucl_powr(r/damp,3.0);
           if (damp < (numtyp)50.0) {
             numtyp expdamp = ucl_exp(-damp);
@@ -911,7 +1029,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
 
 __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
                                  const __global numtyp *restrict extra,
-                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict coeff,
                                  const __global numtyp4 *restrict sp_polar,
                                  const __global int *dev_nbor,
                                  const __global int *dev_packed,
@@ -962,8 +1080,8 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
     itype  = polar3[i].z; // amtype[i];
     igroup = polar3[i].w; // amgroup[i];
     
-    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
 
     numtyp aesq2 = (numtyp)2.0 * aewald*aewald;
     numtyp aesq2n = (numtyp)0.0;
@@ -1025,9 +1143,9 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
       // if (poltyp != DIRECT) 
       numtyp scale3 = (numtyp)1.0;
       numtyp scale5 = (numtyp)1.0;
-      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
       if (damp != (numtyp)0.0) {
-        numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
         damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
         if (damp < (numtyp)50.0) {
           numtyp expdamp = ucl_exp(-damp);
@@ -1131,7 +1249,7 @@ __kernel void k_special15(__global int * dev_nbor,
 
 __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
                             const __global numtyp *restrict extra,
-                            const __global numtyp4 *restrict damping,
+                            const __global numtyp4 *restrict coeff,
                             const __global numtyp4 *restrict sp_polar,
                             const __global int *dev_nbor,
                             const __global int *dev_packed,
@@ -1233,8 +1351,8 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
     // debug:
     // xi__ = ix; xi__.w = itype;
 
-    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
+    numtyp pdi = coeff[itype].x;
+    numtyp pti = coeff[itype].y;
 
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
@@ -1344,9 +1462,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 
       // apply Thole polarization damping to scale factors
 
-      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
       if (damp != (numtyp)0.0) {
-        numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+        numtyp pgamma = MIN(pti,coeff[jtype].y); // thole[jtype]
         damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
         if (damp < (numtyp)50.0) {
           numtyp expdamp = ucl_exp(-damp);
@@ -1644,7 +1762,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
   // accumate force, energy and virial
   //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
 //     offset,eflag,vflag,ans,engv);
-  store_answers_p(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+  store_answers_acc(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
      offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
 }
 
diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h
index 3d7150c189..4d45ec6e5a 100644
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@@ -37,12 +37,13 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
     * - -3 if there is an out of memory error
     * - -4 if the GPU library was not compiled for GPU
     * - -5 Double precision is not supported on card **/
-  int init(const int ntypes, const int max_amtype, const double *host_pdamp,
-           const double *host_thole, const double *host_dirdamp, 
-           const double *host_special_mpole,
+  int init(const int ntypes, const int max_amtype, const int max_amclass,
+           const double *host_pdamp, const double *host_thole, const double *host_dirdamp,
+           const int *host_amtype2class, const double *host_special_mpole,
            const double *host_special_polar_wscale,
            const double *host_special_polar_piscale,
            const double *host_special_polar_pscale,
+           const double *host_csix, const double *host_adisp,
            const int nlocal, const int nall, const int max_nbors,
            const int maxspecial, const int maxspecial15, const double cell_size,
            const double gpu_split, FILE *_screen,
@@ -60,8 +61,11 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 
   // --------------------------- TYPE DATA --------------------------
 
-  /// pdamp = damping.x; thole = damping.y
-  UCL_D_Vec<numtyp4> damping;
+  /// pdamp = coeff_amtype.x; thole = coeff_amtype.y;
+  /// dirdamp = coeff_amtype.z; amtype2class = coeff_amtype.w
+  UCL_D_Vec<numtyp4> coeff_amtype;
+  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
+  UCL_D_Vec<numtyp4> coeff_amclass;
   /// Special polar values [0-4]: 
   ///   sp_polar.x = special_polar_wscale
   ///   sp_polar.y special_polar_pscale,
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 8493e9331d..804bf10f32 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -27,13 +27,14 @@ static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int amoeba_gpu_init(const int ntypes, const int max_amtype,
+int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                     const double *host_pdamp, const double *host_thole,
-                    const double *host_dirdamp,
+                    const double *host_dirdamp, const int *host_amtype2class,
                     const double *host_special_mpole,
                     const double *host_special_polar_wscale,
                     const double *host_special_polar_piscale,
                     const double *host_special_polar_pscale,
+                    const double *host_csix, const double *host_adisp,
                     const int nlocal, const int nall, const int max_nbors,
                     const int maxspecial, const int maxspecial15,
                     const double cell_size, int &gpu_mode, FILE *screen,
@@ -63,11 +64,13 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
 
   int init_ok=0;
   if (world_me==0)
-    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
-                          host_special_mpole, host_special_polar_wscale,
+    init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass,
+                          host_pdamp, host_thole, host_dirdamp,
+                          host_amtype2class, host_special_mpole, host_special_polar_wscale,
                           host_special_polar_piscale, host_special_polar_pscale,
-                          nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                          cell_size, gpu_split, screen, polar_dscale, polar_uscale);
+                          host_csix, host_adisp, nlocal, nall, max_nbors,
+                          maxspecial, maxspecial15, cell_size, gpu_split,
+                          screen, polar_dscale, polar_uscale);
 
   AMOEBAMF.device->world_barrier();
   if (message)
@@ -83,11 +86,12 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
       fflush(screen);
     }
     if (gpu_rank==i && world_me!=0)
-      init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
-                            host_special_mpole, host_special_polar_wscale,
+      init_ok=AMOEBAMF.init(ntypes, max_amtype, max_amclass, host_pdamp, host_thole, host_dirdamp,
+                            host_amtype2class, host_special_mpole, host_special_polar_wscale,
                             host_special_polar_piscale, host_special_polar_pscale,
-                            nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                            cell_size, gpu_split, screen, polar_dscale, polar_uscale);
+                            host_csix, host_adisp, nlocal, nall, max_nbors,
+                            maxspecial, maxspecial15, cell_size, gpu_split,
+                            screen, polar_dscale, polar_uscale);
 
     AMOEBAMF.device->gpu_barrier();
     if (message)
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index f932f05e25..25f4718163 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -50,13 +50,14 @@ enum{GORDON1,GORDON2};
 
 // External functions from cuda library for atom decomposition
 
-int amoeba_gpu_init(const int ntypes, const int max_amtype,
+int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass,
                     const double *host_pdamp, const double *host_thole,
-                    const double *host_dirdamp,
+                    const double *host_dirdamp, const int* host_amtype2class,
                     const double *host_special_mpole,
                     const double *host_special_polar_wscale,
                     const double *host_special_polar_piscale,
                     const double *host_special_polar_pscale,
+                    const double *host_csix, const double *host_adisp,
                     const int nlocal, const int nall, const int max_nbors,
                     const int maxspecial, const int maxspecial15,
                     const double cell_size, int &gpu_mode, FILE *screen,
@@ -168,9 +169,10 @@ void PairAmoebaGPU::init_style()
     
   int tq_size;
   int mnf = 5e-2 * neighbor->oneatom;
-  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
-                                special_mpole, special_polar_wscale, special_polar_piscale,
-                                special_polar_pscale, atom->nlocal,
+  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
+                                pdamp, thole, dirdamp, amtype2class, special_mpole,
+                                special_polar_wscale, special_polar_piscale,
+                                special_polar_pscale, csix, adisp, atom->nlocal,
                                 atom->nlocal+atom->nghost, mnf, maxspecial,
                                 maxspecial15, cell_size, gpu_mode, screen,
                                 polar_dscale, polar_uscale, tq_size);