Working on the multipole real-space term, not ready yet

2021-09-17 01:19:33 -05:00
parent 6293da7661
commit 003bebd31e
9 changed files with 729 additions and 161 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -45,7 +45,8 @@ int AmoebaT::bytes_per_atom(const int max_nbors) const {
 template <class numtyp, class acctyp>
 int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp,
-                  const double *host_thole, const double *host_dirdamp, 
+                  const double *host_thole, const double *host_dirdamp,
                  const double *host_special_mpole,
                  const double *host_special_polar_wscale,
                  const double *host_special_polar_piscale,
                  const double *host_special_polar_pscale,
@ -57,8 +58,9 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                            cell_size,gpu_split,_screen,amoeba,
-                            "k_amoeba_polar", "k_amoeba_udirect2b",
+                            "k_amoeba_multipole", "k_amoeba_udirect2b",
-                            "k_amoeba_umutual2b", "k_amoeba_short_nbor");
+                            "k_amoeba_umutual2b", "k_amoeba_polar",
                            "k_amoeba_short_nbor");
  if (success!=0)
    return success;
@ -91,7 +93,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
    dview[i].x=host_special_polar_wscale[i];
    dview[i].y=host_special_polar_piscale[i];
    dview[i].z=host_special_polar_pscale[i];
-    dview[i].w=(numtyp)0;
+    dview[i].w=host_special_mpole[i];
  }
  ucl_copy(sp_polar,dview,5,false);
@ -123,6 +125,47 @@ double AmoebaT::host_memory_usage() const {
  return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
 }
 // ---------------------------------------------------------------------------
 // Calculate the multipole real-space term, returning tep
 // ---------------------------------------------------------------------------
 // Launch the multipole real-space kernel (k_multipole) for the local atoms.
 //   eflag, vflag : energy / virial tally levels forwarded to the kernel
 //   returns      : the grid size GX used for the launch (0 when there are
 //                  no local atoms), which the caller passes to copy_answers()
 template <class numtyp, class acctyp>
 int AmoebaT::multipole_real(const int eflag, const int vflag) {
  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;
  int _nall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
  this->time_pair.start();
  // Build the short neighbor list for the multipole cutoff.
  // The kernel below does no distance test of its own, so the list has to be
  // trimmed with _off2_mpole (the cutoff compute_multipole_real() just set),
  // not with _off2_polar, which is not assigned on this code path.
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
                         &this->dev_short_nbor, &this->_off2_mpole, &ainum,
                         &nbor_pitch, &this->_threads_per_atom);
  // The list now reflects the multipole cutoff only; mark it unavailable so
  // that kernels relying on a different cutoff rebuild it before use.
  this->short_nbor_avail = false;
  this->k_multipole.set_size(GX,BX);
  this->k_multipole.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
                    &this->_threads_per_atom,  &_aewald, &this->_felec,
                    &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();
  return GX;
 }
 // ---------------------------------------------------------------------------
 // Calculate the real-space permanent field, returning field and fieldp
 // ---------------------------------------------------------------------------
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -44,6 +44,27 @@ _texture( q_tex,int2);
 // Declares the __local scratch array red_acc (6 rows of BLOCK_PAIR acctyp
 // values) that the store_answers_* reduction macros in this branch index
 // as red_acc[row][tid].
 #define local_allocate_store_ufld()                                         \
    __local acctyp red_acc[6][BLOCK_PAIR];
 // Sum the x/y/z components of the per-thread torque tq over the t_per_atom
 // threads that share an atom (tree reduction through red_acc rows 0-2), then
 // let the thread with offset==0 write the result to tep[i].
 // The store is done component by component so that it also works when tq
 // (accumulation precision) and tep (storage precision) are different
 // 4-component types; tq.w is copied through without being reduced.
 #define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
                                 tep)                                        \
  if (t_per_atom>1) {                                                       \
    red_acc[0][tid]=tq.x;                                                   \
    red_acc[1][tid]=tq.y;                                                   \
    red_acc[2][tid]=tq.z;                                                   \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      simdsync();                                                           \
      if (offset < s) {                                                     \
        for (int r=0; r<3; r++)                                             \
          red_acc[r][tid] += red_acc[r][tid+s];                             \
      }                                                                     \
    }                                                                       \
    tq.x=red_acc[0][tid];                                                   \
    tq.y=red_acc[1][tid];                                                   \
    tq.z=red_acc[2][tid];                                                   \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    tep[i].x=tq.x;                                                          \
    tep[i].y=tq.y;                                                          \
    tep[i].z=tq.z;                                                          \
    tep[i].w=tq.w;                                                          \
  }
 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
                          i, tep)                                           \
  if (t_per_atom>1) {                                                       \
@ -130,6 +151,19 @@ _texture( q_tex,int2);
 // Expands to nothing here: the store_answers_amoeba_tq variant that follows
 // reduces with shfl_down and does not reference a red_acc scratch array.
 #define local_allocate_store_ufld()
 // Sum the x/y/z components of the per-thread torque tq over the t_per_atom
 // threads that share an atom using shfl_down, then let the thread with
 // offset==0 write the result to tep[i].
 // The store is done component by component so that it also works when tq
 // (accumulation precision) and tep (storage precision) are different
 // 4-component types; tq.w is copied through without being reduced.
 #define store_answers_amoeba_tq(tq, ii, inum,tid, t_per_atom, offset, i,    \
                           tep)                                              \
  if (t_per_atom>1) {                                                       \
    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
      tq.x += shfl_down(tq.x, s, t_per_atom);                               \
      tq.y += shfl_down(tq.y, s, t_per_atom);                               \
      tq.z += shfl_down(tq.z, s, t_per_atom);                               \
    }                                                                       \
  }                                                                         \
  if (offset==0 && ii<inum) {                                               \
    tep[i].x=tq.x;                                                          \
    tep[i].y=tq.y;                                                          \
    tep[i].z=tq.z;                                                          \
    tep[i].w=tq.w;                                                          \
  }
 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
                          i, tep)                                           \
  if (t_per_atom>1) {                                                       \
@ -185,6 +219,315 @@ _texture( q_tex,int2);
 // Smaller of A and B; each argument may be evaluated twice.
 #define MIN(A,B) ((A) < (B) ? (A) : (B))
 // Square root of pi, cast to the accumulation type.
 #define MY_PIS (acctyp)1.77245385090551602729
 /* ----------------------------------------------------------------------
   multipole_real = real-space portion of multipole
   adapted from Tinker emreal1d() routine

   For each local atom i (t_per_atom threads cooperate on one atom) the
   kernel loops over the neighbors j of i and accumulates
     - the force on i              -> ans  (via store_answers_q)
     - energy / virial tallies     -> engv (via store_answers_q)
     - the torque on i             -> tep  (via store_answers_amoeba_tq)
   Per-atom multipoles are read from `extra`, viewed as three consecutive
   numtyp4 arrays of nall entries each:
     [0]      charge, dipole x/y/z
     [nall]   quadrupole xx, xy, xz, yy
     [2*nall] quadrupole yz, zz, then two further per-atom values that this
              kernel does not use
   sp_polar[k].w is the multipole scale factor for special-neighbor class k.
   No distance cutoff is applied here; the neighbor list passed in must
   already be restricted to the multipole cutoff.
   damping, off2, polar_dscale and polar_uscale are currently unused.
 ------------------------------------------------------------------------- */
 __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
                            const __global numtyp *restrict extra,
                            const __global numtyp4 *restrict damping,
                            const __global numtyp4 *restrict sp_polar,
                            const __global int *dev_nbor,
                            const __global int *dev_packed,
                            const __global int *dev_short_nbor,
                            __global acctyp4 *restrict ans,
                            __global acctyp *restrict engv,
                            __global numtyp4 *restrict tep,
                            const int eflag, const int vflag, const int inum,
                            const int nall, const int nbor_pitch, const int t_per_atom,
                            const numtyp aewald, const numtyp felec,
                            const numtyp off2, const numtyp polar_dscale,
                            const numtyp polar_uscale)
 {
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);
  int n_stride;
  // NOTE(review): local_allocate_store_charge() is defined outside this file;
  //   if it also declares red_acc in the same build branch as
  //   local_allocate_store_ufld(), the two declarations clash - confirm.
  local_allocate_store_ufld();
  local_allocate_store_charge();
  acctyp4 f;
  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
  acctyp energy, e_coul, virial[6];
  if (EVFLAG) {
    energy=(acctyp)0;
    e_coul=(acctyp)0;
    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
  }
  acctyp4 tq;
  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0;
  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
  // keep the const and address-space qualifiers of `extra` on the views
  const __global numtyp4* polar1 = (const __global numtyp4*)(&extra[0]);
  const __global numtyp4* polar2 = (const __global numtyp4*)(&extra[4*nall]);
  const __global numtyp4* polar3 = (const __global numtyp4*)(&extra[8*nall]);
  if (ii<inum) {
    int m;
    numtyp bfac;
    numtyp term1,term2,term3;
    numtyp term4,term5;
    numtyp term6;
    numtyp bn[6];
    numtyp ci;
    int numj, nbor, nbor_end;
    const __global int* nbor_mem=dev_packed;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
    // recalculate numj and nbor_end for use of the short nbor list
    if (dev_packed==dev_nbor) {
      numj = dev_short_nbor[nbor];
      nbor += n_stride;
      nbor_end = nbor+fast_mul(numj,n_stride);
      nbor_mem = dev_short_nbor;
    }
    ci  = polar1[i].x;    // rpole[i][0];
    dix = polar1[i].y;    // rpole[i][1];
    diy = polar1[i].z;    // rpole[i][2];
    diz = polar1[i].w;    // rpole[i][3];
    qixx = polar2[i].x;   // rpole[i][4];
    qixy = polar2[i].y;   // rpole[i][5];
    qixz = polar2[i].z;   // rpole[i][6];
    qiyy = polar2[i].w;   // rpole[i][8];
    qiyz   = polar3[i].x; // rpole[i][9];
    qizz   = polar3[i].y; // rpole[i][12];
    for ( ; nbor<nbor_end; nbor+=n_stride) {
      int jextra=nbor_mem[nbor];
      int j = jextra & NEIGHMASK15;
      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;
      //if (r2>off2) continue;
      numtyp r = ucl_sqrt(r2);
      numtyp ck = polar1[j].x;   // rpole[j][0];
      numtyp dkx = polar1[j].y;  // rpole[j][1];
      numtyp dky = polar1[j].z;  // rpole[j][2];
      numtyp dkz = polar1[j].w;  // rpole[j][3];
      numtyp qkxx = polar2[j].x; // rpole[j][4];
      numtyp qkxy = polar2[j].y; // rpole[j][5];
      numtyp qkxz = polar2[j].z; // rpole[j][6];
      numtyp qkyy = polar2[j].w; // rpole[j][8];
      numtyp qkyz = polar3[j].x; // rpole[j][9];
      numtyp qkzz = polar3[j].y; // rpole[j][12];
      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
      numtyp factor_mpole = sp_pol.w; // sp_mpole[sbmask15(jextra)];
      // intermediates involving moments and separation distance
      numtyp dir = dix*xr + diy*yr + diz*zr;
      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
      numtyp qir = qix*xr + qiy*yr + qiz*zr;
      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
      numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;
      // additional intermediates involving moments and distance
      numtyp dirx = diy*zr - diz*yr;
      numtyp diry = diz*xr - dix*zr;
      numtyp dirz = dix*yr - diy*xr;
      numtyp dkrx = dky*zr - dkz*yr;
      numtyp dkry = dkz*xr - dkx*zr;
      numtyp dkrz = dkx*yr - dky*xr;
      numtyp dikx = diy*dkz - diz*dky;
      numtyp diky = diz*dkx - dix*dkz;
      numtyp dikz = dix*dky - diy*dkx;
      numtyp qirx = qiz*yr - qiy*zr;
      numtyp qiry = qix*zr - qiz*xr;
      numtyp qirz = qiy*xr - qix*yr;
      numtyp qkrx = qkz*yr - qky*zr;
      numtyp qkry = qkx*zr - qkz*xr;
      numtyp qkrz = qky*xr - qkx*yr;
      numtyp qikx = qky*qiz - qkz*qiy;
      numtyp qiky = qkz*qix - qkx*qiz;
      numtyp qikz = qkx*qiy - qky*qix;
      numtyp qixk = qixx*qkx + qixy*qky + qixz*qkz;
      numtyp qiyk = qixy*qkx + qiyy*qky + qiyz*qkz;
      numtyp qizk = qixz*qkx + qiyz*qky + qizz*qkz;
      numtyp qkxi = qkxx*qix + qkxy*qiy + qkxz*qiz;
      numtyp qkyi = qkxy*qix + qkyy*qiy + qkyz*qiz;
      numtyp qkzi = qkxz*qix + qkyz*qiy + qkzz*qiz;
      numtyp qikrx = qizk*yr - qiyk*zr;
      numtyp qikry = qixk*zr - qizk*xr;
      numtyp qikrz = qiyk*xr - qixk*yr;
      numtyp qkirx = qkzi*yr - qkyi*zr;
      numtyp qkiry = qkxi*zr - qkzi*xr;
      numtyp qkirz = qkyi*xr - qkxi*yr;
      numtyp diqkx = dix*qkxx + diy*qkxy + diz*qkxz;
      numtyp diqky = dix*qkxy + diy*qkyy + diz*qkyz;
      numtyp diqkz = dix*qkxz + diy*qkyz + diz*qkzz;
      numtyp dkqix = dkx*qixx + dky*qixy + dkz*qixz;
      numtyp dkqiy = dkx*qixy + dky*qiyy + dkz*qiyz;
      numtyp dkqiz = dkx*qixz + dky*qiyz + dkz*qizz;
      numtyp diqkrx = diqkz*yr - diqky*zr;
      numtyp diqkry = diqkx*zr - diqkz*xr;
      numtyp diqkrz = diqky*xr - diqkx*yr;
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);
      // get reciprocal distance terms for this interaction
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
      numtyp rr1 = felec * rinv;
      numtyp rr3 = rr1 * r2inv;
      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
      numtyp rr11 = (numtyp)9.0 * rr9 * r2inv;
      // calculate the real space Ewald error function terms
      numtyp ralpha = aewald * r;
      numtyp exp2a = ucl_exp(-ralpha*ralpha);
      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
      //bn[0] = erfc(ralpha) / r;
      bn[0] = _erfc * rinv;
      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
      numtyp alsq2n = (numtyp)0.0;
      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
      for (m = 1; m < 6; m++) {
        bfac = (numtyp) (m+m-1);
        alsq2n = alsq2 * alsq2n;
        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
      }
      for (m = 0; m < 6; m++) bn[m] *= felec;
      // literals are cast to numtyp so single-precision builds do not
      // promote these expressions to double
      term1 = ci*ck;
      term2 = ck*dir - ci*dkr + dik;
      term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
      term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik;
      term5 = qir*qkr;
      // remove the (1 - factor_mpole) share of the bare Coulomb terms
      numtyp scalek = (numtyp)1.0 - factor_mpole;
      rr1 = bn[0] - scalek*rr1;
      rr3 = bn[1] - scalek*rr3;
      rr5 = bn[2] - scalek*rr5;
      rr7 = bn[3] - scalek*rr7;
      rr9 = bn[4] - scalek*rr9;
      rr11 = bn[5] - scalek*rr11;
      numtyp e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9;
      // find standard multipole intermediates for force and torque
      numtyp de = term1*rr3 + term2*rr5 + term3*rr7 + term4*rr9 + term5*rr11;
      term1 = -ck*rr3 + dkr*rr5 - qkr*rr7;
      term2 = ci*rr3 + dir*rr5 + qir*rr7;
      term3 = (numtyp)2.0 * rr5;
      term4 = (numtyp)2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
      term5 = (numtyp)2.0 * (-ci*rr5-dir*rr7-qir*rr9);
      term6 = (numtyp)4.0 * rr7;
      // tally the energy only when requested, mirroring the virial guard
      if (EVFLAG && eflag) energy += e;
      // compute the force components for this interaction
      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);
      // compute the torque components for this interaction
      numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
      numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
      numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);
      // increment force-based gradient and torque on first site
      f.x += frcx;
      f.y += frcy;
      f.z += frcz;
      tq.x += tix;
      tq.y += tiy;
      tq.z += tiz;
      if (EVFLAG && vflag) {
        numtyp vxx = -xr * frcx;
        numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy);
        numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz);
        numtyp vyy = -yr * frcy;
        numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz);
        numtyp vzz = -zr * frcz;
        virial[0] += vxx;
        virial[1] += vyy;
        virial[2] += vzz;
        virial[3] += vxy;
        virial[4] += vxz;
        virial[5] += vyz;
      }
    } // nbor
  } // ii<inum
  // reduce the torque over the threads of this atom and store it in tep
  store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
  // accumulate force, energy and virial
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv);
 }
 /* ----------------------------------------------------------------------
  udirect2b = Ewald real direct field via list
  udirect2b computes the real space contribution of the permanent
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -39,6 +39,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
    * - -5 Double precision is not supported on card **/
  int init(const int ntypes, const int max_amtype, const double *host_pdamp,
           const double *host_thole, const double *host_dirdamp, 
           const double *host_special_mpole,
           const double *host_special_polar_wscale,
           const double *host_special_polar_piscale,
           const double *host_special_polar_pscale,
@ -79,6 +80,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
 protected:
  bool _allocated;
  int multipole_real(const int eflag, const int vflag);
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -30,6 +30,7 @@ static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
 int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
@ -63,10 +64,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
  int init_ok=0;
  if (world_me==0)
    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
-                          host_special_polar_wscale, host_special_polar_piscale,
+                          host_special_mpole, host_special_polar_wscale,
-                          host_special_polar_pscale, nlocal, nall, max_nbors,
+                          host_special_polar_piscale, host_special_polar_pscale,
-                          maxspecial, maxspecial15, cell_size, gpu_split, screen,
+                          nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                          aewald, polar_dscale, polar_uscale);
+                          cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
  AMOEBAMF.device->world_barrier();
  if (message)
@ -83,10 +84,10 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
    }
    if (gpu_rank==i && world_me!=0)
      init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole, host_dirdamp,
-                            host_special_polar_wscale, host_special_polar_piscale,
+                            host_special_mpole, host_special_polar_wscale,
-                            host_special_polar_pscale, nlocal, nall, max_nbors,
+                            host_special_polar_piscale, host_special_polar_pscale,
-                            maxspecial, maxspecial15, cell_size, gpu_split, screen,
+                            nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                            aewald, polar_dscale, polar_uscale);
+                            cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
    AMOEBAMF.device->gpu_barrier();
    if (message)
@ -104,6 +105,23 @@ void amoeba_gpu_clear() {
  AMOEBAMF.clear();
 }
 // Entry point for the multipole real-space term: forwards every argument
 // unchanged to AMOEBAMF.compute_multipole_real() and returns its result.
 int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
                           double *sublo, double *subhi, tagint *tag, int **nspecial,
                           tagint **special, int *nspecial15, tagint** special15,
                           const bool eflag, const bool vflag, const bool eatom,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, const double felec, const double off2,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  return AMOEBAMF.compute_multipole_real(
      ago, inum_full, nall,
      host_x, host_type, host_amtype, host_amgroup, host_rpole,
      sublo, subhi, tag,
      nspecial, special, nspecial15, special15,
      eflag, vflag, eatom, vatom,
      host_start, ilist, jnum, cpu_time, success,
      felec, off2, host_q, boxlo, prd, tep_ptr);
 }
 int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -36,9 +36,10 @@ template <class numtyp, class acctyp>
 BaseAmoebaT::~BaseAmoeba() {
  delete ans;
  delete nbor;
-  k_polar.clear();
+  k_multipole.clear();
  k_udirect2b.clear();
  k_umutual2b.clear();
  k_polar.clear();
  k_special15.clear();
  k_short_nbor.clear();
  if (pair_program) delete pair_program;
@ -56,9 +57,10 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                             const int maxspecial15,
                             const double cell_size, const double gpu_split,
                             FILE *_screen, const void *pair_program,
-                             const char *k_name_polar,
+                             const char *k_name_multipole,
                             const char *k_name_udirect2b,
                             const char *k_name_umutual2b,
                             const char *k_name_polar,
                             const char *k_name_short_nbor) {
  screen=_screen;
@ -91,8 +93,8 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  _block_size=device->pair_block_size();
  _block_bio_size=device->block_bio_pair();
-  compile_kernels(*ucl_device,pair_program,k_name_polar,k_name_udirect2b,
+  compile_kernels(*ucl_device,pair_program,k_name_multipole,k_name_udirect2b,
-                  k_name_umutual2b,k_name_short_nbor);
+                  k_name_umutual2b,k_name_polar,k_name_short_nbor);
  if (_threads_per_atom>1 && gpu_nbor==0) {
    nbor->packing(true);
@ -425,6 +427,85 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
  return nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
 // Host-side driver for the multipole real-space term.
 //   1. runs precompute() with the host per-atom data and neighbor arguments
 //   2. grows the _tep buffer when inum_full exceeds its current capacity
 //   3. stores the cutoff and felec, then launches multipole_real()
 //   4. queues the force/energy answers and the device-to-host copy of _tep
 // On return *tep_ptr points at the host side of _tep; the return value is
 // whatever precompute() returned.
 template <class numtyp, class acctyp>
 int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, const int nall,
                           double **host_x, int *host_type, int *host_amtype,
                           int *host_amgroup, double **host_rpole,
                           double *sublo, double *subhi, tagint *tag,
                           int **nspecial, tagint **special,
                           int *nspecial15, tagint **special15,
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, const double felec, const double off2_mpole,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  acc_timers();
  // Tally level: 2 when per-atom output is requested, 1 for global only, 0 off
  int eflag = eatom ? 2 : (eflag_in ? 1 : 0);
  int vflag = vatom ? 2 : (vflag_in ? 1 : 0);
  #ifdef LAL_NO_BLOCK_REDUCE
  if (eflag) eflag=2;
  if (vflag) vflag=2;
  #endif
  set_kernel(eflag,vflag);
  // Reallocate per-atom arrays, transfer data from the host and build the
  //   neighbor lists if needed.
  // NOTE:
  //   precompute() is invoked again here for now so that individual kernels
  //     can be switched on/off independently.
  //   Once all the kernels are ready, precompute() is needed only once,
  //     in the first kernel of a time step.
  //   No induced dipoles are needed for this term, hence the two nullptr
  //     arguments.
  int** const firstneigh = precompute(ago, inum_full, nall, host_x, host_type,
                                      host_amtype, host_amgroup, host_rpole,
                                      nullptr, nullptr, sublo, subhi, tag,
                                      nspecial, special, nspecial15, special15,
                                      eflag_in, vflag_in, eatom, vatom,
                                      host_start, ilist, jnum, cpu_time,
                                      success, host_q, boxlo, prd);
  // ------------------- Resize _tep array ------------------------
  // four values per atom, with 10% headroom to limit reallocation
  if (_max_tep_size<inum_full) {
    _max_tep_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
    _tep.resize(_max_tep_size*4);
  }
  *tep_ptr=_tep.host.begin();
  // kernel inputs that change from call to call
  _off2_mpole = off2_mpole;
  _felec = felec;
  const int red_blocks=multipole_real(eflag,vflag);
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);
  hd_balancer.stop_timer();
  // copy tep from device to host
  _tep.update_host(_max_tep_size*4,false);
 /*
  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
  for (int i = 0; i < 10; i++) {
    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
  }
 */
  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute the direct real space part
 //    of the permanent field
@ -713,9 +794,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
-                                  const char *kname_polar,
+                                  const char *kname_multipole,
                                  const char *kname_udirect2b,
                                  const char *kname_umutual2b,
                                  const char *kname_polar,
                                  const char *kname_short_nbor) {
  if (_compiled)
    return;
@ -725,9 +807,10 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
-  k_polar.set_function(*pair_program,kname_polar);
+  k_multipole.set_function(*pair_program,kname_multipole);
  k_udirect2b.set_function(*pair_program,kname_udirect2b);
  k_umutual2b.set_function(*pair_program,kname_umutual2b);
  k_polar.set_function(*pair_program,kname_polar);
  k_short_nbor.set_function(*pair_program,kname_short_nbor);
  k_special15.set_function(*pair_program,"k_special15");
  pos_tex.get_texture(*pair_program,"pos_tex");
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -54,8 +54,9 @@ class BaseAmoeba {
  int init_atomic(const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15, const double cell_size,
                  const double gpu_split, FILE *screen, const void *pair_program,
-                  const char *kname_polar, const char *kname_udirect2b,
+                  const char *kname_multipole, const char *kname_udirect2b,
-                  const char *kname_umutual2b, const char *kname_short_nbor);
+                  const char *kname_umutual2b, const char *kname_polar,
                  const char *kname_short_nbor);
  /// Estimate the overhead for GPU context changes and CPU driver
  void estimate_gpu_overhead(const int add_kernels=0);
@ -141,6 +142,18 @@ class BaseAmoeba {
                int **&ilist, int **&numj, const double cpu_time, bool &success,
                double *charge, double *boxlo, double *prd);
  /// Compute the multipole real-space term with device neighboring
  /** Runs precompute() on the host data, launches multipole_real() and
    * queues the copy of the torque buffer back to the host.
    * \param felec       stored in _felec before the kernel launch
    * \param off2_mpole  stored in _off2_mpole before the kernel launch
    * \param tep_ptr     on return, *tep_ptr points at the host side of the
    *                    torque buffer (_tep)
    * \return the pointer returned by precompute() **/
  int** compute_multipole_real(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
                int *host_amgroup, double **host_rpole, double *sublo, double *subhi,
                tagint *tag, int **nspecial, tagint **special,
                int *nspecial15, tagint **special15,
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
                const double felec, const double off2_mpole, double *charge,
                double *boxlo, double *prd, void **tep_ptr);
  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
  int** compute_udirect2b(const int ago, const int inum_full, const int nall,
                double **host_x, int *host_type, int *host_amtype,
@ -241,7 +254,7 @@ class BaseAmoeba {
  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
-  UCL_Kernel k_polar, k_udirect2b, k_umutual2b, k_special15;
+  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_special15;
  UCL_Kernel k_short_nbor;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int eflag, const int vflag) {}
@ -262,9 +275,11 @@ class BaseAmoeba {
  numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar;
  void compile_kernels(UCL_Device &dev, const void *pair_string,
-     const char *kname_polar, const char *kname_udirect2b,
+     const char *kname_multipole, const char *kname_udirect2b,
-     const char *kname_umutual2b, const char *kname_short_nbor);
+     const char *kname_umutual2b, const char *kname_polar,
     const char *kname_short_nbor);
  virtual int multipole_real(const int eflag, const int vflag) = 0;
  virtual int udirect2b(const int eflag, const int vflag) = 0;
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int polar_real(const int eflag, const int vflag) = 0;
--- a/src/AMOEBA/pair_amoeba.h
+++ b/src/AMOEBA/pair_amoeba.h
@ -352,7 +352,7 @@ class PairAmoeba : public Pair {
  void dispersion_kspace();
  void multipole();
-  void multipole_real();
+  virtual void multipole_real();
  void multipole_kspace();
  void polar();
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -53,6 +53,7 @@ enum{GORDON1,GORDON2};
 int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const double *host_pdamp, const double *host_thole,
                    const double *host_dirdamp,
                    const double *host_special_mpole,
                    const double *host_special_polar_wscale,
                    const double *host_special_polar_piscale,
                    const double *host_special_polar_pscale,
@ -63,6 +64,15 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const double polar_uscale, int& tep_size);
 void amoeba_gpu_clear();
 // Forward declaration of the multipole real-space entry point; the body is
 // not part of this file. PairAmoebaGPU::multipole_real() calls it with
 // felec = electric / am_dielectric and the current off2, raises an error when
 // success comes back false, and afterwards reads *tep_ptr as a float or
 // double array (selected by tep_single) with 4 entries per local atom.
 int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
              double **host_rpole, double *sublo, double *subhi, tagint *tag,
              int **nspecial, tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
              bool &success, const double felec, const double off2, double *host_q,
              double *boxlo, double *prd, void **tep_ptr);
 int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
              double **host_rpole, double **host_uind, double **host_uinp, 
@ -90,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
-              bool &success, const double off2, const double felec, double *host_q,
+              bool &success, const double felec, const double off2, double *host_q,
              double *boxlo, double *prd, void **tep_ptr);
 double amoeba_gpu_bytes();
@ -106,6 +116,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  fieldp_pinned = nullptr;
  tep_pinned = nullptr;
  gpu_multipole_real_ready = false;
  gpu_udirect2b_ready = true;
  gpu_umutual2b_ready = true;
  gpu_polar_real_ready = true;
@ -122,139 +133,6 @@ PairAmoebaGPU::~PairAmoebaGPU()
  amoeba_gpu_clear();
 }
 /* ---------------------------------------------------------------------- */
 void PairAmoebaGPU::polar_real()
 {
  // run the host version from PairAmoeba while the device path is disabled
  if (!gpu_polar_real_ready) {
    PairAmoeba::polar_real();
    return;
  }
  // eflag and vflag are hard-wired to 1 for the library call
  const int eflag = 1;
  const int vflag = 1;
  const int nall = atom->nlocal + atom->nghost;
  const int inum = atom->nlocal;
  // sub-domain bounds: the bounding box of the lamda-space sub-domain for
  // triclinic boxes, a plain copy of domain->sublo/subhi otherwise
  double sublo[3], subhi[3];
  if (domain->triclinic != 0) {
    domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
  } else {
    for (int d = 0; d < 3; ++d) {
      sublo[d] = domain->sublo[d];
      subhi[d] = domain->subhi[d];
    }
  }
  // pick the cutoff of the polar term, long-range flavor when Ewald is on
  choose(use_ewald ? POLAR_LONG : POLAR);
  // energy unit conversion factor of the polar real-space term
  const double felec = 0.5 * electric / am_dielectric;
  // outputs of the library call
  int host_start;
  int *ilist, *numneigh;
  bool success = true;
  int **firstneigh =
    amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
                                  atom->type, amtype, amgroup,
                                  rpole, uind, uinp, sublo, subhi,
                                  atom->tag, atom->nspecial, atom->special,
                                  atom->nspecial15, atom->special15,
                                  eflag, vflag, eflag_atom, vflag_atom,
                                  host_start, &ilist, &numneigh, cpu_time,
                                  success, felec, off2, atom->q, domain->boxlo,
                                  domain->prd, &tep_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
  // tep_pinned now refers to the tep array of the GPU library; tep_single
  // tells whether its entries are float or double
  if (tep_single)
    compute_force_from_tep<float>(static_cast<float *>(tep_pinned));
  else
    compute_force_from_tep<double>(static_cast<double *>(tep_pinned));
 }
 /* ----------------------------------------------------------------------
   convert the per-atom torques stored in tep_ptr (4 entries per atom, the
   first 3 are read) into forces through torque2force() and accumulate the
   corresponding virial terms into virpolar
 ------------------------------------------------------------------------- */
 template <class numtyp>
 void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
 {
  int i,ix,iy,iz;
  double xix,yix,zix;
  double xiy,yiy,ziy;
  double xiz,yiz,ziz;
  double vxx,vyy,vzz;
  double vxy,vxz,vyz;
  double fix[3],fiy[3],fiz[3],tep[4];
  double** x = atom->x;
  int nlocal = atom->nlocal;
  for (i = 0; i < nlocal; i++) {
    // widen the torque of atom i to double; entry 4*i+3 is not used
    tep[0] = tep_ptr[4*i];
    tep[1] = tep_ptr[4*i+1];
    tep[2] = tep_ptr[4*i+2];
    // torque2force() receives the torque together with the fpolar array;
    // fix/fiy/fiz are consumed below, so it is expected to fill them
    // (NOTE(review): inferred from their use, confirm in torque2force)
    torque2force(i,tep,fix,fiy,fiz,fpolar);
    // displacements from atom i to the atoms indexed by zaxis2local,
    // xaxis2local and yaxis2local
    iz = zaxis2local[i];
    ix = xaxis2local[i];
    iy = yaxis2local[i];
    xiz = x[iz][0] - x[i][0];
    yiz = x[iz][1] - x[i][1];
    ziz = x[iz][2] - x[i][2];
    xix = x[ix][0] - x[i][0];
    yix = x[ix][1] - x[i][1];
    zix = x[ix][2] - x[i][2];
    xiy = x[iy][0] - x[i][0];
    yiy = x[iy][1] - x[i][1];
    ziy = x[iy][2] - x[i][2];
    // diagonal virial terms
    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
    // off-diagonal terms are the average of the two mixed products
    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
                xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
                xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
                yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
    // virpolar is filled in the order xx, yy, zz, xy, xz, yz
    virpolar[0] += vxx;
    virpolar[1] += vyy;
    virpolar[2] += vzz;
    virpolar[3] += vxy;
    virpolar[4] += vxz;
    virpolar[5] += vyz;
  }
 }
 /* ----------------------------------------------------------------------
   init specific to this pair style
 ------------------------------------------------------------------------- */
@ -292,7 +170,7 @@ void PairAmoebaGPU::init_style()
  int tep_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
-                                special_polar_wscale, special_polar_piscale,
+                                special_mpole, special_polar_wscale, special_polar_piscale,
                                special_polar_pscale, atom->nlocal,
                                atom->nlocal+atom->nghost, mnf, maxspecial,
                                maxspecial15, cell_size, gpu_mode, screen,
@ -308,6 +186,68 @@ void PairAmoebaGPU::init_style()
    tep_single = true;
 }
 /* ---------------------------------------------------------------------- */
 void PairAmoebaGPU::multipole_real()
 {
  // run the host version from PairAmoeba while the device path is disabled
  if (!gpu_multipole_real_ready) {
    PairAmoeba::multipole_real();
    return;
  }
  // eflag and vflag are hard-wired to 1 for the library call
  const int eflag = 1;
  const int vflag = 1;
  const int nall = atom->nlocal + atom->nghost;
  const int inum = atom->nlocal;
  // sub-domain bounds: the bounding box of the lamda-space sub-domain for
  // triclinic boxes, a plain copy of domain->sublo/subhi otherwise
  double sublo[3], subhi[3];
  if (domain->triclinic != 0) {
    domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
  } else {
    for (int d = 0; d < 3; ++d) {
      sublo[d] = domain->sublo[d];
      subhi[d] = domain->subhi[d];
    }
  }
  // pick the cutoff of the multipole term, long-range flavor when Ewald is on
  choose(use_ewald ? MPOLE_LONG : MPOLE);
  // energy unit conversion factor of the multipole real-space term
  const double felec = electric / am_dielectric;
  // outputs of the library call
  int host_start;
  int *ilist, *numneigh;
  bool success = true;
  int **firstneigh =
    amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x,
                                      atom->type, amtype, amgroup, rpole,
                                      sublo, subhi, atom->tag,
                                      atom->nspecial, atom->special,
                                      atom->nspecial15, atom->special15,
                                      eflag, vflag, eflag_atom, vflag_atom,
                                      host_start, &ilist, &numneigh, cpu_time,
                                      success, felec, off2, atom->q, domain->boxlo,
                                      domain->prd, &tep_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
  // tep_pinned now refers to the tep array of the GPU library; tep_single
  // tells whether its entries are float or double
  if (tep_single)
    compute_force_from_tep<float>(static_cast<float *>(tep_pinned), fmpole, virmpole);
  else
    compute_force_from_tep<double>(static_cast<double *>(tep_pinned), fmpole, virmpole);
 }
 /* ----------------------------------------------------------------------
   induce = induced dipole moments via pre-conditioned CG solver
   adapted from Tinker induce0a() routine
@ -1041,6 +981,128 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
 /* ---------------------------------------------------------------------- */
 void PairAmoebaGPU::polar_real()
 {
  // run the host version from PairAmoeba while the device path is disabled
  if (!gpu_polar_real_ready) {
    PairAmoeba::polar_real();
    return;
  }
  // eflag and vflag are hard-wired to 1 for the library call
  const int eflag = 1;
  const int vflag = 1;
  const int nall = atom->nlocal + atom->nghost;
  const int inum = atom->nlocal;
  // sub-domain bounds: the bounding box of the lamda-space sub-domain for
  // triclinic boxes, a plain copy of domain->sublo/subhi otherwise
  double sublo[3], subhi[3];
  if (domain->triclinic != 0) {
    domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi);
  } else {
    for (int d = 0; d < 3; ++d) {
      sublo[d] = domain->sublo[d];
      subhi[d] = domain->subhi[d];
    }
  }
  // pick the cutoff of the polar term, long-range flavor when Ewald is on
  choose(use_ewald ? POLAR_LONG : POLAR);
  // energy unit conversion factor of the polar real-space term
  const double felec = 0.5 * electric / am_dielectric;
  // outputs of the library call
  int host_start;
  int *ilist, *numneigh;
  bool success = true;
  int **firstneigh =
    amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
                                  atom->type, amtype, amgroup,
                                  rpole, uind, uinp, sublo, subhi,
                                  atom->tag, atom->nspecial, atom->special,
                                  atom->nspecial15, atom->special15,
                                  eflag, vflag, eflag_atom, vflag_atom,
                                  host_start, &ilist, &numneigh, cpu_time,
                                  success, felec, off2, atom->q, domain->boxlo,
                                  domain->prd, &tep_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
  // tep_pinned now refers to the tep array of the GPU library; tep_single
  // tells whether its entries are float or double
  if (tep_single)
    compute_force_from_tep<float>(static_cast<float *>(tep_pinned), fpolar, virpolar);
  else
    compute_force_from_tep<double>(static_cast<double *>(tep_pinned), fpolar, virpolar);
 }
 /* ----------------------------------------------------------------------
   hand the per-atom torques in tep_ptr (4 entries per atom, the first 3
   are read) to torque2force() together with force_comp, and accumulate
   the matching virial terms into virial_comp (xx, yy, zz, xy, xz, yz)
 ------------------------------------------------------------------------- */
 template <class numtyp>
 void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
                                            double** force_comp,
                                            double* virial_comp)
 {
  double** const x = atom->x;
  const int nlocal = atom->nlocal;
  double tep[4];
  double fix[3], fiy[3], fiz[3];
  for (int i = 0; i < nlocal; ++i) {
    // widen the torque of atom i to double; the 4th entry is skipped
    const numtyp* const t = tep_ptr + 4*i;
    tep[0] = t[0];
    tep[1] = t[1];
    tep[2] = t[2];
    // fix/fiy/fiz are consumed below, so torque2force() is expected to fill
    // them (NOTE(review): inferred from their use, confirm in torque2force)
    torque2force(i, tep, fix, fiy, fiz, force_comp);
    // displacements from atom i to the atoms indexed by zaxis2local,
    // xaxis2local and yaxis2local
    const int kz = zaxis2local[i];
    const int kx = xaxis2local[i];
    const int ky = yaxis2local[i];
    const double az[3] = { x[kz][0] - x[i][0],
                           x[kz][1] - x[i][1],
                           x[kz][2] - x[i][2] };
    const double ax[3] = { x[kx][0] - x[i][0],
                           x[kx][1] - x[i][1],
                           x[kx][2] - x[i][2] };
    const double ay[3] = { x[ky][0] - x[i][0],
                           x[ky][1] - x[i][1],
                           x[ky][2] - x[i][2] };
    // diagonal virial terms
    const double vxx = ax[0]*fix[0] + ay[0]*fiy[0] + az[0]*fiz[0];
    const double vyy = ax[1]*fix[1] + ay[1]*fiy[1] + az[1]*fiz[1];
    const double vzz = ax[2]*fix[2] + ay[2]*fiy[2] + az[2]*fiz[2];
    // off-diagonal terms are the average of the two mixed products
    const double vxy = 0.5 * (ax[1]*fix[0] + ay[1]*fiy[0] + az[1]*fiz[0] +
                              ax[0]*fix[1] + ay[0]*fiy[1] + az[0]*fiz[1]);
    const double vxz = 0.5 * (ax[2]*fix[0] + ay[2]*fiy[0] + az[2]*fiz[0] +
                              ax[0]*fix[2] + ay[0]*fiy[2] + az[0]*fiz[2]);
    const double vyz = 0.5 * (ax[2]*fix[1] + ay[2]*fiy[1] + az[2]*fiz[1] +
                              ax[1]*fix[2] + ay[1]*fiy[2] + az[1]*fiz[2]);
    virial_comp[0] += vxx;
    virial_comp[1] += vyy;
    virial_comp[2] += vzz;
    virial_comp[3] += vxy;
    virial_comp[4] += vxz;
    virial_comp[5] += vyz;
  }
 }
 /* ---------------------------------------------------------------------- */
 double PairAmoebaGPU::memory_usage()
 {
  double bytes = Pair::memory_usage();
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -35,9 +35,10 @@ class PairAmoebaGPU : public PairAmoeba {
  virtual void induce();
-  virtual void polar_real();
+  virtual void multipole_real();
  virtual void udirect2b(double **, double **);
  virtual void umutual2b(double **, double **);
  virtual void polar_real();
 private:
  int gpu_mode;
@ -46,6 +47,7 @@ class PairAmoebaGPU : public PairAmoeba {
  void *fieldp_pinned;
  bool tep_single;
  bool gpu_multipole_real_ready;
  bool gpu_udirect2b_ready;
  bool gpu_umutual2b_ready;
  bool gpu_polar_real_ready;
@ -53,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba {
  void udirect2b_cpu();
  template<class numtyp>
-  void compute_force_from_tep(const numtyp*);
+  void compute_force_from_tep(const numtyp*, double**, double*);
 };
 }    // namespace LAMMPS_NS