Removed trailing spaces

2021-09-28 17:28:33 -05:00
parent bf88ab77fa
commit b874feb127
13 changed files with 283 additions and 281 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -140,7 +140,7 @@ void AmoebaT::clear() {
  coeff_amclass.clear();
  sp_polar.clear();
  sp_nonpolar.clear();
-  
+
  this->clear_atomic();
 }

@ -169,7 +169,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {

  // Build the short neighbor list for the cutoff off2_mpole,
  //   at this point mpole is the first kernel in a time step
-  
+
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
@ -194,7 +194,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int AmoebaT::udirect2b(const int eflag, const int vflag) {
-  int ainum=this->ans->inum(); 
+  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

@ -216,7 +216,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }
-  
+
  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra, &coeff_amtype, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -492,7 +492,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -500,7 +500,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);
      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
@ -533,12 +533,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
-      
+
      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
-      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance
@ -585,11 +585,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
-      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - 
+      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
-      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - 
+      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
-      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - 
+      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);

      // get reciprocal distance terms for this interaction
@ -650,20 +650,20 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,

      // compute the force components for this interaction

-      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + 
+      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
-      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + 
+      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
-      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + 
+      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);

      // compute the torque components for this interaction

-      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
+      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
-      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - 
+      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
-      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
+      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);

      // increment force-based gradient and torque on first site
@ -691,12 +691,12 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
        virial[5] += vyz;
      }
    } // nbor
-    
+
  } // ii<inum

  // accumulate tq
  store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
-  
+
  // accumate force, energy and virial: use _acc if not the first kernel
  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     offset,eflag,vflag,ans,engv);
@ -771,7 +771,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
    numtyp qizz = pol3i.y;   // rpole[i][12];
    int itype  = pol3i.z;    // amtype[i];
    int igroup = pol3i.w;    // amgroup[i];
-    
+
    // debug:
    // xi__ = ix; xi__.w = itype;

@ -790,7 +790,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -798,7 +798,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-      
+
      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
@ -898,7 +898,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
      fid[0] = -xr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkx + (numtyp)2.0*bcn[1]*qkx;
      fid[1] = -yr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dky + (numtyp)2.0*bcn[1]*qky;
      fid[2] = -zr*(bcn[0]*ck-bcn[1]*dkr+bcn[2]*qkr) - bcn[0]*dkz + (numtyp)2.0*bcn[1]*qkz;
-        
+
      scalek = factor_pscale;
      bcn[0] = bn[1] - ((numtyp)1.0-scalek*scale3)*rr3;
      bcn[1] = bn[2] - ((numtyp)1.0-scalek*scale5)*rr5;
@ -918,7 +918,7 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
  } // ii<inum

  // accumulate field and fieldp
-  
+
  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

@ -977,10 +977,10 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];
-    
+
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];
-    
+
    numtyp pdi = coeff[itype].x;
    numtyp pti = coeff[itype].y;

@ -995,7 +995,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1003,7 +1003,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
@ -1044,7 +1044,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      }

      // find terms needed later to compute mutual polarization
-      // if (poltyp != DIRECT) 
+      // if (poltyp != DIRECT)
      numtyp scale3 = (numtyp)1.0;
      numtyp scale5 = (numtyp)1.0;
      numtyp damp = pdi * coeff[jtype].x; // pdamp[jtype]
@ -1056,7 +1056,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
          scale3 = (numtyp)1.0 - expdamp;
          scale5 = (numtyp)1.0 - expdamp*((numtyp)1.0+damp);
        }
-        
+
      } else { // damp == 0: ???
      }

@ -1071,17 +1071,17 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
      tdipdip[3] = -bcn[0] + bcn[1]*yr*yr;
      tdipdip[4] = bcn[1]*yr*zr;
      tdipdip[5] = -bcn[0] + bcn[1]*zr*zr;
-      //if (i==0 && j == 10) 
+      //if (i==0 && j == 10)
      //  printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n",
      //    i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]);
      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
-      
+
      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
-      
+
      _fieldp[0] += fid[0];
      _fieldp[1] += fid[1];
      _fieldp[2] += fid[2];
@ -1093,7 +1093,7 @@ __kernel void k_amoeba_umutual2b(const __global numtyp4 *restrict x_,
  } // ii<inum

  // accumulate field and fieldp
-  
+
  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

@ -1221,7 +1221,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1229,9 +1229,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);
-      
+
      const numtyp4 pol1j = polar1[j];
      numtyp ck = pol1j.x;   // rpole[j][0];
      numtyp dkx = pol1j.y;  // rpole[j][1];
@ -1383,7 +1383,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp tiy3 = psr3*uky + dsr3*ukyp;
      numtyp tiz3 = psr3*ukz + dsr3*ukzp;
      numtyp tuir = -psr5*ukr - dsr5*ukrp;
-      
+
      ufld[0] += tix3 + xr*tuir;
      ufld[1] += tiy3 + yr*tuir;
      ufld[2] += tiz3 + zr*tuir;
@ -1394,14 +1394,14 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
      numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
      tuir = -psr7*ukr - dsr7*ukrp;
-      
+
      dufld[0] += xr*tix5 + xr*xr*tuir;
      dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
      dufld[2] += yr*tiy5 + yr*yr*tuir;
      dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
      dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
      dufld[5] += zr*tiz5 + zr*zr*tuir;
-      
+
      // get the dEd/dR terms used for direct polarization force

      term1 = bn[2] - dsc3*rr5;
@ -1473,7 +1473,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      numtyp frcz = depz;

      // get the dEp/dR terms used for direct polarization force
-      
+
      // tixx and tkxx
      term1 = bn[2] - psc3*rr5;
      term2 = bn[3] - psc5*rr7;
@ -1550,7 +1550,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,

      // get the dtau/dr terms used for mutual polarization force
      // poltyp == MUTUAL  && amoeba
-          
+
      term1 = bn[2] - usc3*rr5;
      term2 = bn[3] - usc5*rr7;
      term3 = usr5 + term1;
@ -1617,7 +1617,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
        virial[5] += vyz;
      }
    } // nbor
-    
+
  } // ii<inum

  // accumulate ufld and dufld to compute tep
@ -1648,7 +1648,7 @@ __kernel void k_special15(__global int * dev_nbor,
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
-  
+
    int numj, nbor, nbor_end;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -70,13 +70,13 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
-  /// Special polar values [0-4]: 
+  /// Special polar values [0-4]:
  ///   sp_polar.x = special_polar_wscale
  ///   sp_polar.y special_polar_pscale,
  ///   sp_polar.z = special_polar_piscale
  ///   sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
-  /// Special nonpolar values [0-4]: 
+  /// Special nonpolar values [0-4]:
  ///   sp_nonpolar.x = special_hal
  ///   sp_nonpolar.y special_repel
  ///   sp_nonpolar.z = special_disp
@ -97,7 +97,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
-  
+
 };

 }
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -106,7 +106,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              _threads_per_atom);
  if (success!=0)
    return success;
-                              
+
  // Initialize host-device load balancer
  hd_balancer.init(device,gpu_nbor,gpu_split);

@ -121,7 +121,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
  _maxspecial=maxspecial;
  _maxspecial15=maxspecial15;

-  // allocate per-atom array tep 
+  // allocate per-atom array tep

  int ef_nall=nlocal; //nall;
  if (ef_nall==0)
@ -250,7 +250,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
                          const bool eflag_in, const bool vflag_in,
                          const bool eatom, const bool vatom,
                          int &host_start, const double cpu_time,
-                          bool &success, const double aewald, const double felec, 
+                          bool &success, const double aewald, const double felec,
                          const double off2_polar, double *host_q, const int nlocal,
                          double *boxlo, double *prd, void **tep_ptr) {
  acc_timers();
@ -280,7 +280,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
    dev_special15_t.clear();
    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
-    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);   
+    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
  }

  *tep_ptr=_tep.host.begin();
@ -320,7 +320,7 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
  _off2_polar = off2_polar;
  _felec = felec;
  const int red_blocks=polar_real(eflag,vflag);
-  
+
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
  device->add_ans_object(ans);
  hd_balancer.stop_timer();
@ -375,7 +375,7 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
    dev_special15_t.clear();
    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
-    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);   
+    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
  }

  if (inum_full==0) {
@ -462,7 +462,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,

  // reallocate per-atom arrays, transfer data from the host
  //   and build the neighbor lists if needed
-  // NOTE: 
+  // NOTE:
  //   For now we invoke precompute() again here,
  //     to be able to turn on/off the udirect2b kernel (which comes before this)
  //   Once all the kernels are ready, precompute() is needed only once
@ -509,7 +509,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full,
    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
  }
-*/  
+*/
  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }

@ -560,7 +560,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full,
                          eflag_in, vflag_in, eatom, vatom,
                          host_start, ilist, jnum, cpu_time,
                          success, host_q, boxlo, prd);
-                         
+
  // ------------------- Resize _fieldp array ------------------------

  if (inum_full>_max_fieldp_size) {
@ -698,7 +698,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full,

  // reallocate per-atom arrays, transfer data from the host
  //   and build the neighbor lists if needed
-  // NOTE: 
+  // NOTE:
  //   For now we invoke precompute() again here,
  //     to be able to turn on/off the udirect2b kernel (which comes before this)
  //   Once all the kernels are ready, precompute() is needed only once
@ -745,7 +745,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full,
    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
  }
-*/  
+*/
  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }

@ -809,7 +809,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
      int idx = n+i*nstride;
      pextra[idx]   = uinp[i][0];
      pextra[idx+1] = uinp[i][1];
-      pextra[idx+2] = uinp[i][2];    
+      pextra[idx+2] = uinp[i][2];
    }
  }

@ -818,7 +818,7 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,

    for (int i = 0; i < _nall; i++) {
      int idx = n+i*nstride;
-      pextra[idx]   = pval[i]; 
+      pextra[idx]   = pval[i];
    }
  }
 }
@ -846,7 +846,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
  k_special15.set_function(*pair_program,"k_special15");
  pos_tex.get_texture(*pair_program,"pos_tex");
  q_tex.get_texture(*pair_program,"q_tex");
-  
+
  _compiled=true;

  #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
@ -874,13 +874,13 @@ int BaseAmoebaT::add_onefive_neighbors() {
  int _nall=atom->nall();
  int ainum=ans->inum();
  int nbor_pitch=nbor->nbor_pitch();
-  
+
  k_special15.set_size(GX,BX);
  k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
                    &atom->dev_tag, &dev_nspecial15, &dev_special15,
                    &ainum, &_nall, &nbor_pitch,
                    &_threads_per_atom);
-  
+
  return GX;
 }

--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -287,7 +287,7 @@ class BaseAmoeba {
  virtual int udirect2b(const int eflag, const int vflag) = 0;
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int polar_real(const int eflag, const int vflag) = 0;
-  
+
 };

 }
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@ -145,7 +145,7 @@ void HippoT::clear() {
  coeff_amclass.clear();
  sp_polar.clear();
  sp_nonpolar.clear();
-  
+
  this->clear_atomic();
 }

@ -199,7 +199,7 @@ int** HippoT::precompute(const int ago, const int inum_full, const int nall,
    this->dev_special15_t.clear();
    this->dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
    this->dev_special15.alloc(this->_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
-    this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);   
+    this->dev_special15_t.alloc(nall*this->_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
  }

  if (inum_full==0) {
@ -286,7 +286,7 @@ int** HippoT::compute_dispersion_real(const int ago, const int inum_full,

  // reallocate per-atom arrays, transfer data from the host
  //   and build the neighbor lists if needed
-  // NOTE: 
+  // NOTE:
  //   For now we invoke precompute() again here,
  //     to be able to turn on/off the udirect2b kernel (which comes before this)
  //   Once all the kernels are ready, precompute() is needed only once
@ -339,7 +339,7 @@ int HippoT::dispersion_real(const int eflag, const int vflag) {

  // Build the short neighbor list for the cutoff off2_disp,
  //   at this point mpole is the first kernel in a time step
-  
+
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
@ -397,7 +397,7 @@ int** HippoT::compute_multipole_real(const int ago, const int inum_full,

  // reallocate per-atom arrays, transfer data from the host
  //   and build the neighbor lists if needed
-  // NOTE: 
+  // NOTE:
  //   For now we invoke precompute() again here,
  //     to be able to turn on/off the udirect2b kernel (which comes before this)
  //   Once all the kernels are ready, precompute() is needed only once
@ -468,7 +468,7 @@ int HippoT::multipole_real(const int eflag, const int vflag) {

  // Build the short neighbor list for the cutoff off2_mpole,
  //   at this point mpole is the first kernel in a time step
-  
+
  this->k_short_nbor.set_size(GX,BX);
  this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor,
                         &this->_nbor_data->begin(),
@ -537,7 +537,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full,
                          eflag_in, vflag_in, eatom, vatom,
                          host_start, ilist, jnum, cpu_time,
                          success, host_q, boxlo, prd);
-                         
+
  // ------------------- Resize _fieldp array ------------------------

  if (inum_full>this->_max_fieldp_size) {
@ -569,7 +569,7 @@ int** HippoT::compute_udirect2b(const int ago, const int inum_full,
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
 int HippoT::udirect2b(const int eflag, const int vflag) {
-  int ainum=this->ans->inum(); 
+  int ainum=this->ans->inum();
  if (ainum == 0)
    return 0;

@ -591,7 +591,7 @@ int HippoT::udirect2b(const int eflag, const int vflag) {
                           &nbor_pitch, &this->_threads_per_atom);
    this->short_nbor_polar_avail = true;
  }
-  
+
  this->k_udirect2b.set_size(GX,BX);
  this->k_udirect2b.run(&this->atom->x, &this->atom->extra,
                        &coeff_amtype, &coeff_amclass, &sp_polar,
@ -756,7 +756,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full,

  // reallocate per-atom arrays, transfer data from the host
  //   and build the neighbor lists if needed
-  // NOTE: 
+  // NOTE:
  //   For now we invoke precompute() again here,
  //     to be able to turn on/off the udirect2b kernel (which comes before this)
  //   Once all the kernels are ready, precompute() is needed only once
@ -803,7 +803,7 @@ int** HippoT::compute_polar_real(const int ago, const int inum_full,
    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
  }
-*/  
+*/
  return firstneigh; // nbor->host_jlist.begin()-host_start;
 }

--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@ -491,7 +491,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = ix.x - jx.x;
      numtyp yr = ix.y - jx.y;
@ -499,7 +499,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      if (r2>off2) continue;
-  
+
      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
      numtyp dkx = pol1j.y;  // rpole[j][1];
@ -514,7 +514,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      numtyp qkyz = pol3j.x; // rpole[j][9];
      numtyp qkzz = pol3j.y; // rpole[j][12];
      int jtype = pol3j.z; // amtype[j];
-      
+
      numtyp sizk = coeff[jtype].x; // sizpr[jtype];
      numtyp dmpk = coeff[jtype].y; // dmppr[jtype];
      numtyp valk = coeff[jtype].z; // elepr[jtype];
@ -534,12 +534,12 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
-      
+
      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
-      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance
@ -586,11 +586,11 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
-      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - 
+      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
-      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - 
+      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
-      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - 
+      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);

      // get reciprocal distance terms for this interaction
@ -616,7 +616,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
      numtyp term3 = vali*qkr + valk*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
      numtyp term4 = dir*qkr - dkr*qir - 4.0*qik;
      numtyp term5 = qir*qkr;
-      numtyp eterm = term1*dmpik[0] + term2*dmpik[2] + 
+      numtyp eterm = term1*dmpik[0] + term2*dmpik[2] +
        term3*dmpik[4] + term4*dmpik[6] + term5*dmpik[8];

      // compute the Pauli repulsion energy for this interaction
@ -626,7 +626,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,

      // calculate intermediate terms for force and torque

-      numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] + 
+      numtyp de = term1*dmpik[2] + term2*dmpik[4] + term3*dmpik[6] +
        term4*dmpik[8] + term5*dmpik[10];
      term1 = -valk*dmpik[2] + dkr*dmpik[4] - qkr*dmpik[6];
      term2 = vali*dmpik[2] + dir*dmpik[4] + qir*dmpik[6];
@ -637,23 +637,23 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,

      // compute the force components for this interaction

-      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + 
+      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
-      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + 
+      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
-      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + 
+      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);
      frcx = frcx*rr1 + eterm*rr3*xr;
      frcy = frcy*rr1 + eterm*rr3*yr;
      frcz = frcz*rr1 + eterm*rr3*zr;

      // compute the torque components for this interaction
-      
-      numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
+
+      numtyp ttmix = -dmpik[2]*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
-      numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) - 
+      numtyp ttmiy = -dmpik[2]*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
-      numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
+      numtyp ttmiz = -dmpik[2]*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);
      ttmix = sizik * ttmix * rr1;
      ttmiy = sizik * ttmiy * rr1;
@ -706,7 +706,7 @@ __kernel void k_hippo_repulsion(const __global numtyp4 *restrict x_,
        virial[5] += vyz;
      }
    } // nbor
-    
+
  } // ii<inum

  // accumulate tq
@ -786,7 +786,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = ix.x - jx.x;
      numtyp yr = ix.y - jx.y;
@ -794,7 +794,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      int jtype =   polar3[j].z; // amtype[j];
      int jclass = coeff_amtype[jtype].w;  // amtype2class[jtype];
      numtyp ck = coeff_amclass[jclass].x;    // csix[jclass];
@ -816,7 +816,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
      numtyp dk = ak * r;
      numtyp expi = ucl_exp(-di);
      numtyp expk = ucl_exp(-dk);
-     
+
      numtyp ai2,ak2;
      numtyp di4,di5;
      numtyp dk2,dk3;
@ -844,7 +844,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
          - tk2*((numtyp)1.0 + dk + (numtyp)0.5*dk2 + dk3/(numtyp)6.0) * expk
          - (numtyp)2.0*ti2*tk*((numtyp)1.0 + di + di2/(numtyp)3.0) * expi
          - (numtyp)2.0*tk2*ti*((numtyp)1.0 + dk + dk2/(numtyp)3.0) * expk;
-        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) + 
+        ddamp = (numtyp)0.25 * di2 * ti2 * ai * expi * (r*ai+(numtyp)4.0*tk - (numtyp)1.0) +
          (numtyp)0.25 * dk2 * tk2 * ak * expk * (r*ak+(numtyp)4.0*ti-(numtyp)1.0);

      } else {
@ -856,7 +856,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
      }

      numtyp damp = (numtyp)1.5*damp5 - (numtyp)0.5*damp3;
-      
+
      // apply damping and scaling factors for this interaction

      numtyp scale = factor_disp * damp*damp;
@ -892,7 +892,7 @@ __kernel void k_hippo_dispersion(const __global numtyp4 *restrict x_,
      virial[4] += vzx;
      virial[5] += vzy;
    } // nbor
-    
+
  } // ii<inum

  // accumate force, energy and virial
@ -997,7 +997,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1005,7 +1005,7 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);
      const numtyp4 pol1j = polar1[j];
      numtyp ck  = pol1j.x;  // rpole[j][0];
@ -1043,12 +1043,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
-      
+
      numtyp dik = dix*dkx + diy*dky + diz*dkz;
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
-      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) +
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance
@ -1095,11 +1095,11 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp dkqirx = dkqiz*yr - dkqiy*zr;
      numtyp dkqiry = dkqix*zr - dkqiz*xr;
      numtyp dkqirz = dkqiy*xr - dkqix*yr;
-      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy - 
+      numtyp dqikx = diy*qkz - diz*qky + dky*qiz - dkz*qiy -
        (numtyp)2.0*(qixy*qkxz+qiyy*qkyz+qiyz*qkzz - qixz*qkxy-qiyz*qkyy-qizz*qkyz);
-      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz - 
+      numtyp dqiky = diz*qkx - dix*qkz + dkz*qix - dkx*qiz -
        (numtyp)2.0*(qixz*qkxx+qiyz*qkxy+qizz*qkxz - qixx*qkxz-qixy*qkyz-qixz*qkzz);
-      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix - 
+      numtyp dqikz = dix*qky - diy*qkx + dkx*qiy - dky*qix -
        (numtyp)2.0*(qixx*qkxy+qixy*qkyy+qixz*qkyz - qixy*qkxx-qiyy*qkxy-qiyz*qkxz);

      // get reciprocal distance terms for this interaction
@ -1164,16 +1164,16 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
      numtyp rr11ik = bn[5] - ((numtyp)1.0-scalek*dmpij[10])*rr11;
      rr1 = bn[0] - ((numtyp)1.0-scalek)*rr1;
      rr3 = bn[1] - ((numtyp)1.0-scalek)*rr3;
-      numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik + 
-        term1i*rr1i + term1k*rr1k + term1ik*rr1ik + 
-        term2i*rr3i + term2k*rr3k + term2ik*rr3ik + 
+      numtyp e = term1*rr1 + term4ik*rr7ik + term5ik*rr9ik +
+        term1i*rr1i + term1k*rr1k + term1ik*rr1ik +
+        term2i*rr3i + term2k*rr3k + term2ik*rr3ik +
        term3i*rr5i + term3k*rr5k + term3ik*rr5ik;

      // find damped multipole intermediates for force and torque

-      numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik + 
-        term1i*rr3i + term1k*rr3k + term1ik*rr3ik + 
-        term2i*rr5i + term2k*rr5k + term2ik*rr5ik + 
+      numtyp de = term1*rr3 + term4ik*rr9ik + term5ik*rr11ik +
+        term1i*rr3i + term1k*rr3k + term1ik*rr3ik +
+        term2i*rr5i + term2k*rr5k + term2ik*rr5ik +
        term3i*rr7i + term3k*rr7k + term3ik*rr7ik;
      term1 = -corek*rr3i - valk*rr3ik + dkr*rr5ik - qkr*rr7ik;
      term2 = corei*rr3k + vali*rr3ik + dir*rr5ik + qir*rr7ik;
@ -1187,20 +1187,20 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,

      // compute the force components for this interaction

-      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) + 
+      numtyp frcx = de*xr + term1*dix + term2*dkx + term3*(diqkx-dkqix) +
        term4*qix + term5*qkx + term6*(qixk+qkxi);
-      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) + 
+      numtyp frcy = de*yr + term1*diy + term2*dky + term3*(diqky-dkqiy) +
        term4*qiy + term5*qky + term6*(qiyk+qkyi);
-      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) + 
+      numtyp frcz = de*zr + term1*diz + term2*dkz + term3*(diqkz-dkqiz) +
        term4*qiz + term5*qkz + term6*(qizk+qkzi);

      // compute the torque components for this interaction

-      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
+      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) -
        term4*qirx - term6*(qikrx+qikx);
-      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - 
+      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) -
        term4*qiry - term6*(qikry+qiky);
-      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
+      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) -
        term4*qirz - term6*(qikrz+qikz);

      // increment force-based gradient and torque on first site
@ -1228,12 +1228,12 @@ __kernel void k_hippo_multipole(const __global numtyp4 *restrict x_,
        virial[5] += vyz;
      }
    } // nbor
-    
+
  } // ii<inum

  // accumulate tq
  store_answers_hippo_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
-  
+
  // accumate force, energy and virial: use _acc if not the first kernel
  //store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
     //offset,eflag,vflag,ans,engv);
@ -1327,7 +1327,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1335,7 +1335,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-      
+
      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
@ -1408,7 +1408,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
      // find the field components for charge penetration damping
      numtyp dmpi[7],dmpk[7];
      dampdir(r,alphai,alphak,dmpi,dmpk);
-          
+
      numtyp scalek = factor_dscale;
      numtyp rr3i = bn[1] - ((numtyp)1.0-scalek*dmpi[2])*rr3;
      numtyp rr5i = bn[2] - ((numtyp)1.0-scalek*dmpi[4])*rr5;
@ -1439,7 +1439,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
        rr3k*dky + (numtyp)2.0*rr5k*qky;
      fip[2] = -zr*(rr3*corek + rr3k*valk - rr5k*dkr + rr7k*qkr) -
        rr3k*dkz + (numtyp)2.0*rr5k*qkz;
-          
+
      // find terms needed later to compute mutual polarization

      _fieldp[0] += fid[0];
@ -1453,7 +1453,7 @@ __kernel void k_hippo_udirect2b(const __global numtyp4 *restrict x_,
  } // ii<inum

  // accumulate field and fieldp
-  
+
  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

@ -1514,7 +1514,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
    int itype,igroup;
    numtyp bn[4],bcn[3];
    numtyp fid[3],fip[3];
-    
+
    itype  = polar3[i].z; // amtype[i];
    igroup = polar3[i].w; // amgroup[i];

@ -1534,7 +1534,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1542,7 +1542,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);
      numtyp rinv = ucl_recip(r);
      numtyp r2inv = rinv*rinv;
@ -1595,7 +1595,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
      }

      // find terms needed later to compute mutual polarization
-      // if (poltyp != DIRECT) 
+      // if (poltyp != DIRECT)
      numtyp dmpik[5];
      dampmut(r,alphai,alphak,dmpik);
      numtyp scalek = factor_wscale;
@ -1610,17 +1610,17 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
      tdipdip[3] = -rr3ik + rr5ik*yr*yr;
      tdipdip[4] = rr5ik*yr*zr;
      tdipdip[5] = -rr3ik + rr5ik*zr*zr;
-      //if (i==0 && j == 10) 
+      //if (i==0 && j == 10)
      //  printf("i = %d: j = %d: tdipdip %f %f %f %f %f %f\n",
      //    i, j,tdipdip[0],tdipdip[1],tdipdip[2],tdipdip[3],tdipdip[4],tdipdip[5]);
      fid[0] = tdipdip[0]*ukx + tdipdip[1]*uky + tdipdip[2]*ukz;
      fid[1] = tdipdip[1]*ukx + tdipdip[3]*uky + tdipdip[4]*ukz;
      fid[2] = tdipdip[2]*ukx + tdipdip[4]*uky + tdipdip[5]*ukz;
-      
+
      fip[0] = tdipdip[0]*ukxp + tdipdip[1]*ukyp + tdipdip[2]*ukzp;
      fip[1] = tdipdip[1]*ukxp + tdipdip[3]*ukyp + tdipdip[4]*ukzp;
      fip[2] = tdipdip[2]*ukxp + tdipdip[4]*ukyp + tdipdip[5]*ukzp;
-      
+
      _fieldp[0] += fid[0];
      _fieldp[1] += fid[1];
      _fieldp[2] += fid[2];
@ -1632,7 +1632,7 @@ __kernel void k_hippo_umutual2b(const __global numtyp4 *restrict x_,
  } // ii<inum

  // accumulate field and fieldp
-  
+
  store_answers_fieldp(_fieldp,ii,inum,tid,t_per_atom,offset,i,fieldp);
 }

@ -1754,7 +1754,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,

      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
      //int jtype=jx.w;
- 
+
      // Compute r12
      numtyp xr = jx.x - ix.x;
      numtyp yr = jx.y - ix.y;
@ -1762,7 +1762,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      numtyp r2 = xr*xr + yr*yr + zr*zr;

      //if (r2>off2) continue;
-  
+
      numtyp r = ucl_sqrt(r2);

      const numtyp4 pol1j = polar1[j];
@ -1905,7 +1905,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,


      // get the field gradient for direct polarization force
-      
+
      numtyp term1i,term2i,term3i,term4i,term5i,term6i,term7i,term8i;
      numtyp term1k,term2k,term3k,term4k,term5k,term6k,term7k,term8k;
      numtyp term1core;
@ -1987,7 +1987,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
        dir*term4i - qixy*term5i + qiy*term6i + qix*term7i - qir*term8i;
      tkxy = -valk*term1k - corek*term1core - dky*term2k - dkx*term3k +
        dkr*term4k - qkxy*term5k + qky*term6k + qkx*term7k - qkr*term8k;
-      
+
      term2i = rr5i*xr;
      term1i = zr * term2i;
      term1core = rr5core*xr*zr;
@ -2039,7 +2039,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      numtyp frcx = (numtyp)-2.0 * depx;
      numtyp frcy = (numtyp)-2.0 * depy;
      numtyp frcz = (numtyp)-2.0 * depz;
-       
+
      // get the dEp/dR terms used for direct polarization force
      // poltyp == MUTUAL && hippo
      // tixx and tkxx
@ -2108,7 +2108,7 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
        virial[5] += vyz;
      }
    } // nbor
-    
+
  } // ii<inum

  // accumulate ufld and dufld to compute tep
@ -2139,7 +2139,7 @@ __kernel void k_special15(__global int * dev_nbor,
  atom_info(t_per_atom,ii,tid,offset);

  if (ii<inum) {
-  
+
    int numj, nbor, nbor_end;
    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
              n_stride,nbor_end,nbor);
--- a/lib/gpu/lal_hippo.h
+++ b/lib/gpu/lal_hippo.h
@ -154,13 +154,13 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
  UCL_D_Vec<numtyp4> coeff_amtype;
  /// csix = coeff_amclass.x; adisp = coeff_amclass.y;
  UCL_D_Vec<numtyp4> coeff_amclass;
-  /// Special polar values [0-4]: 
+  /// Special polar values [0-4]:
  ///   sp_polar.x = special_polar_wscale
  ///   sp_polar.y special_polar_pscale,
  ///   sp_polar.z = special_polar_piscale
  ///   sp_polar.w = special_mpole
  UCL_D_Vec<numtyp4> sp_polar;
-  /// Special nonpolar values [0-4]: 
+  /// Special nonpolar values [0-4]:
  ///   sp_nonpolar.x = special_hal
  ///   sp_nonpolar.y special_repel
  ///   sp_nonpolar.z = special_disp
@ -184,7 +184,7 @@ class Hippo : public BaseAmoeba<numtyp, acctyp> {
  int udirect2b(const int eflag, const int vflag);
  int umutual2b(const int eflag, const int vflag);
  int polar_real(const int eflag, const int vflag);
-  
+
 };

 }
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@ -129,7 +129,7 @@ int** hippo_gpu_compute_dispersion_real(const int ago, const int inum_full,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, const double aewald, const double off2,
-                           double *host_q, double *boxlo, double *prd) {                             
+                           double *host_q, double *boxlo, double *prd) {
  return HIPPOMF.compute_dispersion_real(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, sublo, subhi,
                          tag, nspecial, special, nspecial15, special15,
@ -175,7 +175,7 @@ int** hippo_gpu_compute_udirect2b(const int ago, const int inum_full,
 int** hippo_gpu_compute_umutual2b(const int ago, const int inum_full,
                           const int nall, double **host_x, int *host_type,
                           int *host_amtype, int *host_amgroup, double **host_rpole,
-                           double **host_uind, double **host_uinp, double *host_pval, 
+                           double **host_uind, double **host_uinp, double *host_pval,
                           double *sublo, double *subhi, tagint *tag, int **nspecial,
                           tagint **special, int *nspecial15, tagint** special15,
                           const bool eflag, const bool vflag,
--- a/lib/gpu/lal_hippo_extra.h
+++ b/lib/gpu/lal_hippo_extra.h
@ -116,46 +116,46 @@ ucl_inline void damprep(const numtyp r, const numtyp r2, const numtyp rr1,
    tmp = (numtyp)4.0 * dmpi2 * dmpk2 / term;
    s = (dampi-tmp)*expk + (dampk+tmp)*expi;

-    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term - 
-          (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
+    ds = (dmpi2*dmpk2*r2 - (numtyp)4.0*dmpi2*dmpk22*r/term -
+          (numtyp)4.0*dmpi2*dmpk2/term) * expk +
      (dmpi2*dmpk2*r2 + (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0*dmpi2*dmpk2/term) * expi;
-    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 - 
-           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - 
-           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 + 
-       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
+    d2s = (dmpi2*dmpk2*r2/3.0 + dmpi2*dmpk22*r3/(numtyp)3.0 -
+           ((numtyp)4.0/(numtyp)3.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
+      (dmpi2*dmpk2*r2/(numtyp)3.0 + dmpi22*dmpk2*r3/(numtyp)3.0 +
+       ((numtyp)4.0/(numtyp)3.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
-    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 - 
-           (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term - 
-           (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk + 
-      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 + 
-       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term + 
+    d3s = (dmpi2*dmpk23*r4/15.0 + dmpi2*dmpk22*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 -
+           (4.0/15.0)*dmpi2*dmpk24*r3/term - ((numtyp)8.0/(numtyp)5.0)*dmpi2*dmpk23*r2/term -
+           (numtyp)4.0*dmpi2*dmpk22*r/term - 4.0/term*dmpi2*dmpk2) * expk +
+      (dmpi23*dmpk2*r4/(numtyp)15.0 + dmpi22*dmpk2*r3/(numtyp)5.0 + dmpi2*dmpk2*r2/(numtyp)5.0 +
+       ((numtyp)4.0/(numtyp)15.0)*dmpi24*dmpk2*r3/term + ((numtyp)8.0/(numtyp)5.0)*dmpi23*dmpk2*r2/term +
       (numtyp)4.0*dmpi22*dmpk2*r/term + (numtyp)4.0/term*dmpi2*dmpk2) * expi;
-    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 + 
-           dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 - 
-           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term - 
-           ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term - 
-           (numtyp)4.0*dmpi2*dmpk2/term) * expk + 
-      (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 + 
-       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 + 
-       ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term + 
-       ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term + 
+    d4s = (dmpi2*dmpk24*r5/(numtyp)105.0 + ((numtyp)2.0/(numtyp)35.0)*dmpi2*dmpk23*r4 +
+           dmpi2*dmpk22*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 -
+           ((numtyp)4.0/(numtyp)105.0)*dmpi2*dmpk25*r4/term - ((numtyp)8.0/21.0)*dmpi2*dmpk24*r3/term -
+           ((numtyp)12.0/(numtyp)7.0)*dmpi2*dmpk23*r2/term - (numtyp)4.0*dmpi2*dmpk22*r/term -
+           (numtyp)4.0*dmpi2*dmpk2/term) * expk +
+      (dmpi24*dmpk2*r5/(numtyp)105.0 + (2.0/(numtyp)35.0)*dmpi23*dmpk2*r4 +
+       dmpi22*dmpk2*r3/(numtyp)7.0 + dmpi2*dmpk2*r2/(numtyp)7.0 +
+       ((numtyp)4.0/(numtyp)105.0)*dmpi25*dmpk2*r4/term + ((numtyp)8.0/(numtyp)21.0)*dmpi24*dmpk2*r3/term +
+       ((numtyp)12.0/(numtyp)7.0)*dmpi23*dmpk2*r2/term + (numtyp)4.0*dmpi22*dmpk2*r/term +
       (numtyp)4.0*dmpi2*dmpk2/term) * expi;
-    
+
    if (rorder >= 11) {
      r6 = r5 * r;
      dmpi26 = dmpi25 * dmpi2;
      dmpk26 = dmpk25 * dmpk2;
-      d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 + 
-             dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 - 
-             (4.0/945.0)*dmpi2*dmpk26*r5/term - 
-             (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term - 
-             (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term - 
-             4.0*dmpi2*dmpk2/term) * expk + 
-        (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 + 
-         dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 + 
-         (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term + 
-         (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term + 
+      d5s = (dmpi2*dmpk25*r6/945.0 + (2.0/189.0)*dmpi2*dmpk24*r5 +
+             dmpi2*dmpk23*r4/21.0 + dmpi2*dmpk22*r3/9.0 + dmpi2*dmpk2*r2/9.0 -
+             (4.0/945.0)*dmpi2*dmpk26*r5/term -
+             (4.0/63.0)*dmpi2*dmpk25*r4/term - (4.0/9.0)*dmpi2*dmpk24*r3/term -
+             (16.0/9.0)*dmpi2*dmpk23*r2/term - 4.0*dmpi2*dmpk22*r/term -
+             4.0*dmpi2*dmpk2/term) * expk +
+        (dmpi25*dmpk2*r6/945.0 + (2.0/189.0)*dmpi24*dmpk2*r5 +
+         dmpi23*dmpk2*r4/21.0 + dmpi22*dmpk2*r3/9.0 + dmpi2*dmpk2*r2/9.0 +
+         (4.0/945.0)*dmpi26*dmpk2*r5/term + (4.0/63.0)*dmpi25*dmpk2*r4/term +
+         (4.0/9.0)*dmpi24*dmpk2*r3/term + (16.0/9.0)*dmpi23*dmpk2*r2/term +
         4.0*dmpi22*dmpk2*r/term + 4.0*dmpi2*dmpk2/term) * expi;
    }
  }
@ -214,7 +214,7 @@ ucl_inline void damppole(const numtyp r, const int rorder,
  // compute tolerance and exponential damping factors

  eps = (numtyp)0.001;
-  diff = alphai-alphak; 
+  diff = alphai-alphak;
  if (diff < (numtyp)0) diff = -diff;
  dampi = alphai * r;
  dampk = alphak * r;
@ -231,7 +231,7 @@ ucl_inline void damppole(const numtyp r, const int rorder,
  dmpi[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2)*expi;
  dmpi[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi;
  dmpi[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi;
-  dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+  dmpi[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                   (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi;
  if (diff < eps) {
    dmpk[0] = dmpi[0];
@ -248,7 +248,7 @@ ucl_inline void damppole(const numtyp r, const int rorder,
    dmpk[2] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2)*expk;
    dmpk[4] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk;
    dmpk[6] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk;
-    dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + 
+    dmpk[8] = (numtyp)1.0 - ((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
                     (numtyp)4.0*dampk4/(numtyp)105.0 + dampk5/(numtyp)210.0)*expk;
  }

@ -257,21 +257,21 @@ ucl_inline void damppole(const numtyp r, const int rorder,
  if (diff < eps) {
    dampi6 = dampi3 * dampi3;
    dampi7 = dampi3 * dampi4;
-    dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 + 
+    dmpik[0] = (numtyp)1.0 - ((numtyp)1.0 + (numtyp)11.0*dampi/(numtyp)16.0 + (numtyp)3.0*dampi2/(numtyp)16.0 +
                      dampi3/(numtyp)48.0)*expi;
-    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 
+    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                      (numtyp)7.0*dampi3/(numtyp)48.0 + dampi4/(numtyp)48.0)*expi;
-    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                      dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
-    dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+    dmpik[6] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                      dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0)*expi;
-    dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
-                      dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + 
+    dmpik[8] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
+                      dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                      dampi7/(numtyp)5040.0)*expi;
    if (rorder >= 11) {
      dampi8 = dampi4 * dampi4;
      dmpik[10] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
-                         dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 + 
+                         dampi4/(numtyp)24.0 + dampi5/(numtyp)120.0 + dampi6/(numtyp)720.0 +
                         dampi7/(numtyp)5040.0 + dampi8/(numtyp)45360.0)*expi;
    }

@ -282,41 +282,41 @@ ucl_inline void damppole(const numtyp r, const int rorder,
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
-    dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi - 
+    dmpik[0] = (numtyp)1.0 - termi2*(1.0 + (numtyp)2.0*termk + (numtyp)0.5*dampi)*expi -
      termk2*((numtyp)1.0 + (numtyp)2.0*termi + (numtyp)0.5*dampk)*expk;
    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
-    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - 
-      termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk - 
-      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi - 
+    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
+      termk2*(1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0)*expk -
+      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + dampk2/(numtyp)3.0)*expk;
-    dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 
-                             dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi - 
-      termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk - 
-      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi - 
+    dmpik[6] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
+                             dampi3/(numtyp)6.0 + dampi4/(numtyp)30.0)*expi -
+      termk2*((numtyp)1.0 + dampk + 0.5*dampk2 + dampk3/(numtyp)6.0 + dampk4/(numtyp)30.0)*expk -
+      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)2.0*dampi2/(numtyp)5.0 + dampi3/(numtyp)15.0)*expi -
      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)2.0*dampk2/(numtyp)5.0 + dampk3/(numtyp)15.0)*expk;
-    dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
-                             (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi - 
-      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + 
-              (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk - 
-      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 + 
-                        (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi - 
-      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 + 
+    dmpik[8] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
+                             (numtyp)4.0*dampi4/(numtyp)105.0 + dampi5/(numtyp)210.0)*expi -
+      termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 +
+              (numtyp)4.0*dampk4/105.0 + dampk5/(numtyp)210.0)*expk -
+      (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)3.0*dampi2/(numtyp)7.0 +
+                        (numtyp)2.0*dampi3/(numtyp)21.0 + dampi4/(numtyp)105.0)*expi -
+      (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + (numtyp)3.0*dampk2/(numtyp)7.0 +
                        (numtyp)2.0*dampk3/(numtyp)21.0 + dampk4/(numtyp)105.0)*expk;
-    
+
    if (rorder >= 11) {
      dampi6 = dampi3 * dampi3;
      dampk6 = dampk3 * dampk3;
-      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
-                                (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 + 
-                                dampi6/(numtyp)1890.0)*expi - 
-        termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 + 
-                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk - 
-        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 + 
-                          dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi - 
-        (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 + 
+      dmpik[10] = (numtyp)1.0 - termi2*((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
+                                (numtyp)5.0*dampi4/(numtyp)126.0 + (numtyp)2.0*dampi5/(numtyp)315.0 +
+                                dampi6/(numtyp)1890.0)*expi -
+        termk2*((numtyp)1.0 + dampk + (numtyp)0.5*dampk2 + dampk3/(numtyp)6.0 + (numtyp)5.0*dampk4/(numtyp)126.0 +
+                (numtyp)2.0*dampk5/(numtyp)315.0 + dampk6/(numtyp)1890.0)*expk -
+        (numtyp)2.0*termi2*termk*((numtyp)1.0 + dampi + (numtyp)4.0*dampi2/(numtyp)9.0 + dampi3/(numtyp)9.0 +
+                          dampi4/(numtyp)63.0 + dampi5/(numtyp)945.0)*expi -
+        (numtyp)2.0*termk2*termi*((numtyp)1.0 + dampk + 4.0*dampk2/(numtyp)9.0 + dampk3/(numtyp)9.0 +
                          dampk4/(numtyp)63.0 + dampk5/(numtyp)945.0)*expk;
    }
  }
@ -404,9 +404,9 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
  if (diff < eps) {
    dampi4 = dampi2 * dampi2;
    dampi5 = dampi2 * dampi3;
-    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + 
+    dmpik[2] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 +
                      7.0*dampi3/(numtyp)48.0 + dampi4/48.0)*expi;
-    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 + 
+    dmpik[4] = (numtyp)1.0 - ((numtyp)1.0 + dampi + (numtyp)0.5*dampi2 + dampi3/(numtyp)6.0 +
                      dampi4/(numtyp)24.0 + dampi5/(numtyp)144.0)*expi;
  } else {
    dampk2 = dampk * dampk;
@ -417,12 +417,12 @@ ucl_inline void dampmut(numtyp r, numtyp alphai, numtyp alphak, numtyp dmpik[5])
    termk = alphai2 / (alphai2-alphak2);
    termi2 = termi * termi;
    termk2 = termk * termk;
-    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi - 
-      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk - 
+    dmpik[2] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2)*expi -
+      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2)*expk -
      (numtyp)2.0*termi2*termk*((numtyp)1.0+dampi)*expi - (numtyp)2.0*termk2*termi*((numtyp)1.0+dampk)*expk;
-    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi - 
-      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk - 
-      (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi - 
+    dmpik[4] = (numtyp)1.0 - termi2*((numtyp)1.0+dampi+(numtyp)0.5*dampi2 + dampi3/(numtyp)6.0)*expi -
+      termk2*((numtyp)1.0+dampk+(numtyp)0.5*dampk2 + dampk3/(numtyp)6.00)*expk -
+      (numtyp)2.0*termi2*termk *((numtyp)1.0+dampi+dampi2/(numtyp)3.0)*expi -
      (numtyp)2.0*termk2*termi *((numtyp)1.0+dampk+dampk2/(numtyp)3.0)*expk;
  }
 }
--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@ -91,6 +91,8 @@ action pair_gauss_gpu.cpp pair_gauss.cpp
 action pair_gauss_gpu.h pair_gauss.h
 action pair_gayberne_gpu.cpp pair_gayberne.cpp
 action pair_gayberne_gpu.h pair_gayberne.cpp
+action pair_hippo_gpu.cpp pair_hippo.cpp
+action pair_hippo_gpu.h pair_hippo.cpp
 action pair_lj96_cut_gpu.cpp pair_lj96_cut.cpp
 action pair_lj96_cut_gpu.h pair_lj96_cut.h
 action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -76,7 +76,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in

 int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
-              double **host_rpole, double **host_uind, double **host_uinp, 
+              double **host_rpole, double **host_uind, double **host_uinp,
              double *sublo, double *subhi, tagint *tag, int **nspecial,
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -86,7 +86,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal

 int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
-              double **host_rpole, double **host_uind, double **host_uinp, 
+              double **host_rpole, double **host_uind, double **host_uinp,
              double *sublo, double *subhi, tagint *tag, int **nspecial,
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -170,7 +170,7 @@ void PairAmoebaGPU::init_style()
    maxspecial=atom->maxspecial;
    maxspecial15=atom->maxspecial15;
  }
-    
+
  int tq_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
@ -207,7 +207,7 @@ void PairAmoebaGPU::multipole_real()

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
-  
+
  double sublo[3],subhi[3];
  if (domain->triclinic == 0) {
    sublo[0] = domain->sublo[0];
@ -239,7 +239,7 @@ void PairAmoebaGPU::multipole_real()
                                                 host_start, &ilist, &numneigh, cpu_time,
                                                 success, aewald, felec, off2, atom->q,
                                                 domain->boxlo, domain->prd, &tq_pinned);
-  
+
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

@ -281,7 +281,7 @@ void PairAmoebaGPU::induce()

  // set cutoffs, taper coeffs, and PME params
  // create qfac here, free at end of polar()
-  
+
  if (use_ewald) {
    choose(POLAR_LONG);
    int nmine = p_kspace->nfft_owned;
@ -317,7 +317,7 @@ void PairAmoebaGPU::induce()
  memory->create(usump,nlocal,3,"ameoba/induce:usump");

  // get the electrostatic field due to permanent multipoles
-  
+
  dfield0c(field,fieldp);

  // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
@ -345,7 +345,7 @@ void PairAmoebaGPU::induce()
  for (i = 0; i < 10; i++) {
    printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n",
      i, udir[i][0], udir[i][1], udir[i][2],
-      udirp[i][0], udirp[i][1], udirp[i][2]); 
+      udirp[i][0], udirp[i][1], udirp[i][2]);
  }
 */
  // get induced dipoles via the OPT extrapolation method
@ -353,7 +353,7 @@ void PairAmoebaGPU::induce()
  //       uopt,uoptp with a optorder+1 dimension, just optorder ??
  //       since no need to store optorder+1 values after these loops

-  if (poltyp == OPT) { 
+  if (poltyp == OPT) {
    for (i = 0; i < nlocal; i++) {
      for (j = 0; j < 3; j++) {
        uopt[i][0][j] = udir[i][j];
@ -460,7 +460,7 @@ void PairAmoebaGPU::induce()
      crstyle = FIELD;
      comm->reverse_comm_pair(this);
    }
-    
+
    //error->all(FLERR,"STOP GPU");

    // set initial conjugate gradient residual and conjugate vector
@ -486,7 +486,7 @@ void PairAmoebaGPU::induce()
      cfstyle = RSD;
      comm->forward_comm_pair(this);
      uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp);
-      uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+      uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp);
      crstyle = ZRSD;
      comm->reverse_comm_pair(this);
   }
@ -574,7 +574,7 @@ void PairAmoebaGPU::induce()
      if (pcgprec) {
        cfstyle = RSD;
        comm->forward_comm_pair(this);
-        uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+        uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp);
        crstyle = ZRSD;
        comm->reverse_comm_pair(this);
      }
@ -629,7 +629,7 @@ void PairAmoebaGPU::induce()
      if (iter >= politer) done = true;

      //  apply a "peek" iteration to the mutual induced dipoles
-     
+
      if (done) {
        for (i = 0; i < nlocal; i++) {
          term = pcgpeek * poli[i];
@ -644,7 +644,7 @@ void PairAmoebaGPU::induce()

    // terminate the calculation if dipoles failed to converge
    // NOTE: could make this an error
-    
+
    if (iter >= maxiter || eps > epsold)
      if (me == 0)
 	      error->warning(FLERR,"AMOEBA induced dipoles did not converge");
@ -652,7 +652,7 @@ void PairAmoebaGPU::induce()

  // DEBUG output to dump file

-  if (uind_flag) 
+  if (uind_flag)
    dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp);

  // deallocation of arrays
@ -700,7 +700,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
    PairAmoeba::udirect2b(field, fieldp);
    return;
  }
-   
+
  int eflag=1, vflag=1;
  int nall = atom->nlocal + atom->nghost;
  int inum, host_start;
@ -753,7 +753,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
    int idx = 4*i;
    field[i][0] += field_ptr[idx];
    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2]; 
+    field[i][2] += field_ptr[idx+2];
  }

  double* fieldp_ptr = (double *)fieldp_pinned;
@ -764,7 +764,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
    fieldp[i][1] += fieldp_ptr[idx+1];
    fieldp[i][2] += fieldp_ptr[idx+2];
  }
-  
+
 }

 /* ----------------------------------------------------------------------
@ -802,7 +802,7 @@ void PairAmoebaGPU::udirect2b_cpu()
  firstneigh = list->firstneigh;

  // NOTE: doesn't this have a problem if aewald is tiny ??
-  
+
  aesq2 = 2.0 * aewald * aewald;
  aesq2n = 0.0;
  if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald);
@ -829,13 +829,13 @@ void PairAmoebaGPU::udirect2b_cpu()
    pdi = pdamp[itype];
    pti = thole[itype];
    ddi = dirdamp[itype];
-    
+
    // evaluate all sites within the cutoff distance

    for (jj = 0; jj < jnum; jj++) {
      jextra = jlist[jj];
      j = jextra & NEIGHMASK15;
-      
+
      xr = x[j][0] - x[i][0];
      yr = x[j][1] - x[i][1];
      zr = x[j][2] - x[i][2];
@ -844,7 +844,7 @@ void PairAmoebaGPU::udirect2b_cpu()

      jtype = amtype[j];
      jgroup = amgroup[j];
-      
+
      factor_wscale = special_polar_wscale[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = special_polar_piscale[sbmask15(jextra)];
@ -872,7 +872,7 @@ void PairAmoebaGPU::udirect2b_cpu()
        aefac = aesq2 * aefac;
        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2;
      }
-      
+
      // find terms needed later to compute mutual polarization

      if (poltyp != DIRECT) {
@ -891,7 +891,7 @@ void PairAmoebaGPU::udirect2b_cpu()
        scalek = factor_uscale;
        bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3;
        bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5;
-        
+
        neighptr[n++] = j;
        tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr;
        tdipdip[ndip++] = bcn[1]*xr*yr;
@ -902,7 +902,7 @@ void PairAmoebaGPU::udirect2b_cpu()
      } else {
        if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j);
      }
-      
+
    } // jj

    firstneigh_dipole[i] = neighptr;
@ -973,7 +973,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
    int idx = 4*i;
    field[i][0] += field_ptr[idx];
    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2]; 
+    field[i][2] += field_ptr[idx+2];
  }

  double* fieldp_ptr = (double *)fieldp_pinned;
@ -1001,7 +1001,7 @@ void PairAmoebaGPU::polar_real()

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
-  
+
  double sublo[3],subhi[3];
  if (domain->triclinic == 0) {
    sublo[0] = domain->sublo[0];
@ -1033,7 +1033,7 @@ void PairAmoebaGPU::polar_real()
                                             host_start, &ilist, &numneigh, cpu_time,
                                             success, aewald, felec, off2, atom->q,
                                             domain->boxlo, domain->prd, &tq_pinned);
-  
+
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

@ -1091,11 +1091,11 @@ void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr,
    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
-    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + 
+    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
-    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + 
+    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
-    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
+    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);

    virial_comp[0] += vxx;
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@ -88,7 +88,7 @@ int ** hippo_gpu_compute_multipole_real(const int ago, const int inum, const int

 int ** hippo_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
-              double **host_rpole, double **host_uind, double **host_uinp, 
+              double **host_rpole, double **host_uind, double **host_uinp,
              double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial,
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@ -185,7 +185,7 @@ void PairHippoGPU::init_style()
    maxspecial=atom->maxspecial;
    maxspecial15=atom->maxspecial15;
  }
-    
+
  int tq_size;
  int mnf = 5e-2 * neighbor->oneatom;
  int success = hippo_gpu_init(atom->ntypes+1, max_amtype, max_amclass,
@ -222,7 +222,7 @@ void PairHippoGPU::dispersion_real()

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
-  
+
  double sublo[3],subhi[3];
  if (domain->triclinic == 0) {
    sublo[0] = domain->sublo[0];
@ -250,7 +250,7 @@ void PairHippoGPU::dispersion_real()
                                                 host_start, &ilist, &numneigh, cpu_time,
                                                 success, aewald, off2, atom->q,
                                                 domain->boxlo, domain->prd);
-  
+
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
 }
@ -270,7 +270,7 @@ void PairHippoGPU::multipole_real()

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
-  
+
  double sublo[3],subhi[3];
  if (domain->triclinic == 0) {
    sublo[0] = domain->sublo[0];
@ -302,7 +302,7 @@ void PairHippoGPU::multipole_real()
                                                host_start, &ilist, &numneigh, cpu_time,
                                                success, aewald, felec, off2, atom->q,
                                                domain->boxlo, domain->prd, &tq_pinned);
-  
+
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

@ -344,7 +344,7 @@ void PairHippoGPU::induce()

  // set cutoffs, taper coeffs, and PME params
  // create qfac here, free at end of polar()
-  
+
  if (use_ewald) {
    choose(POLAR_LONG);
    int nmine = p_kspace->nfft_owned;
@ -380,7 +380,7 @@ void PairHippoGPU::induce()
  memory->create(usump,nlocal,3,"ameoba/induce:usump");

  // get the electrostatic field due to permanent multipoles
-  
+
  dfield0c(field,fieldp);

  // need reverse_comm_pair if dfield0c (i.e. udirect2b) is CPU-only
@ -408,7 +408,7 @@ void PairHippoGPU::induce()
  for (i = 0; i < 10; i++) {
    printf("i = %d: udir = %f %f %f; udirp = %f %f %f\n",
      i, udir[i][0], udir[i][1], udir[i][2],
-      udirp[i][0], udirp[i][1], udirp[i][2]); 
+      udirp[i][0], udirp[i][1], udirp[i][2]);
  }
 */
  // get induced dipoles via the OPT extrapolation method
@ -416,7 +416,7 @@ void PairHippoGPU::induce()
  //       uopt,uoptp with a optorder+1 dimension, just optorder ??
  //       since no need to store optorder+1 values after these loops

-  if (poltyp == OPT) { 
+  if (poltyp == OPT) {
    for (i = 0; i < nlocal; i++) {
      for (j = 0; j < 3; j++) {
        uopt[i][0][j] = udir[i][j];
@ -523,7 +523,7 @@ void PairHippoGPU::induce()
      crstyle = FIELD;
      comm->reverse_comm_pair(this);
    }
-    
+
    //error->all(FLERR,"STOP GPU");

    // set initial conjugate gradient residual and conjugate vector
@ -549,7 +549,7 @@ void PairHippoGPU::induce()
      cfstyle = RSD;
      comm->forward_comm_pair(this);
      uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp);
-      uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+      uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp);
      crstyle = ZRSD;
      comm->reverse_comm_pair(this);
   }
@ -637,7 +637,7 @@ void PairHippoGPU::induce()
      if (pcgprec) {
        cfstyle = RSD;
        comm->forward_comm_pair(this);
-        uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); 
+        uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp);
        crstyle = ZRSD;
        comm->reverse_comm_pair(this);
      }
@ -692,7 +692,7 @@ void PairHippoGPU::induce()
      if (iter >= politer) done = true;

      //  apply a "peek" iteration to the mutual induced dipoles
-     
+
      if (done) {
        for (i = 0; i < nlocal; i++) {
          term = pcgpeek * poli[i];
@ -707,7 +707,7 @@ void PairHippoGPU::induce()

    // terminate the calculation if dipoles failed to converge
    // NOTE: could make this an error
-    
+
    if (iter >= maxiter || eps > epsold)
      if (me == 0)
 	      error->warning(FLERR,"hippo induced dipoles did not converge");
@ -715,7 +715,7 @@ void PairHippoGPU::induce()

  // DEBUG output to dump file

-  if (uind_flag) 
+  if (uind_flag)
    dump6(fp_uind,"id uindx uindy uindz uinpx uinpy uinpz",DEBYE,uind,uinp);

  // deallocation of arrays
@ -763,7 +763,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp)
    PairAmoeba::udirect2b(field, fieldp);
    return;
  }
-   
+
  int eflag=1, vflag=1;
  int nall = atom->nlocal + atom->nghost;
  int inum, host_start;
@ -816,7 +816,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp)
    int idx = 4*i;
    field[i][0] += field_ptr[idx];
    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2]; 
+    field[i][2] += field_ptr[idx+2];
  }

  double* fieldp_ptr = (double *)fieldp_pinned;
@ -827,7 +827,7 @@ void PairHippoGPU::udirect2b(double **field, double **fieldp)
    fieldp[i][1] += fieldp_ptr[idx+1];
    fieldp[i][2] += fieldp_ptr[idx+2];
  }
-  
+
 }

 /* ----------------------------------------------------------------------
@ -865,7 +865,7 @@ void PairHippoGPU::udirect2b_cpu()
  firstneigh = list->firstneigh;

  // NOTE: doesn't this have a problem if aewald is tiny ??
-  
+
  aesq2 = 2.0 * aewald * aewald;
  aesq2n = 0.0;
  if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald);
@ -892,13 +892,13 @@ void PairHippoGPU::udirect2b_cpu()
    pdi = pdamp[itype];
    pti = thole[itype];
    ddi = dirdamp[itype];
-    
+
    // evaluate all sites within the cutoff distance

    for (jj = 0; jj < jnum; jj++) {
      jextra = jlist[jj];
      j = jextra & NEIGHMASK15;
-      
+
      xr = x[j][0] - x[i][0];
      yr = x[j][1] - x[i][1];
      zr = x[j][2] - x[i][2];
@ -907,7 +907,7 @@ void PairHippoGPU::udirect2b_cpu()

      jtype = amtype[j];
      jgroup = amgroup[j];
-      
+
      factor_wscale = special_polar_wscale[sbmask15(jextra)];
      if (igroup == jgroup) {
        factor_pscale = special_polar_piscale[sbmask15(jextra)];
@ -935,7 +935,7 @@ void PairHippoGPU::udirect2b_cpu()
        aefac = aesq2 * aefac;
        bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2;
      }
-      
+
      // find terms needed later to compute mutual polarization

      if (poltyp != DIRECT) {
@ -954,7 +954,7 @@ void PairHippoGPU::udirect2b_cpu()
        scalek = factor_uscale;
        bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3;
        bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5;
-        
+
        neighptr[n++] = j;
        tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr;
        tdipdip[ndip++] = bcn[1]*xr*yr;
@ -965,7 +965,7 @@ void PairHippoGPU::udirect2b_cpu()
      } else {
        if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j);
      }
-      
+
    } // jj

    firstneigh_dipole[i] = neighptr;
@ -1036,7 +1036,7 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp)
    int idx = 4*i;
    field[i][0] += field_ptr[idx];
    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2]; 
+    field[i][2] += field_ptr[idx+2];
  }

  double* fieldp_ptr = (double *)fieldp_pinned;
@ -1064,7 +1064,7 @@ void PairHippoGPU::polar_real()

  bool success = true;
  int *ilist, *numneigh, **firstneigh;
-  
+
  double sublo[3],subhi[3];
  if (domain->triclinic == 0) {
    sublo[0] = domain->sublo[0];
@ -1096,7 +1096,7 @@ void PairHippoGPU::polar_real()
                                             host_start, &ilist, &numneigh, cpu_time,
                                             success, aewald, felec, off2, atom->q,
                                             domain->boxlo, domain->prd, &tq_pinned);
-  
+
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

@ -1156,11 +1156,11 @@ void PairHippoGPU::compute_force_from_torque(const numtyp* tq_ptr,
    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
-    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + 
+    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] +
                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
-    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + 
+    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] +
                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
-    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
+    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] +
                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);

    virial_comp[0] += vxx;