Added work on amoeba/gpu, some minor changes to PairAmoeba to allow function overriding in PairAmoebaGPU, added the package AMOEBA to cmake/CMakeLists.txt

2021-08-25 22:57:37 -05:00
parent 9c095e8d76
commit 3825fee8e9
19 changed files with 2228 additions and 6 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -140,6 +140,7 @@ option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF)

 set(STANDARD_PACKAGES
  ADIOS
+  AMOEBA
  ASPHERE
  ATC
  AWPMD
@ -308,6 +309,7 @@ endif()
 pkg_depends(ML-IAP ML-SNAP)
 pkg_depends(MPIIO MPI)
 pkg_depends(ATC MANYBODY)
+pkg_depends(AMOEBA KSPACE)
 pkg_depends(LATBOLTZ MPI)
 pkg_depends(PHONON KSPACE)
 pkg_depends(SCAFACOS MPI)
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -0,0 +1,155 @@
+/***************************************************************************
+                                 amoeba.cpp
+                             -------------------
+                          Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the amoeba pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "amoeba_cl.h"
+#elif defined(USE_CUDART)
+const char *amoeba=0;
+#else
+#include "amoeba_cubin.h"
+#endif
+
+#include "lal_amoeba.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define AmoebaT Amoeba<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+AmoebaT::Amoeba() : BaseAmoeba<numtyp,acctyp>(),
+  _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+AmoebaT::~Amoeba() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int AmoebaT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pdamp,
+                  const double *host_thole, const double *host_special_polar_wscale,
+                  const double *host_special_polar_piscale,
+                  const double *host_special_polar_pscale,
+                  const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const int maxspecial15,
+                  const double cell_size, const double gpu_split, FILE *_screen,
+                  const double aewald, const double felec,
+                  const double off2, const double polar_dscale,
+                  const double polar_uscale) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
+                            cell_size,gpu_split,_screen,amoeba,"k_amoeba_polar");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+
+  UCL_H_Vec<numtyp4> host_write(max_amtype, *(this->ucl_device), UCL_WRITE_ONLY);
+  for (int i = 0; i < max_amtype; i++) {
+    host_write[i].x = host_pdamp[i];
+    host_write[i].y = host_thole[i];
+    host_write[i].z = (numtyp)0;
+    host_write[i].w = (numtyp)0;
+  }
+
+  damping.alloc(max_amtype,*(this->ucl_device), UCL_READ_ONLY);
+  ucl_copy(damping,host_write,false);
+
+  UCL_H_Vec<numtyp4> dview(5, *(this->ucl_device), UCL_WRITE_ONLY);
+  sp_polar.alloc(5,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<5; i++) {
+    dview[i].x=host_special_polar_wscale[i];
+    dview[i].y=host_special_polar_piscale[i];
+    dview[i].z=host_special_polar_pscale[i];
+    dview[i].w=(numtyp)0;
+  }
+  ucl_copy(sp_polar,dview,5,false);
+
+  _aewald = aewald;
+  _felec = felec;
+  _off2 = off2;
+  _polar_dscale = polar_dscale;
+  _polar_uscale = polar_uscale;
+
+  _allocated=true;
+  this->_max_bytes=damping.row_bytes()
+    + sp_polar.row_bytes()
+    + this->_tep.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void AmoebaT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  damping.clear();
+  sp_polar.clear();
+  
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double AmoebaT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(Amoeba<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int AmoebaT::loop(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int _nall=this->atom->nall();
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();
+
+  this->k_polar.set_size(GX,BX);
+
+  this->k_polar.run(&this->atom->x, &this->atom->extra,
+                    &damping, &sp_polar,
+                    &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                    &this->ans->force, &this->ans->engv, &this->_tep,
+                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
+                    &this->_threads_per_atom,
+                    &_aewald, &_felec, &_off2, &_polar_dscale, &_polar_uscale);
+  this->time_pair.stop();
+  return GX;
+}
+
+template class Amoeba<PRECISION,ACC_PRECISION>;
+}
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -0,0 +1,684 @@
+// **************************************************************************
+//                                   amoeba.cu
+//                             -------------------
+//                          Trung Dac Nguyen (Northwestern)
+//
+//  Device code for acceleration of the amoeba pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : trung.nguyen@northwestern.edu
+// ***************************************************************************
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include <stdio.h>
+#include "lal_aux_fun1.h"
+#ifdef LAMMPS_SMALLBIG
+#define tagint int
+#endif
+#ifdef LAMMPS_BIGBIG
+#include "inttypes.h"
+#define tagint int64_t
+#endif
+#ifdef LAMMPS_SMALLSMALL
+#define tagint int
+#endif
+#ifndef _DOUBLE_DOUBLE
+_texture( pos_tex,float4);
+_texture( q_tex,float);
+#else
+_texture_2d( pos_tex,int4);
+_texture( q_tex,int2);
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+#if (SHUFFLE_AVAIL == 0)
+
+#define local_allocate_store_ufld()                                         \
+    __local acctyp red_acc[6][BLOCK_PAIR];
+
+#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
+                          i, tep)                                           \
+  if (t_per_atom>1) {                                                       \
+    red_acc[0][tid]=ufld[0];                                                \
+    red_acc[1][tid]=ufld[1];                                                \
+    red_acc[2][tid]=ufld[2];                                                \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<3; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    ufld[0]=red_acc[0][tid];                                                \
+    ufld[1]=red_acc[1][tid];                                                \
+    ufld[2]=red_acc[2][tid];                                                \
+    red_acc[0][tid]=dufld[0];                                               \
+    red_acc[1][tid]=dufld[1];                                               \
+    red_acc[2][tid]=dufld[2];                                               \
+    red_acc[3][tid]=dufld[3];                                               \
+    red_acc[4][tid]=dufld[4];                                               \
+    red_acc[5][tid]=dufld[5];                                               \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      simdsync();                                                           \
+      if (offset < s) {                                                     \
+        for (int r=0; r<6; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    dufld[0]=red_acc[0][tid];                                               \
+    dufld[1]=red_acc[1][tid];                                               \
+    dufld[2]=red_acc[2][tid];                                               \
+    dufld[3]=red_acc[3][tid];                                               \
+    dufld[4]=red_acc[4][tid];                                               \
+    dufld[5]=red_acc[5][tid];                                               \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    numtyp4 t;                                                              \
+    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
+      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
+    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
+      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
+    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
+      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
+    tep[i]=t;                                                               \
+  }
+
+#else
+
+#define local_allocate_store_ufld()
+
+#define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
+                          i, tep)                                           \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      ufld[0] += shfl_down(ufld[0], s, t_per_atom);                         \
+      ufld[1] += shfl_down(ufld[1], s, t_per_atom);                         \
+      ufld[2] += shfl_down(ufld[2], s, t_per_atom);                         \
+      dufld[0] += shfl_down(dufld[0], s, t_per_atom);                       \
+      dufld[1] += shfl_down(dufld[1], s, t_per_atom);                       \
+      dufld[2] += shfl_down(dufld[2], s, t_per_atom);                       \
+      dufld[3] += shfl_down(dufld[3], s, t_per_atom);                       \
+      dufld[4] += shfl_down(dufld[4], s, t_per_atom);                       \
+      dufld[5] += shfl_down(dufld[5], s, t_per_atom);                       \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    numtyp4 t;                                                              \
+    t.x = diz*ufld[1] - diy*ufld[2] + qixz*dufld[1] - qixy*dufld[3] +       \
+      (numtyp)2.0*qiyz*(dufld[2]-dufld[5]) + (qizz-qiyy)*dufld[4];          \
+    t.y = dix*ufld[2] - diz*ufld[0] - qiyz*dufld[1] + qixy*dufld[4] +       \
+      (numtyp)2.0*qixz*(dufld[5]-dufld[0]) + (qixx-qizz)*dufld[3];          \
+    t.z = diy*ufld[0] - dix*ufld[1] + qiyz*dufld[3] - qixz*dufld[4] +       \
+      (numtyp)2.0*qixy*(dufld[0]-dufld[2]) + (qiyy-qixx)*dufld[1];          \
+    tep[i]=t;                                                               \
+  }
+
+#endif
+
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+#define MY_PIS (acctyp)1.77245385090551602729
+
+__kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
+                            const __global numtyp *restrict extra,
+                            const __global numtyp4 *restrict damping,
+                            const __global numtyp4 *restrict sp_polar,
+                            const __global int *dev_nbor,
+                            const __global int *dev_packed,
+                            __global acctyp4 *restrict ans,
+                            __global acctyp *restrict engv,
+                            __global numtyp4 *restrict tep,
+                            const int eflag, const int vflag, const int inum,
+                            const int nall, const int nbor_pitch, const int t_per_atom,
+                            const numtyp aewald, const numtyp felec,
+                            const numtyp off2, const numtyp polar_dscale,
+                            const numtyp polar_uscale)
+{
+  int tid, ii, offset, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  int n_stride;
+  local_allocate_store_ufld();
+  local_allocate_store_charge();
+
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp energy, e_coul, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    e_coul=(acctyp)0;
+    for (int l=0; l<6; l++) virial[l]=(acctyp)0;
+  }
+
+  acctyp ufld[3];
+  ufld[0] = (acctyp)0; ufld[1]=(acctyp)0; ufld[2]=(acctyp)0;
+  acctyp dufld[6];
+  for (int l=0; l<6; l++) dufld[l]=(acctyp)0;
+
+  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
+  numtyp4* polar1 = (numtyp4*)(&extra[0]);
+  numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
+  numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
+  numtyp4* polar4 = (numtyp4*)(&extra[12*nall]);
+  numtyp4* polar5 = (numtyp4*)(&extra[16*nall]);
+
+  //numtyp4 xi__;
+
+  if (ii<inum) {
+    int k,m,itype,igroup;
+    numtyp bfac;
+    numtyp psc3,psc5,psc7;
+    numtyp dsc3,dsc5,dsc7;
+    numtyp usc3,usc5;
+    numtyp psr3,psr5,psr7;
+    numtyp dsr3,dsr5,dsr7;
+    numtyp usr5;
+    numtyp term1,term2,term3;
+    numtyp term4,term5;
+    numtyp term6,term7;
+    numtyp rc3[3],rc5[3],rc7[3];
+    numtyp prc3[3],prc5[3],prc7[3];
+    numtyp drc3[3],drc5[3],drc7[3];
+    numtyp urc3[3],urc5[3];
+    numtyp bn[5];
+    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;
+
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    //numtyp qtmp; fetch(qtmp,i,q_tex);
+    //int itype=ix.w;
+
+    ci  = polar1[i].x;    // rpole[i][0];
+    dix = polar1[i].y;    // rpole[i][1];
+    diy = polar1[i].z;    // rpole[i][2];
+    diz = polar1[i].w;    // rpole[i][3];
+    qixx = polar2[i].x;   // rpole[i][4];
+    qixy = polar2[i].y;   // rpole[i][5];
+    qixz = polar2[i].z;   // rpole[i][6];
+    qiyy = polar2[i].w;   // rpole[i][8];
+    qiyz   = polar3[i].x; // rpole[i][9];
+    qizz   = polar3[i].y; // rpole[i][12];
+    itype  = polar3[i].z; // amtype[i];
+    igroup = polar3[i].w; // amgroup[i];
+    uix = polar4[i].x;    // uind[i][0];
+    uiy = polar4[i].y;    // uind[i][1];
+    uiz = polar4[i].z;    // uind[i][2];
+    uixp = polar5[i].x;   // uinp[i][0];
+    uiyp = polar5[i].y;   // uinp[i][1];
+    uizp = polar5[i].z;   // uinp[i][2];
+
+    // debug:
+    // xi__ = ix; xi__.w = itype;
+
+    numtyp pdi = damping[itype].x;
+    numtyp pti = damping[itype].y;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int jextra=dev_packed[nbor];
+      int j = jextra & NEIGHMASK15;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      //int jtype=jx.w;
+ 
+      // Compute r12
+      numtyp xr = jx.x - ix.x;
+      numtyp yr = jx.y - ix.y;
+      numtyp zr = jx.z - ix.z;
+      numtyp r2 = xr*xr + yr*yr + zr*zr;
+
+      if (r2>off2) continue;
+  
+      numtyp r = ucl_sqrt(r2);
+      
+      numtyp ck = polar1[j].x;   // rpole[j][0];
+      numtyp dkx = polar1[j].y;  // rpole[j][1];
+      numtyp dky = polar1[j].z;  // rpole[j][2];
+      numtyp dkz = polar1[j].w;  // rpole[j][3];
+      numtyp qkxx = polar2[j].x; // rpole[j][4];
+      numtyp qkxy = polar2[j].y; // rpole[j][5];
+      numtyp qkxz = polar2[j].z; // rpole[j][6];
+      numtyp qkyy = polar2[j].w; // rpole[j][8];
+      numtyp qkyz = polar3[j].x; // rpole[j][9];
+      numtyp qkzz = polar3[j].y; // rpole[j][12];
+      int jtype =   polar3[j].z; // amtype[j];
+      int jgroup =  polar3[j].w; // amgroup[j];
+      numtyp ukx = polar4[j].x;  // uind[j][0];
+      numtyp uky = polar4[j].y;  // uind[j][1];
+      numtyp ukz = polar4[j].z;  // uind[j][2];
+      numtyp ukxp = polar5[j].x; // uinp[j][0];
+      numtyp ukyp = polar5[j].y; // uinp[j][1];
+      numtyp ukzp = polar5[j].z; // uinp[j][2];
+
+      numtyp factor_wscale, factor_dscale, factor_pscale, factor_uscale;
+      const numtyp4 sp_pol = sp_polar[sbmask15(jextra)];
+      factor_wscale = sp_pol.x; // sp_polar_wscale[sbmask15(jextra)];
+      if (igroup == jgroup) {
+        factor_pscale = sp_pol.y; // sp_polar_piscale[sbmask15(jextra)];
+        factor_dscale = polar_dscale;
+        factor_uscale = polar_uscale;
+      } else {
+        factor_pscale = sp_pol.z; // sp_polar_pscale[sbmask15(jextra)];
+        factor_dscale = factor_uscale = (numtyp)1.0;
+      }
+
+      // intermediates involving moments and separation distance
+
+      numtyp dir = dix*xr + diy*yr + diz*zr;
+      numtyp qix = qixx*xr + qixy*yr + qixz*zr;
+      numtyp qiy = qixy*xr + qiyy*yr + qiyz*zr;
+      numtyp qiz = qixz*xr + qiyz*yr + qizz*zr;
+      numtyp qir = qix*xr + qiy*yr + qiz*zr;
+      numtyp dkr = dkx*xr + dky*yr + dkz*zr;
+      numtyp qkx = qkxx*xr + qkxy*yr + qkxz*zr;
+      numtyp qky = qkxy*xr + qkyy*yr + qkyz*zr;
+      numtyp qkz = qkxz*xr + qkyz*yr + qkzz*zr;
+      numtyp qkr = qkx*xr + qky*yr + qkz*zr;
+      numtyp uir = uix*xr + uiy*yr + uiz*zr;
+      numtyp uirp = uixp*xr + uiyp*yr + uizp*zr;
+      numtyp ukr = ukx*xr + uky*yr + ukz*zr;
+      numtyp ukrp = ukxp*xr + ukyp*yr + ukzp*zr;
+
+      // get reciprocal distance terms for this interaction
+
+      numtyp rinv = ucl_recip(r);
+      numtyp r2inv = rinv*rinv;
+      numtyp rr1 = felec * rinv;
+      numtyp rr3 = rr1 * r2inv;
+      numtyp rr5 = (numtyp)3.0 * rr3 * r2inv;
+      numtyp rr7 = (numtyp)5.0 * rr5 * r2inv;
+      numtyp rr9 = (numtyp)7.0 * rr7 * r2inv;
+
+      // calculate the real space Ewald error function terms
+
+      numtyp ralpha = aewald * r;
+      numtyp exp2a = ucl_exp(-ralpha*ralpha);
+      numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*ralpha);
+      numtyp _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * exp2a;
+      //bn[0] = erfc(ralpha) / r;
+      bn[0] = _erfc * rinv;
+      numtyp alsq2 = (numtyp)2.0 * aewald*aewald;
+      numtyp alsq2n = (numtyp)0.0;
+      if (aewald > (numtyp)0.0) alsq2n = (numtyp)1.0 / (MY_PIS*aewald);
+
+      for (m = 1; m <= 4; m++) {
+        bfac = (numtyp) (m+m-1);
+        alsq2n = alsq2 * alsq2n;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
+      }
+      for (m = 0; m < 5; m++) bn[m] *= felec;
+
+      // apply Thole polarization damping to scale factors
+
+      numtyp sc3 = (numtyp)1.0;
+      numtyp sc5 = (numtyp)1.0;
+      numtyp sc7 = (numtyp)1.0;
+      for (k = 0; k < 3; k++) {
+        rc3[k] = (numtyp)0.0;
+        rc5[k] = (numtyp)0.0;
+        rc7[k] = (numtyp)0.0;
+      }
+
+      // apply Thole polarization damping to scale factors
+
+      numtyp damp = pdi * damping[jtype].x; // pdamp[jtype]
+      if (damp != (numtyp)0.0) {
+        numtyp pgamma = MIN(pti,damping[jtype].y); // thole[jtype]
+        damp = pgamma * ucl_powr(r/damp,(numtyp)3.0);
+        if (damp < (numtyp)50.0) {
+          numtyp expdamp = ucl_exp(-damp);
+          sc3 = (numtyp)1.0 - expdamp;
+          sc5 = (numtyp)1.0 - ((numtyp)1.0+damp)*expdamp;
+          sc7 = (numtyp)1.0 - ((numtyp)1.0+damp+(numtyp)0.6*damp*damp) * expdamp;
+          numtyp temp3 = (numtyp)3.0 * damp * expdamp * r2inv;
+          numtyp temp5 = damp;
+          numtyp temp7 = (numtyp)-0.2 + (numtyp)0.6*damp;
+          rc3[0] = xr * temp3;
+          rc3[1] = yr * temp3;
+          rc3[2] = zr * temp3;
+          rc5[0] = rc3[0] * temp5;
+          rc5[1] = rc3[1] * temp5;
+          rc5[2] = rc3[2] * temp5;
+          rc7[0] = rc5[0] * temp7;
+          rc7[1] = rc5[1] * temp7;
+          rc7[2] = rc5[2] * temp7;
+        }
+
+        psc3 = (numtyp)1.0 - sc3*factor_pscale;
+        psc5 = (numtyp)1.0 - sc5*factor_pscale;
+        psc7 = (numtyp)1.0 - sc7*factor_pscale;
+        dsc3 = (numtyp)1.0 - sc3*factor_dscale;
+        dsc5 = (numtyp)1.0 - sc5*factor_dscale;
+        dsc7 = (numtyp)1.0 - sc7*factor_dscale;
+        usc3 = (numtyp)1.0 - sc3*factor_uscale;
+        usc5 = (numtyp)1.0 - sc5*factor_uscale;
+        psr3 = bn[1] - psc3*rr3;
+        psr5 = bn[2] - psc5*rr5;
+        psr7 = bn[3] - psc7*rr7;
+        dsr3 = bn[1] - dsc3*rr3;
+        dsr5 = bn[2] - dsc5*rr5;
+        dsr7 = bn[3] - dsc7*rr7;
+        usr5 = bn[2] - usc5*rr5;
+        for (k = 0; k < 3; k++) {
+          prc3[k] = rc3[k] * factor_pscale;
+          prc5[k] = rc5[k] * factor_pscale;
+          prc7[k] = rc7[k] * factor_pscale;
+          drc3[k] = rc3[k] * factor_dscale;
+          drc5[k] = rc5[k] * factor_dscale;
+          drc7[k] = rc7[k] * factor_dscale;
+          urc3[k] = rc3[k] * factor_uscale;
+          urc5[k] = rc5[k] * factor_uscale;
+        }
+      } else { // damp == 0: ???
+      }
+
+      // get the induced dipole field used for dipole torques
+
+      numtyp tix3 = psr3*ukx + dsr3*ukxp;
+      numtyp tiy3 = psr3*uky + dsr3*ukyp;
+      numtyp tiz3 = psr3*ukz + dsr3*ukzp;
+      numtyp tuir = -psr5*ukr - dsr5*ukrp;
+      
+      ufld[0] += tix3 + xr*tuir;
+      ufld[1] += tiy3 + yr*tuir;
+      ufld[2] += tiz3 + zr*tuir;
+
+      // get induced dipole field gradient used for quadrupole torques
+
+      numtyp tix5 = (numtyp)2.0 * (psr5*ukx+dsr5*ukxp);
+      numtyp tiy5 = (numtyp)2.0 * (psr5*uky+dsr5*ukyp);
+      numtyp tiz5 = (numtyp)2.0 * (psr5*ukz+dsr5*ukzp);
+      tuir = -psr7*ukr - dsr7*ukrp;
+      
+      dufld[0] += xr*tix5 + xr*xr*tuir;
+      dufld[1] += xr*tiy5 + yr*tix5 + (numtyp)2.0*xr*yr*tuir;
+      dufld[2] += yr*tiy5 + yr*yr*tuir;
+      dufld[3] += xr*tiz5 + zr*tix5 + (numtyp)2.0*xr*zr*tuir;
+      dufld[4] += yr*tiz5 + zr*tiy5 + (numtyp)2.0*yr*zr*tuir;
+      dufld[5] += zr*tiz5 + zr*zr*tuir;
+      
+      // get the dEd/dR terms used for direct polarization force
+
+      term1 = bn[2] - dsc3*rr5;
+      term2 = bn[3] - dsc5*rr7;
+      term3 = -dsr3 + term1*xr*xr - rr3*xr*drc3[0];
+      term4 = rr3*drc3[0] - term1*xr - dsr5*xr;
+      term5 = term2*xr*xr - dsr5 - rr5*xr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*xr - bn[3] - rr7*xr*drc7[0];
+      term7 = rr5*drc5[0] - (numtyp)2.0*bn[3]*xr + (dsc5+(numtyp)1.5*dsc7)*rr7*xr;
+      numtyp tixx = ci*term3 + dix*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixx + (qiy*yr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qix*term7 +qir*term6;
+      numtyp tkxx = ck*term3 - dkx*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxx + (qky*yr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
+
+      term3 = -dsr3 + term1*yr*yr - rr3*yr*drc3[1];
+      term4 = rr3*drc3[1] - term1*yr - dsr5*yr;
+      term5 = term2*yr*yr - dsr5 - rr5*yr*drc5[1];
+      term6 = (bn[4]-dsc7*rr9)*yr*yr - bn[3] - rr7*yr*drc7[1];
+      term7 = rr5*drc5[1] - (numtyp)2.0*bn[3]*yr + (dsc5+(numtyp)1.5*dsc7)*rr7*yr;
+      numtyp tiyy = ci*term3 + diy*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qiyy + (qix*xr+qiz*zr)*dsc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
+      numtyp tkyy = ck*term3 - dky*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkyy + (qkx*xr+qkz*zr)*dsc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      term3 = -dsr3 + term1*zr*zr - rr3*zr*drc3[2];
+      term4 = rr3*drc3[2] - term1*zr - dsr5*zr;
+      term5 = term2*zr*zr - dsr5 - rr5*zr*drc5[2];
+      term6 = (bn[4]-dsc7*rr9)*zr*zr - bn[3] - rr7*zr*drc7[2];
+      term7 = rr5*drc5[2] - (numtyp)2.0*bn[3]*zr + (dsc5+(numtyp)1.5*dsc7)*rr7*zr;
+      numtyp tizz = ci*term3 + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qizz + (qix*xr+qiy*yr)*dsc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkzz = ck*term3 - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkzz + (qkx*xr+qky*yr)*dsc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      term3 = term1*xr*yr - rr3*yr*drc3[0];
+      term4 = rr3*drc3[0] - term1*xr;
+      term5 = term2*xr*yr - rr5*yr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*yr - rr7*yr*drc7[0];
+      term7 = rr5*drc5[0] - term2*xr;
+      numtyp tixy = ci*term3 - dsr5*dix*yr + diy*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixy - (numtyp)2.0*dsr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
+      numtyp tkxy = ck*term3 + dsr5*dkx*yr - dky*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxy - (numtyp)2.0*dsr7*yr*qkx +(numtyp) 2.0*qky*term7 + qkr*term6;
+
+      term3 = term1*xr*zr - rr3*zr*drc3[0];
+      term5 = term2*xr*zr - rr5*zr*drc5[0];
+      term6 = (bn[4]-dsc7*rr9)*xr*zr - rr7*zr*drc7[0];
+      numtyp tixz = ci*term3 - dsr5*dix*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qixz - (numtyp)2.0*dsr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkxz = ck*term3 + dsr5*dkx*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkxz - (numtyp)2.0*dsr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      term3 = term1*yr*zr - rr3*zr*drc3[1];
+      term4 = rr3*drc3[1] - term1*yr;
+      term5 = term2*yr*zr - rr5*zr*drc5[1];
+      term6 = (bn[4]-dsc7*rr9)*yr*zr - rr7*zr*drc7[1];
+      term7 = rr5*drc5[1] - term2*yr;
+      numtyp tiyz = ci*term3 - dsr5*diy*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*dsr5*qiyz - (numtyp)2.0*dsr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
+      numtyp tkyz = ck*term3 + dsr5*dky*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*dsr5*qkyz - (numtyp)2.0*dsr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      numtyp depx = tixx*ukxp + tixy*ukyp + tixz*ukzp - tkxx*uixp - tkxy*uiyp - tkxz*uizp;
+      numtyp depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp - tkxy*uixp - tkyy*uiyp - tkyz*uizp;
+      numtyp depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp - tkxz*uixp - tkyz*uiyp - tkzz*uizp;
+
+      numtyp frcx = depx;
+      numtyp frcy = depy;
+      numtyp frcz = depz;
+
+      // get the dEp/dR terms used for direct polarization force
+      
+      // tixx and tkxx
+      term1 = bn[2] - psc3*rr5;
+      term2 = bn[3] - psc5*rr7;
+      term3 = -psr3 + term1*xr*xr - rr3*xr*prc3[0];
+      term4 = rr3*prc3[0] - term1*xr - psr5*xr;
+      term5 = term2*xr*xr - psr5 - rr5*xr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*xr - bn[3] - rr7*xr*prc7[0];
+      term7 = rr5*prc5[0] - (numtyp)2.0*bn[3]*xr + (psc5+(numtyp)1.5*psc7)*rr7*xr;
+      tixx = ci*term3 + dix*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixx + (qiy*yr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qix*term7 + qir*term6;
+      tkxx = ck*term3 - dkx*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxx + (qky*yr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qkx*term7 + qkr*term6;
+
+      // tiyy and tkyy
+      term3 = -psr3 + term1*yr*yr - rr3*yr*prc3[1];
+      term4 = rr3*prc3[1] - term1*yr - psr5*yr;
+      term5 = term2*yr*yr - psr5 - rr5*yr*prc5[1];
+      term6 = (bn[4]-psc7*rr9)*yr*yr - bn[3] - rr7*yr*prc7[1];
+      term7 = rr5*prc5[1] - (numtyp)2.0*bn[3]*yr + (psc5+(numtyp)1.5*psc7)*rr7*yr;
+      tiyy = ci*term3 + diy*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qiyy + (qix*xr+qiz*zr)*psc7*rr7 + (numtyp)2.0*qiy*term7 + qir*term6;
+      tkyy = ck*term3 - dky*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkyy + (qkx*xr+qkz*zr)*psc7*rr7 + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      // tizz and tkzz
+      term3 = -psr3 + term1*zr*zr - rr3*zr*prc3[2];
+      term4 = rr3*prc3[2] - term1*zr - psr5*zr;
+      term5 = term2*zr*zr - psr5 - rr5*zr*prc5[2];
+      term6 = (bn[4]-psc7*rr9)*zr*zr - bn[3] - rr7*zr*prc7[2];
+      term7 = rr5*prc5[2] - (numtyp)2.0*bn[3]*zr + (psc5+(numtyp)1.5*psc7)*rr7*zr;
+      tizz = ci*term3 + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qizz + (qix*xr+qiy*yr)*psc7*rr7 + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkzz = ck*term3 - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkzz + (qkx*xr+qky*yr)*psc7*rr7 + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      // tixy and tkxy
+      term3 = term1*xr*yr - rr3*yr*prc3[0];
+      term4 = rr3*prc3[0] - term1*xr;
+      term5 = term2*xr*yr - rr5*yr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*yr - rr7*yr*prc7[0];
+      term7 = rr5*prc5[0] - term2*xr;
+      tixy = ci*term3 - psr5*dix*yr + diy*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixy - (numtyp)2.0*psr7*yr*qix + (numtyp)2.0*qiy*term7 + qir*term6;
+      tkxy = ck*term3 + psr5*dkx*yr - dky*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxy - (numtyp)2.0*psr7*yr*qkx + (numtyp)2.0*qky*term7 + qkr*term6;
+
+      // tixz and tkxz
+      term3 = term1*xr*zr - rr3*zr*prc3[0];
+      term5 = term2*xr*zr - rr5*zr*prc5[0];
+      term6 = (bn[4]-psc7*rr9)*xr*zr - rr7*zr*prc7[0];
+      tixz = ci*term3 - psr5*dix*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qixz - (numtyp)2.0*psr7*zr*qix + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkxz = ck*term3 + psr5*dkx*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkxz - (numtyp)2.0*psr7*zr*qkx + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      // tiyz and tkyz
+      term3 = term1*yr*zr - rr3*zr*prc3[1];
+      term4 = rr3*prc3[1] - term1*yr;
+      term5 = term2*yr*zr - rr5*zr*prc5[1];
+      term6 = (bn[4]-psc7*rr9)*yr*zr - rr7*zr*prc7[1];
+      term7 = rr5*prc5[1] - term2*yr;
+      tiyz = ci*term3 - psr5*diy*zr + diz*term4 + dir*term5 +
+        (numtyp)2.0*psr5*qiyz - (numtyp)2.0*psr7*zr*qiy + (numtyp)2.0*qiz*term7 + qir*term6;
+      tkyz = ck*term3 + psr5*dky*zr - dkz*term4 - dkr*term5 +
+        (numtyp)2.0*psr5*qkyz - (numtyp)2.0*psr7*zr*qky + (numtyp)2.0*qkz*term7 + qkr*term6;
+
+      depx = tixx*ukx + tixy*uky + tixz*ukz - tkxx*uix - tkxy*uiy - tkxz*uiz;
+      depy = tixy*ukx + tiyy*uky + tiyz*ukz - tkxy*uix - tkyy*uiy - tkyz*uiz;
+      depz = tixz*ukx + tiyz*uky + tizz*ukz - tkxz*uix - tkyz*uiy - tkzz*uiz;
+
+      frcx = frcx + depx;
+      frcy = frcy + depy;
+      frcz = frcz + depz;
+
+      // get the dtau/dr terms used for mutual polarization force
+      // poltyp == MUTUAL  && amoeba
+          
+      term1 = bn[2] - usc3*rr5;
+      term2 = bn[3] - usc5*rr7;
+      term3 = usr5 + term1;
+      term4 = rr3 * factor_uscale;
+      term5 = -xr*term3 + rc3[0]*term4;
+      term6 = -usr5 + xr*xr*term2 - rr5*xr*urc5[0];
+      tixx = uix*term5 + uir*term6;
+      tkxx = ukx*term5 + ukr*term6;
+
+      term5 = -yr*term3 + rc3[1]*term4;
+      term6 = -usr5 + yr*yr*term2 - rr5*yr*urc5[1];
+      tiyy = uiy*term5 + uir*term6;
+      tkyy = uky*term5 + ukr*term6;
+
+      term5 = -zr*term3 + rc3[2]*term4;
+      term6 = -usr5 + zr*zr*term2 - rr5*zr*urc5[2];
+      tizz = uiz*term5 + uir*term6;
+      tkzz = ukz*term5 + ukr*term6;
+
+      term4 = -usr5 * yr;
+      term5 = -xr*term1 + rr3*urc3[0];
+      term6 = xr*yr*term2 - rr5*yr*urc5[0];
+      tixy = uix*term4 + uiy*term5 + uir*term6;
+      tkxy = ukx*term4 + uky*term5 + ukr*term6;
+
+      term4 = -usr5 * zr;
+      term6 = xr*zr*term2 - rr5*zr*urc5[0];
+      tixz = uix*term4 + uiz*term5 + uir*term6;
+      tkxz = ukx*term4 + ukz*term5 + ukr*term6;
+
+      term5 = -yr*term1 + rr3*urc3[1];
+      term6 = yr*zr*term2 - rr5*zr*urc5[1];
+      tiyz = uiy*term4 + uiz*term5 + uir*term6;
+      tkyz = uky*term4 + ukz*term5 + ukr*term6;
+
+      depx = tixx*ukxp + tixy*ukyp + tixz*ukzp
+        + tkxx*uixp + tkxy*uiyp + tkxz*uizp;
+      depy = tixy*ukxp + tiyy*ukyp + tiyz*ukzp
+        + tkxy*uixp + tkyy*uiyp + tkyz*uizp;
+      depz = tixz*ukxp + tiyz*ukyp + tizz*ukzp
+        + tkxz*uixp + tkyz*uiyp + tkzz*uizp;
+
+      frcx = frcx + depx;
+      frcy = frcy + depy;
+      frcz = frcz + depz;
+
+      f.x -= frcx;
+      f.y -= frcy;
+      f.z -= frcz;
+
+      if (EVFLAG && vflag) {
+        numtyp vxx = xr * frcx;
+        numtyp vxy = (numtyp)0.5 * (yr*frcx+xr*frcy);
+        numtyp vxz = (numtyp)0.5 * (zr*frcx+xr*frcz);
+        numtyp vyy = yr * frcy;
+        numtyp vyz = (numtyp)0.5 * (zr*frcy+yr*frcz);
+        numtyp vzz = zr * frcz;
+
+        virial[0] += vxx;
+        virial[1] += vyy;
+        virial[2] += vzz;
+        virial[3] += vxy;
+        virial[4] += vxz;
+        virial[5] += vyz;
+      }
+    } // nbor
+    
+  } // ii<inum
+
+  // accumulate ufld and dufld to compute tep
+  store_answers_tep(ufld,dufld,ii,inum,tid,t_per_atom,offset,i,tep);
+
+  // accumate force, energy and virial
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,
+     offset,eflag,vflag,ans,engv);
+}
+
+/* ----------------------------------------------------------------------
+   scan standard neighbor list and make it compatible with 1-5 neighbors
+   if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
+   else scan special15 to see if a 1-5 neighbor and adjust offset to SBBITS15
+   else do nothing to IJ entry
+------------------------------------------------------------------------- */
+
+__kernel void k_special15(__global int * dev_nbor,
+                          const __global int * dev_packed,
+                          const __global tagint *restrict tag,
+                          const __global int *restrict nspecial15,
+                          const __global tagint *restrict special15,
+                          const int inum, const int nall, const int nbor_pitch,
+                          const int t_per_atom) {
+  int tid, ii, offset, n_stride, i;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+  
+    int numj, nbor, nbor_end;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    int n15 = nspecial15[ii];
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+
+      int sj=dev_packed[nbor];
+      int which = sj >> SBBITS & 3;
+      int j = sj & NEIGHMASK;
+      tagint jtag = tag[j];
+
+      if (!which) {
+        int offset=ii;
+        for (int k=0; k<n15; k++) {
+          if (special15[offset] == jtag) {
+            which = 4;
+            break;
+          }
+          offset += nall;
+        }
+      }
+
+      if (which) dev_nbor[nbor] = j ^ (which << SBBITS15);
+    } // for nbor
+
+  } // if ii
+}
+
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -0,0 +1,87 @@
+/***************************************************************************
+                                  amoeba.h
+                             -------------------
+                          Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the amoeba pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#ifndef LAL_AMOEBA_H
+#define LAL_AMOEBA_H
+
+#include "lal_base_amoeba.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class Amoeba : public BaseAmoeba<numtyp, acctyp> {
+ public:
+  Amoeba();
+  ~Amoeba();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, const int max_amtype, const double *host_pdamp,
+           const double *host_thole, const double *host_special_polar_wscale,
+           const double *host_special_polar_piscale,
+           const double *host_special_polar_pscale,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const int maxspecial15, const double cell_size,
+           const double gpu_split, FILE *_screen,
+           const double aewald, const double felec,
+           const double off2, const double polar_dscale,
+           const double polar_uscale);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// pdamp = damping.x; thole = damping.y
+  UCL_D_Vec<numtyp4> damping;
+  /// Special polar values [0-4]: 
+  ///   sp_polar.x = special_polar_wscale
+  ///   sp_polar.y special_polar_pscale,
+  ///   sp_polar.z = special_polar_piscale
+  UCL_D_Vec<numtyp4> sp_polar;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _aewald, _felec, _off2, _polar_dscale, _polar_uscale;
+  numtyp _qqrd2e;
+
+ private:
+  bool _allocated;
+  int loop(const int eflag, const int vflag);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -0,0 +1,142 @@
+/***************************************************************************
+                                 amoeba_ext.cpp
+                             -------------------
+                           Trung Dac Nguyen (Northwestern)
+
+  Functions for LAMMPS access to amoeba acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_amoeba.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static Amoeba<PRECISION,ACC_PRECISION> AMOEBAMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int amoeba_gpu_init(const int ntypes, const int max_amtype,
+                    const double *host_pdamp, const double *host_thole,
+                    const double *host_special_polar_wscale,
+                    const double *host_special_polar_piscale,
+                    const double *host_special_polar_pscale,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const int maxspecial15,
+                    const double cell_size, int &gpu_mode, FILE *screen,
+                    const double aewald, const double felec,
+                    const double off2, const double polar_dscale,
+                    const double polar_uscale, int& tep_size) {
+  AMOEBAMF.clear();
+  gpu_mode=AMOEBAMF.device->gpu_mode();
+  double gpu_split=AMOEBAMF.device->particle_split();
+  int first_gpu=AMOEBAMF.device->first_device();
+  int last_gpu=AMOEBAMF.device->last_device();
+  int world_me=AMOEBAMF.device->world_me();
+  int gpu_rank=AMOEBAMF.device->gpu_rank();
+  int procs_per_gpu=AMOEBAMF.device->procs_per_gpu();
+
+  tep_size=sizeof(PRECISION);
+
+  AMOEBAMF.device->init_message(screen,"amoeba",first_gpu,last_gpu);
+
+  bool message=false;
+  if (AMOEBAMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
+                          host_special_polar_wscale, host_special_polar_piscale,
+                          host_special_polar_pscale, nlocal, nall, max_nbors,
+                          maxspecial, maxspecial15, cell_size, gpu_split, screen,
+                          aewald, felec, off2, polar_dscale, polar_uscale);
+
+  AMOEBAMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=AMOEBAMF.init(ntypes, max_amtype, host_pdamp, host_thole,
+                            host_special_polar_wscale, host_special_polar_piscale,
+                            host_special_polar_pscale, nlocal, nall, max_nbors,
+                            maxspecial, maxspecial15, cell_size, gpu_split, screen,
+                            aewald, felec, off2, polar_dscale, polar_uscale);
+
+    AMOEBAMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    AMOEBAMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void amoeba_gpu_clear() {
+  AMOEBAMF.clear();
+}
+
+int** amoeba_gpu_compute_n(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           int *host_amtype, int *host_amgroup,
+                           double **host_rpole, double **host_uind, double **host_uinp,
+                           double *sublo, double *subhi, tagint *tag, int **nspecial,
+                           tagint **special, int *nspecial15, tagint** special15,
+                           const bool eflag, const bool vflag,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum, const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd, void **tep_ptr) {
+  return AMOEBAMF.compute(ago, inum_full, nall, host_x, host_type,
+                          host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                          sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                          eflag, vflag, eatom,
+                          vatom, host_start, ilist, jnum, cpu_time, success,
+                          host_q, boxlo, prd, tep_ptr);
+}
+
+void amoeba_gpu_compute(const int ago, const int inum_full, const int nall,
+                        double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+                        double **host_rpole, double **host_uind, double **host_uinp,
+                        int *ilist, int *numj, int **firstneigh,
+                        const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success, double *host_q,
+                        const int nlocal, double *boxlo, double *prd, void **tep_ptr) {
+  AMOEBAMF.compute(ago,inum_full, nall, host_x, host_type,
+                   host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
+                   ilist, numj, firstneigh, eflag, vflag, eatom, vatom,
+                   host_start, cpu_time, success, host_q, nlocal, boxlo, prd, tep_ptr);
+}
+
+double amoeba_gpu_bytes() {
+  return AMOEBAMF.host_memory_usage();
+}
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -0,0 +1,516 @@
+/***************************************************************************
+                               base_amoeba.cpp
+                             -------------------
+                            Trung Dac Nguyen (Northwestern)
+
+  Base class for pair styles needing per-particle data for position,
+  charge, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#include "lal_base_amoeba.h"
+namespace LAMMPS_AL {
+#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> global_device;
+
+template <class numtyp, class acctyp>
+BaseAmoebaT::BaseAmoeba() : _compiled(false), _max_bytes(0) {
+  device=&global_device;
+  ans=new Answer<numtyp,acctyp>();
+  nbor=new Neighbor();
+  pair_program=nullptr;
+  ucl_device=nullptr;
+  #if defined(LAL_OCL_EV_JIT)
+  pair_program_noev=nullptr;
+  #endif
+}
+
+template <class numtyp, class acctyp>
+BaseAmoebaT::~BaseAmoeba() {
+  delete ans;
+  delete nbor;
+  k_polar.clear();
+  k_special15.clear();
+  if (pair_program) delete pair_program;
+}
+
+template <class numtyp, class acctyp>
+int BaseAmoebaT::bytes_per_atom_atomic(const int max_nbors) const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
+         nbor->bytes_per_atom(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
+                             const int max_nbors, const int maxspecial,
+                             const int maxspecial15,
+                             const double cell_size, const double gpu_split,
+                             FILE *_screen, const void *pair_program,
+                             const char *k_name) {
+  screen=_screen;
+
+  int gpu_nbor=0;
+  if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (device->gpu_mode()==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+
+  int _gpu_host=0;
+  int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
+  if (host_nlocal>0)
+    _gpu_host=1;
+
+  _threads_per_atom=device->threads_per_charge();
+
+  bool charge = true;
+  bool rot = false;
+  bool vel = false;
+  _extra_fields = 24; // round up to accomodate quadruples of numtyp values
+                      // rpole 13; uind 3; uinp 3; amtype, amgroup
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields);
+  if (success!=0)
+    return success;
+
+  if (ucl_device!=device->gpu) _compiled=false;
+
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pair_block_size();
+  _block_bio_size=device->block_bio_pair();
+  compile_kernels(*ucl_device,pair_program,k_name);
+
+  if (_threads_per_atom>1 && gpu_nbor==0) {
+    nbor->packing(true);
+    _nbor_data=&(nbor->dev_packed);
+  } else
+    _nbor_data=&(nbor->dev_nbor);
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+  if (success!=0)
+    return success;
+
+  // Initialize host-device load balancer
+  hd_balancer.init(device,gpu_nbor,gpu_split);
+
+  // Initialize timers for the selected GPU
+  time_pair.init(*ucl_device);
+  time_pair.zero();
+
+  pos_tex.bind_float(atom->x,4);
+  q_tex.bind_float(atom->q,1);
+
+  _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  _maxspecial=maxspecial;
+  _maxspecial15=maxspecial15;
+
+  // allocate per-atom array tep 
+
+  int ef_nall=nall;
+  if (ef_nall==0)
+    ef_nall=2000;
+
+  _max_tep_size=static_cast<int>(static_cast<double>(ef_nall)*1.10);
+  _tep.alloc(_max_tep_size*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
+  dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
+  dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
+
+  return success;
+}
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::estimate_gpu_overhead(const int add_kernels) {
+  device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
+}
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::clear_atomic() {
+  // Output any timing information
+  acc_timers();
+  double avg_split=hd_balancer.all_avg_split();
+  _gpu_overhead*=hd_balancer.timestep();
+  _driver_overhead*=hd_balancer.timestep();
+  device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
+                       _gpu_overhead,_driver_overhead,_threads_per_atom,screen);
+
+  time_pair.clear();
+  hd_balancer.clear();
+
+  nbor->clear();
+  ans->clear();
+
+  _tep.clear();
+  dev_nspecial15.clear();
+  dev_special15.clear();
+  dev_special15_t.clear();
+}
+
+// ---------------------------------------------------------------------------
+// Copy neighbor list from host
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int * BaseAmoebaT::reset_nbors(const int nall, const int inum, int *ilist,
+                                   int *numj, int **firstneigh, bool &success) {
+  success=true;
+
+  int mn=nbor->max_nbor_loop(inum,numj,ilist);
+  resize_atom(inum,nall,success);
+  resize_local(inum,mn,success);
+  if (!success)
+    return nullptr;
+
+  nbor->get_host(inum,ilist,numj,firstneigh,block_size());
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+
+  return ilist;
+}
+
+// ---------------------------------------------------------------------------
+// Build neighbor list on device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+inline void BaseAmoebaT::build_nbor_list(const int inum, const int host_inum,
+                                         const int nall, double **host_x,
+                                         int *host_type, double *sublo,
+                                         double *subhi, tagint *tag,
+                                         int **nspecial, tagint **special,
+                                         int *nspecial15, tagint **special15,
+                                         bool &success) {
+  success=true;
+  resize_atom(inum,nall,success);
+  resize_local(inum,host_inum,nbor->max_nbors(),success);
+  if (!success)
+    return;
+  atom->cast_copy_x(host_x,host_type);
+
+  int mn;
+  nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
+                        tag, nspecial, special, success, mn, ans->error_flag);
+
+  // add one-five neighbors
+
+  if (_maxspecial15>0) {
+    UCL_H_Vec<int> view_nspecial15;
+    UCL_H_Vec<tagint> view_special15;
+    view_nspecial15.view(nspecial15,nall,*ucl_device);
+    view_special15.view(special15[0],nall*_maxspecial15,*ucl_device);
+    ucl_copy(dev_nspecial15,view_nspecial15,nall,false);
+    ucl_copy(dev_special15_t,view_special15,_maxspecial15*nall,false);
+    nbor->transpose(dev_special15, dev_special15_t, _maxspecial15, nall);
+
+    add_onefive_neighbors();
+  }
+
+  double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
+  if (bytes>_max_an_bytes)
+    _max_an_bytes=bytes;
+}
+
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void BaseAmoebaT::compute(const int f_ago, const int inum_full, const int nall,
+                          double **host_x, int *host_type, int *host_amtype,
+                          int *host_amgroup, double **host_rpole,
+                          double **host_uind, double **host_uinp,
+                          int *ilist, int *numj, int **firstneigh,
+                          const bool eflag_in, const bool vflag_in,
+                          const bool eatom, const bool vatom,
+                          int &host_start, const double cpu_time,
+                          bool &success, double *host_q, const int nlocal,
+                          double *boxlo, double *prd, void **tep_ptr) {
+  acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  set_kernel(eflag,vflag);
+
+  // ------------------- Resize _tep array ------------------------
+
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);
+
+    dev_nspecial15.clear();
+    dev_special15.clear();
+    dev_special15_t.clear();
+    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
+    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
+    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);   
+  }
+
+  *tep_ptr=_tep.host.begin();
+
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return;
+  }
+
+  int ago=hd_balancer.ago_first(f_ago);
+  int inum=hd_balancer.balance(ago,inum_full,cpu_time);
+  ans->inum(inum);
+  host_start=inum;
+
+  if (ago==0) {
+    reset_nbors(nall, inum, ilist, numj, firstneigh, success);
+    if (!success)
+      return;
+  }
+
+  // packing host arrays into host_extra
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
+  hd_balancer.start_timer();
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+  atom->add_extra_data();
+
+  device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  const int red_blocks=loop(eflag,vflag);
+  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+}
+
+// ---------------------------------------------------------------------------
+// Reneighbor on GPU if necessary and then compute forces, virials, energies
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int** BaseAmoebaT::compute(const int ago, const int inum_full, const int nall,
+                           double **host_x, int *host_type, int *host_amtype,
+                           int *host_amgroup, double **host_rpole,
+                           double **host_uind, double **host_uinp,
+                           double *sublo, double *subhi, tagint *tag,
+                           int **nspecial, tagint **special,
+                           int *nspecial15, tagint **special15,
+                           const bool eflag_in, const bool vflag_in,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum, const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd, void **tep_ptr) {
+  acc_timers();
+  int eflag, vflag;
+  if (eatom) eflag=2;
+  else if (eflag_in) eflag=1;
+  else eflag=0;
+  if (vatom) vflag=2;
+  else if (vflag_in) vflag=1;
+  else vflag=0;
+
+  #ifdef LAL_NO_BLOCK_REDUCE
+  if (eflag) eflag=2;
+  if (vflag) vflag=2;
+  #endif
+
+  set_kernel(eflag,vflag);
+
+  // ------------------- Resize _tep array ------------------------
+
+  if (nall>_max_tep_size) {
+    _max_tep_size=static_cast<int>(static_cast<double>(nall)*1.10);
+    _tep.resize(_max_tep_size*4);
+
+    dev_nspecial15.clear();
+    dev_special15.clear();
+    dev_special15_t.clear();
+    dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
+    dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
+    dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);   
+  }
+  *tep_ptr=_tep.host.begin();
+
+  if (inum_full==0) {
+    host_start=0;
+    // Make sure textures are correct if realloc by a different hybrid style
+    resize_atom(0,nall,success);
+    zero_timers();
+    return nullptr;
+  }
+
+  hd_balancer.balance(cpu_time);
+  int inum=hd_balancer.get_gpu_count(ago,inum_full);
+  ans->inum(inum);
+  host_start=inum;
+
+  // Build neighbor list on GPU if necessary
+  if (ago==0) {
+    build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
+                    sublo, subhi, tag, nspecial, special, nspecial15, special15,
+                    success);
+    if (!success)
+      return nullptr;
+    atom->cast_q_data(host_q);
+    cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
+    hd_balancer.start_timer();
+  } else {
+    atom->cast_x_data(host_x,host_type);
+    atom->cast_q_data(host_q);
+    cast_extra_data(host_amtype, host_amgroup, host_rpole, host_uind, host_uinp);
+    hd_balancer.start_timer();
+    atom->add_x_data(host_x,host_type);
+  }
+  atom->add_q_data();
+  atom->add_extra_data();
+
+  *ilist=nbor->host_ilist.begin();
+  *jnum=nbor->host_acc.begin();
+
+  device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
+                     boxlo, prd);
+
+  const int red_blocks=loop(eflag,vflag);
+  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
+  device->add_ans_object(ans);
+  hd_balancer.stop_timer();
+
+  // copy tep from device to host
+
+  _tep.update_host(_max_tep_size*4,false);
+/*  
+  printf("GPU lib: tep size = %d: max tep size = %d\n", this->_tep.cols(), _max_tep_size);
+  for (int i = 0; i < 10; i++) {
+    numtyp4* p = (numtyp4*)(&this->_tep[4*i]);
+    printf("i = %d; tep = %f %f %f\n", i, p->x, p->y, p->z);
+  }
+*/  
+  return nbor->host_jlist.begin()-host_start;
+}
+
+template <class numtyp, class acctyp>
+double BaseAmoebaT::host_memory_usage_atomic() const {
+  return device->atom.host_memory_usage()+nbor->host_memory_usage()+
+         4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
+}
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
+    double** uind, double** uinp) {
+  int _nall=atom->nall();
+  numtyp *pextra=reinterpret_cast<numtyp*>(&(atom->extra[0]));
+
+  int n = 0;
+  int nstride = 4;
+  for (int i = 0; i < _nall; i++) {
+    int idx = n+i*nstride;
+    pextra[idx]   = rpole[i][0];
+    pextra[idx+1] = rpole[i][1];
+    pextra[idx+2] = rpole[i][2];
+    pextra[idx+3] = rpole[i][3];
+  }
+
+  n += nstride*_nall;
+  for (int i = 0; i < _nall; i++) {
+    int idx = n+i*nstride;
+    pextra[idx]   = rpole[i][4];
+    pextra[idx+1] = rpole[i][5];
+    pextra[idx+2] = rpole[i][6];
+    pextra[idx+3] = rpole[i][8];
+  }
+
+  n += nstride*_nall;
+  for (int i = 0; i < _nall; i++) {
+    int idx = n+i*nstride;
+    pextra[idx]   = rpole[i][9];
+    pextra[idx+1] = rpole[i][12];
+    pextra[idx+2] = (numtyp)amtype[i];
+    pextra[idx+3] = (numtyp)amgroup[i];
+  }
+
+  n += nstride*_nall;
+  for (int i = 0; i < _nall; i++) {
+    int idx = n+i*nstride;
+    pextra[idx]   = uind[i][0];
+    pextra[idx+1] = uind[i][1];
+    pextra[idx+2] = uind[i][2];
+  }
+
+  n += nstride*_nall;
+  for (int i = 0; i < _nall; i++) {
+    int idx = n+i*nstride;
+    pextra[idx]   = uinp[i][0];
+    pextra[idx+1] = uinp[i][1];
+    pextra[idx+2] = uinp[i][2];    
+  }
+}
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
+                                  const char *kname) {
+  if (_compiled)
+    return;
+
+  if (pair_program) delete pair_program;
+  pair_program=new UCL_Program(dev);
+  std::string oclstring = device->compile_string()+" -DEVFLAG=1";
+  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
+  
+  k_polar.set_function(*pair_program,kname);
+  k_special15.set_function(*pair_program,"k_special15");
+  pos_tex.get_texture(*pair_program,"pos_tex");
+  q_tex.get_texture(*pair_program,"q_tex");
+  
+  _compiled=true;
+
+  #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
+  if (dev.has_subgroup_support()) {
+    size_t mx_subgroup_sz = k_polar.max_subgroup_size(_block_size);
+    if (_threads_per_atom > mx_subgroup_sz)
+      _threads_per_atom = mx_subgroup_sz;
+    device->set_simd_size(mx_subgroup_sz);
+  }
+  #endif
+
+}
+
+template <class numtyp, class acctyp>
+int BaseAmoebaT::add_onefive_neighbors() {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
+                               (BX/_threads_per_atom)));
+
+  int _nall=atom->nall();
+  int ainum=ans->inum();
+  int nbor_pitch=nbor->nbor_pitch();
+  
+  k_special15.set_size(GX,BX);
+  k_special15.run(&nbor->dev_nbor, &_nbor_data->begin(),
+                    &atom->dev_tag, &dev_nspecial15, &dev_special15,
+                    &ainum, &_nall, &nbor_pitch,
+                    &_threads_per_atom);
+  
+  return GX;
+}
+
+template class BaseAmoeba<PRECISION,ACC_PRECISION>;
+}
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -0,0 +1,225 @@
+/***************************************************************************
+                                base_amoeba.h
+                             -------------------
+                        Trung Dac Nguyen (Northwestern)
+
+  Base class for pair styles needing per-particle data for position,
+  charge, and type.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : trung.nguyen@northwestern.edu
+ ***************************************************************************/
+
+#ifndef LAL_BASE_AMOEBA_H
+#define LAL_BASE_AMOEBA_H
+
+#include "lal_device.h"
+#include "lal_balance.h"
+#include "mpi.h"
+
+#if defined(USE_OPENCL)
+#include "geryon/ocl_texture.h"
+#elif defined(USE_CUDART)
+#include "geryon/nvc_texture.h"
+#elif defined(USE_HIP)
+#include "geryon/hip_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BaseAmoeba {
+ public:
+  BaseAmoeba();
+  virtual ~BaseAmoeba();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param k_name name for the kernel for force calculation
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init_atomic(const int nlocal, const int nall, const int max_nbors,
+                  const int maxspecial, const int maxspecial15, const double cell_size,
+                  const double gpu_split, FILE *screen,
+                  const void *pair_program, const char *k_name);
+
+  /// Estimate the overhead for GPU context changes and CPU driver
+  void estimate_gpu_overhead(const int add_kernels=0);
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {
+      pos_tex.bind_float(atom->x,4);
+      q_tex.bind_float(atom->q,1);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param nlocal number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to copied to host
+    * \param current maximum number of neighbors
+    * \note olist_size=total number of local particles **/
+  inline void resize_local(const int inum, const int max_nbors, bool &success) {
+    nbor->resize(inum,max_nbors,success);
+  }
+
+  /// Check if there is enough storage for neighbors and realloc if not
+  /** \param nlocal number of particles whose nbors must be stored on device
+    * \param host_inum number of particles whose nbors need to copied to host
+    * \param current maximum number of neighbors
+    * \note host_inum is 0 if the host is performing neighboring
+    * \note nlocal+host_inum=total number local particles
+    * \note olist_size=0 **/
+  inline void resize_local(const int inum, const int host_inum,
+                           const int max_nbors, bool &success) {
+    nbor->resize(inum,host_inum,max_nbors,success);
+  }
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear_atomic();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom_atomic(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage_atomic() const;
+
+  /// Accumulate timers
+  inline void acc_timers() {
+    if (device->time_device()) {
+      nbor->acc_timers(screen);
+      time_pair.add_to_total();
+      atom->acc_timers();
+      ans->acc_timers();
+    }
+  }
+
+  /// Zero timers
+  inline void zero_timers() {
+    time_pair.zero();
+    atom->zero_timers();
+    ans->zero_timers();
+  }
+
+  /// Copy neighbor list from host
+  int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
+                    int **firstneigh, bool &success);
+
+  /// Build neighbor list on device
+  void build_nbor_list(const int inum, const int host_inum,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, int *nspecial15, tagint **special15,
+                       bool &success);
+
+  /// Pair loop with host neighboring
+  void compute(const int f_ago, const int inum_full, const int nall,
+               double **host_x, int *host_type, int *host_amtype,
+               int *host_amgroup, double **host_rpole, double **host_uind,
+               double **host_uinp, int *ilist, int *numj,
+               int **firstneigh, const bool eflag, const bool vflag,
+               const bool eatom, const bool vatom, int &host_start,
+               const double cpu_time, bool &success, double *charge,
+               const int nlocal, double *boxlo, double *prd, void **tep_ptr);
+
+  /// Pair loop with device neighboring
+  int** compute(const int ago, const int inum_full, const int nall,
+                double **host_x, int *host_type, int *host_amtype,
+                int *host_amgroup, double **host_rpole, double **host_uind,
+                double **host_uinp, double *sublo, double *subhi,
+                tagint *tag, int **nspecial, tagint **special,
+                int *nspecial15, tagint **special15,
+                const bool eflag, const bool vflag,
+                const bool eatom, const bool vatom, int &host_start,
+                int **ilist, int **numj, const double cpu_time, bool &success,
+                double *charge, double *boxlo, double *prd, void **tep_ptr);
+
+  // -------------------------- DEVICE DATA -------------------------
+
+  /// Device Properties and Atom and Neighbor storage
+  Device<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_pair;
+
+  /// Host device load balancer
+  Balance<numtyp,acctyp> hd_balancer;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data
+  Atom<numtyp,acctyp> *atom;
+
+  UCL_Vector<numtyp,numtyp> polar1, polar2, polar3, polar4, polar5;
+
+  /// cast host arrays into a single array for atom->extra
+  void cast_extra_data(int* amtype, int* amgroup, double** rpole,
+    double** uind, double** uinp);
+
+  /// Per-atom arrays
+  UCL_Vector<numtyp,numtyp> _tep;
+  int _max_tep_size;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+
+  Answer<numtyp,acctyp> *ans;
+
+  // --------------------------- NBOR DATA ----------------------------
+
+  /// Neighbor data
+  Neighbor *nbor;
+  /// Device storage for 1-5 special neighbor counts
+  UCL_D_Vec<int> dev_nspecial15;
+  /// Device storage for special neighbors
+  UCL_D_Vec<tagint> dev_special15, dev_special15_t;
+
+  int add_onefive_neighbors();
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pair_program;
+  UCL_Kernel k_polar,k_special15;
+  inline int block_size() { return _block_size; }
+  inline void set_kernel(const int eflag, const int vflag) {}
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;
+  UCL_Texture q_tex;
+
+ protected:
+  bool _compiled;
+  int _block_size, _block_bio_size, _threads_per_atom;
+  int _extra_fields;
+  double  _max_bytes, _max_an_bytes, _maxspecial, _maxspecial15;
+  double _gpu_overhead, _driver_overhead;
+  UCL_D_Vec<int> *_nbor_data;
+
+  void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
+
+  virtual int loop(const int eflag, const int vflag) = 0;
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@ -72,7 +72,9 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_atom();

-  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  bool charge = false;
+  bool rot = false;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@ -72,7 +72,9 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_charge();

-  int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
+  bool charge = true;
+  bool rot = false;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -73,7 +73,9 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_charge();

-  int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
+  bool charge = true;
+  bool rot = true;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dpd.cpp
+++ b/lib/gpu/lal_base_dpd.cpp
@ -72,7 +72,10 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_atom();

-  int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
+  bool charge = false;
+  bool rot = false;
+  bool vel = true;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@ -94,7 +94,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
  else
    _threads_per_atom=device->threads_per_three();

-  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  bool charge = false;
+  bool rot = false;
+  int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -579,6 +579,11 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
    time_nbor.stop();
    if (_time_device)
      time_nbor.add_to_total();
+
+    // on the host, special[i][j] = the special j neighbor of atom i (nall by maxspecial)
+    // on the device, transpose the matrix (1-d array) for coalesced reads
+    //   dev_special[i][j] = the special i neighbor of atom j
+
    time_transpose.start();
    const int b2x=_block_cell_2d;
    const int b2y=_block_cell_2d;
@ -682,6 +687,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
    if (_cutoff < _cell_size) vadjust*=1.46;
    mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
    if (mn<33) mn+=3;
+    
    resize_max_neighbors<numtyp,acctyp>(mn,success);
    set_nbor_block_size(mn/2);
    if (!success)
@ -834,6 +840,17 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
  time_nbor.stop();
 }

+void Neighbor::transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in, 
+    const int columns_in, const int rows_in)
+{
+  const int b2x=_block_cell_2d;
+  const int b2y=_block_cell_2d;
+  const int g2x=static_cast<int>(ceil(static_cast<double>(columns_in)/b2x));
+  const int g2y=static_cast<int>(ceil(static_cast<double>(rows_in)/b2y));
+  _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
+  _shared->k_transpose.run(&out, &in, &columns_in, &rows_in);
+}
+
 template void Neighbor::build_nbor_list<PRECISION,ACC_PRECISION>
     (double **x, const int inum, const int host_inum, const int nall,
      Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@ -260,6 +260,10 @@ class Neighbor {
    return o.str();
  }

+  /// Helper function
+  void transpose(UCL_D_Vec<tagint> &out, const UCL_D_Vec<tagint> &in, 
+    const int columns_in, const int rows_in);
+
 private:
  NeighborShared *_shared;
  UCL_Device *dev;
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -44,6 +44,19 @@ _texture_2d( pos_tex,int4);
 #define LAL_USE_OLD_NEIGHBOR
 #endif

+/*
+  compute the id of the cell where the atoms belong to
+x: atom coordinates
+cell_id: cell ids
+particle_id: 
+boxlo[0-2]: the lower left corner of the local box
+ncell[xyz]: the number of cells in xyz dims
+i_cell_size is the inverse cell size
+inum = the number of the local atoms that are ported to the device
+nall = the number of the local+ghost atoms that are ported to the device
+cells_in_cutoff = the number of cells that are within the cutoff
+*/
+
 __kernel void calc_cell_id(const numtyp4 *restrict x_,
                           unsigned *restrict cell_id,
                           int *restrict particle_id,
@ -86,6 +99,8 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
  }
 }

+// compute the number of atoms in each cell
+
 __kernel void kernel_calc_cell_counts(const unsigned *restrict cell_id,
                                      int *restrict cell_counts,
                                      int nall, int ncell) {
--- a/src/AMOEBA/pair_amoeba.h
+++ b/src/AMOEBA/pair_amoeba.h
@ -357,7 +357,7 @@ class PairAmoeba : public Pair {

  void polar();
  void polar_energy();
-  void polar_real();
+  virtual void polar_real();
  void polar_kspace();
  void damppole(double, int, double, double, double *, double *, double *);

--- a/src/GPU/Install.sh
+++ b/src/GPU/Install.sh
@ -41,6 +41,8 @@ action fix_npt_gpu.cpp
 action fix_nve_asphere_gpu.h fix_nve_asphere.h
 action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp
 action gpu_extra.h
+action pair_amoeba_gpu.cpp pair_amoeba.cpp
+action pair_amoeba_gpu.h pair_amoeba.h
 action pair_beck_gpu.cpp pair_beck.cpp
 action pair_beck_gpu.h pair_beck.h
 action pair_born_coul_long_gpu.cpp pair_born_coul_long.cpp
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -0,0 +1,299 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Trung Nguyen (Northwestern)
+------------------------------------------------------------------------- */
+
+#include "pair_amoeba_gpu.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "gpu_extra.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "suffix.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+// External functions from cuda library for atom decomposition
+
+int amoeba_gpu_init(const int ntypes, const int max_amtype,
+                    const double *host_pdamp, const double *host_thole,
+                    const double *host_special_polar_wscale,
+                    const double *host_special_polar_piscale,
+                    const double *host_special_polar_pscale,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const int maxspecial15,
+                    const double cell_size, int &gpu_mode, FILE *screen,
+                    const double aewald, const double felec,
+                    const double off2, const double polar_dscale,
+                    const double polar_uscale, int& tep_size);
+void amoeba_gpu_clear();
+
+int ** amoeba_gpu_compute_n(const int ago, const int inum, const int nall,
+                            double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
+                            double **host_rpole, double **host_uind, double **host_uinp,
+                            double *sublo, double *subhi, tagint *tag, int **nspecial,
+                            tagint **special, int* nspecial15, tagint** special15,
+                            const bool eflag, const bool vflag,
+                            const bool eatom, const bool vatom, int &host_start,
+                            int **ilist, int **jnum, const double cpu_time,
+                            bool &success, double *host_q, double *boxlo, double *prd,
+                            void **tep_ptr);
+void amoeba_gpu_compute(const int ago, const int inum,
+                        const int nall, double **host_x, int *host_type,
+                        int *host_amtype, int *host_amgroup,
+                        double **host_rpole, double **host_uind, double **host_uinp,
+                        int *ilist, int *numj, int **firstneigh,
+                        const bool eflag, const bool vflag, const bool eatom,
+                        const bool vatom, int &host_start, const double cpu_time,
+                        bool &success, double *host_q, const int nlocal,
+                        double *boxlo, double *prd, void **tep_ptr);
+
+double amoeba_gpu_bytes();
+
+enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG};
+
+/* ---------------------------------------------------------------------- */
+
+PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
+{
+  respa_enable = 0;
+  reinitflag = 0;
+  cpu_time = 0.0;
+  suffix_flag |= Suffix::GPU;
+  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairAmoebaGPU::~PairAmoebaGPU()
+{
+  amoeba_gpu_clear();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairAmoebaGPU::polar_real()
+{
+  int eflag=1, vflag=1;
+  int nall = atom->nlocal + atom->nghost;
+  int inum, host_start;
+
+  bool success = true;
+  int *ilist, *numneigh, **firstneigh;
+  if (gpu_mode != GPU_FORCE) {
+    double sublo[3],subhi[3];
+    if (domain->triclinic == 0) {
+      sublo[0] = domain->sublo[0];
+      sublo[1] = domain->sublo[1];
+      sublo[2] = domain->sublo[2];
+      subhi[0] = domain->subhi[0];
+      subhi[1] = domain->subhi[1];
+      subhi[2] = domain->subhi[2];
+    } else {
+      domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi);
+    }
+    inum = atom->nlocal;
+
+    firstneigh = amoeba_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+                                      atom->type, amtype, amgroup,
+                                      rpole, uind, uinp, sublo, subhi,
+                                      atom->tag, atom->nspecial, atom->special,
+                                      atom->nspecial15, atom->special15,
+                                      eflag, vflag, eflag_atom, vflag_atom,
+                                      host_start, &ilist, &numneigh, cpu_time,
+                                      success, atom->q, domain->boxlo,
+                                      domain->prd, &tep_pinned);
+
+  } else {
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    
+    amoeba_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+                       amtype, amgroup, rpole, uind, uinp,
+                       ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
+                       vflag_atom, host_start, cpu_time, success, atom->q,
+                       atom->nlocal, domain->boxlo, domain->prd, &tep_pinned);
+  }
+  if (!success)
+    error->one(FLERR,"Insufficient memory on accelerator");
+
+  // reference to the tep array from GPU lib
+
+  if (tep_single) {
+    float *tep_ptr = (float *)tep_pinned;
+    compute_force_from_tep<float>(tep_ptr);
+  } else {
+    double *tep_ptr = (double *)tep_pinned;
+    compute_force_from_tep<double>(tep_ptr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template <class numtyp>
+void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr)
+{
+  int i,ix,iy,iz;
+  double ci,dix,diy,diz;
+  double qixx,qixy,qixz;
+  double qiyy,qiyz,qizz;
+  double xix,yix,zix;
+  double xiy,yiy,ziy;
+  double xiz,yiz,ziz;
+  double vxx,vyy,vzz;
+  double vxy,vxz,vyz;
+  double fix[3],fiy[3],fiz[3],tep[4];
+
+  double** x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    dix = rpole[i][1];
+    diy = rpole[i][2];
+    diz = rpole[i][3];
+    qixx = rpole[i][4];
+    qixy = rpole[i][5];
+    qixz = rpole[i][6];
+    qiyy = rpole[i][8];
+    qiyz = rpole[i][9];
+    qizz = rpole[i][12];
+    
+    tep[0] = tep_ptr[4*i];
+    tep[1] = tep_ptr[4*i+1];
+    tep[2] = tep_ptr[4*i+2];
+    torque2force(i,tep,fix,fiy,fiz,fpolar);
+
+    iz = zaxis2local[i];
+    ix = xaxis2local[i];
+    iy = yaxis2local[i];
+
+    xiz = x[iz][0] - x[i][0];
+    yiz = x[iz][1] - x[i][1];
+    ziz = x[iz][2] - x[i][2];
+    xix = x[ix][0] - x[i][0];
+    yix = x[ix][1] - x[i][1];
+    zix = x[ix][2] - x[i][2];
+    xiy = x[iy][0] - x[i][0];
+    yiy = x[iy][1] - x[i][1];
+    ziy = x[iy][2] - x[i][2];
+
+    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
+    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
+    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
+    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + 
+                xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
+    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + 
+                xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
+    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
+                yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
+
+    virpolar[0] += vxx;
+    virpolar[1] += vyy;
+    virpolar[2] += vzz;
+    virpolar[3] += vxy;
+    virpolar[4] += vxz;
+    virpolar[5] += vyz;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairAmoebaGPU::init_style()
+{
+  PairAmoeba::init_style();
+
+  if (gpu_mode == GPU_FORCE) {
+    if (comm->me == 0)
+      error->warning(FLERR,"Pair style amoeba/gpu does not support neigh no "
+        "for now, automatically switching to neigh yes");
+    gpu_mode = GPU_NEIGH;
+  } 
+
+  // Repeat cutsq calculation because done after call to init_style
+
+  double maxcut = -1.0;
+  double cut;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cut *= cut;
+        if (cut > maxcut)
+          maxcut = cut;
+        cutsq[i][j] = cutsq[j][i] = cut;
+      } else
+        cutsq[i][j] = cutsq[j][i] = 0.0;
+    }
+  }
+
+  // select the cutoff (off2) for neighbor list builds (the polar term for now)
+
+  if (use_ewald) choose(POLAR_LONG);
+  else choose(POLAR);
+
+  double cell_size = sqrt(off2) + neighbor->skin;
+
+  int maxspecial=0;
+  int maxspecial15=0;
+  if (atom->molecular != Atom::ATOMIC) {
+    maxspecial=atom->maxspecial;
+    maxspecial15=atom->maxspecial15;
+  }
+    
+  int tep_size;
+  int mnf = 5e-2 * neighbor->oneatom;
+
+  // set the energy unit conversion factor for polar real-space calculation
+
+  double felec = 0.5 * electric / am_dielectric;
+  
+  int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole,
+                                special_polar_wscale, special_polar_piscale,
+                                special_polar_pscale, atom->nlocal,
+                                atom->nlocal+atom->nghost, mnf, maxspecial,
+                                maxspecial15, cell_size, gpu_mode, screen,
+                                aewald, felec, off2, polar_dscale, polar_uscale,
+                                tep_size);
+  GPU_EXTRA::check_flag(success,error,world);
+
+  if (tep_size == sizeof(double))
+    tep_single = false;
+  else
+    tep_single = true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairAmoebaGPU::memory_usage()
+{
+  double bytes = Pair::memory_usage();
+  return bytes + amoeba_gpu_bytes();
+}
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@ -0,0 +1,63 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(amoeba/gpu,PairAmoebaGPU);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_AMOEBA_GPU_H
+#define LMP_PAIR_AMOEBA_GPU_H
+
+#include "pair_amoeba.h"
+
+namespace LAMMPS_NS {
+
+class PairAmoebaGPU : public PairAmoeba {
+ public:
+  PairAmoebaGPU(LAMMPS *lmp);
+  ~PairAmoebaGPU();
+  void init_style();
+  double memory_usage();
+
+  enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH };
+
+  virtual void polar_real();
+
+ private:
+  int gpu_mode;
+  double cpu_time;
+  void *tep_pinned;
+  bool tep_single;
+
+  template<class numtyp>
+  void compute_force_from_tep(const numtyp*);
+};
+
+}    // namespace LAMMPS_NS
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Insufficient memory on accelerator
+
+There is insufficient memory on one of the devices specified for the gpu
+package
+
+E: Pair style amoeba/gpu requires atom attribute q
+
+The atom style defined does not have this attribute.
+
+*/