Fixed bugs in the multipole real-space part on the GPU. Run separately, multipole real and polar real work correctly (along with udirect2b and umutual2b), but

run together they conflict, because both use ans to copy forces back from the device to the host. The other two kernels (the induce part) do not touch forces or energies.
2021-09-17 15:24:36 -05:00
parent d926705950
commit 2e6df83b9b
10 changed files with 123 additions and 104 deletions
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -53,8 +53,7 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
                  const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const int maxspecial15,
                  const double cell_size, const double gpu_split, FILE *_screen,
-                  const double aewald, const double polar_dscale,
-                  const double polar_uscale) {
+                  const double polar_dscale, const double polar_uscale) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,maxspecial15,
                            cell_size,gpu_split,_screen,amoeba,
@ -97,7 +96,6 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const double *host_pda
  }
  ucl_copy(sp_polar,dview,5,false);

-  _aewald = aewald;
  _polar_dscale = polar_dscale;
  _polar_uscale = polar_uscale;

@ -158,7 +156,7 @@ int AmoebaT::multipole_real(const int eflag, const int vflag) {
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,  &_aewald, &this->_felec,
+                    &this->_threads_per_atom,  &this->_aewald, &this->_felec,
                    &this->_off2_mpole, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

@ -198,7 +196,7 @@ int AmoebaT::udirect2b(const int eflag, const int vflag) {
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor,
                        &this->_fieldp, &ainum, &_nall, &nbor_pitch,
-                        &this->_threads_per_atom, &_aewald, &this->_off2_polar,
+                        &this->_threads_per_atom, &this->_aewald, &this->_off2_polar,
                        &_polar_dscale, &_polar_uscale);

  this->time_pair.stop();
@ -237,7 +235,7 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
  this->k_umutual2b.run(&this->atom->x, &this->atom->extra, &damping, &sp_polar,
                        &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                        &this->dev_short_nbor, &this->_fieldp, &ainum, &_nall,
-                        &nbor_pitch, &this->_threads_per_atom, &_aewald,
+                        &nbor_pitch, &this->_threads_per_atom, &this->_aewald,
                        &this->_off2_polar, &_polar_dscale, &_polar_uscale);

  this->time_pair.stop();
@ -278,7 +276,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) {
                    &this->dev_short_nbor,
                    &this->ans->force, &this->ans->engv, &this->_tep,
                    &eflag, &vflag, &ainum, &_nall, &nbor_pitch,
-                    &this->_threads_per_atom,  &_aewald, &this->_felec,
+                    &this->_threads_per_atom,  &this->_aewald, &this->_felec,
                    &this->_off2_polar, &_polar_dscale, &_polar_uscale);
  this->time_pair.stop();

--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -225,20 +225,20 @@ _texture( q_tex,int2);
 ------------------------------------------------------------------------- */

 __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
-                            const __global numtyp *restrict extra,
-                            const __global numtyp4 *restrict damping,
-                            const __global numtyp4 *restrict sp_polar,
-                            const __global int *dev_nbor,
-                            const __global int *dev_packed,
-                            const __global int *dev_short_nbor,
-                            __global acctyp4 *restrict ans,
-                            __global acctyp *restrict engv,
-                            __global numtyp4 *restrict tep,
-                            const int eflag, const int vflag, const int inum,
-                            const int nall, const int nbor_pitch, const int t_per_atom,
-                            const numtyp aewald, const numtyp felec,
-                            const numtyp off2, const numtyp polar_dscale,
-                            const numtyp polar_uscale)
+                                const __global numtyp *restrict extra,
+                                const __global numtyp4 *restrict damping,
+                                const __global numtyp4 *restrict sp_polar,
+                                const __global int *dev_nbor,
+                                const __global int *dev_packed,
+                                const __global int *dev_short_nbor,
+                                __global acctyp4 *restrict ans,
+                                __global acctyp *restrict engv,
+                                __global numtyp4 *restrict tep,
+                                const int eflag, const int vflag, const int inum,
+                                const int nall, const int nbor_pitch, const int t_per_atom,
+                                const numtyp aewald, const numtyp felec,
+                                const numtyp off2, const numtyp polar_dscale,
+                                const numtyp polar_uscale)
 {
  int tid, ii, offset, i;
  atom_info(t_per_atom,ii,tid,offset);
@ -257,7 +257,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
  }

  acctyp4 tq;
-  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0; tq.w=(acctyp)0; 
+  tq.x=(acctyp)0; tq.y=(acctyp)0; tq.z=(acctyp)0;

  numtyp dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
  numtyp4* polar1 = (numtyp4*)(&extra[0]);
@ -272,7 +272,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    numtyp term1,term2,term3;
    numtyp term4,term5;
    numtyp term6,term7;
-    numtyp rc3[3],rc5[3],rc7[3];
    numtyp bn[6];
    numtyp ci,uix,uiy,uiz,uixp,uiyp,uizp;

@ -309,9 +308,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    // debug:
    // xi__ = ix; xi__.w = itype;

-    numtyp pdi = damping[itype].x;
-    numtyp pti = damping[itype].y;
-
    for ( ; nbor<nbor_end; nbor+=n_stride) {

      int jextra=nbor_mem[nbor];
@ -326,10 +322,9 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp zr = jx.z - ix.z;
      numtyp r2 = xr*xr + yr*yr + zr*zr;

-      //if (r2>off2) continue;
+      if (r2>off2) continue;
  
      numtyp r = ucl_sqrt(r2);
-      
      numtyp ck = polar1[j].x;   // rpole[j][0];
      numtyp dkx = polar1[j].y;  // rpole[j][1];
      numtyp dky = polar1[j].z;  // rpole[j][2];
@ -363,7 +358,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      numtyp qik = qix*qkx + qiy*qky + qiz*qkz;
      numtyp diqk = dix*qkx + diy*qky + diz*qkz;
      numtyp dkqi = dkx*qix + dky*qiy + dkz*qiz;
-      numtyp qiqk = (numtyp )2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
+      numtyp qiqk = (numtyp)2.0*(qixy*qkxy+qixz*qkxz+qiyz*qkyz) + 
        qixx*qkxx + qiyy*qkyy + qizz*qkzz;

      // additional intermediates involving moments and distance
@ -452,8 +447,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
      term4 = dir*qkr - dkr*qir - 4.0*qik;
      term5 = qir*qkr;
-
-      numtyp scalek = 1.0 - factor_mpole;
+      numtyp scalek = (numtyp)1.0 - factor_mpole;
      rr1 = bn[0] - scalek*rr1;
      rr3 = bn[1] - scalek*rr3;
      rr5 = bn[2] - scalek*rr5;
@ -485,11 +479,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,

      // compute the torque components for this interaction

-      numtyp tix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
+      numtyp ttmix = -rr3*dikx + term1*dirx + term3*(dqikx+dkqirx) - 
        term4*qirx - term6*(qikrx+qikx);
-      numtyp tiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - 
+      numtyp ttmiy = -rr3*diky + term1*diry + term3*(dqiky+dkqiry) - 
        term4*qiry - term6*(qikry+qiky);
-      numtyp tiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
+      numtyp ttmiz = -rr3*dikz + term1*dirz + term3*(dqikz+dkqirz) - 
        term4*qirz - term6*(qikrz+qikz);

      // increment force-based gradient and torque on first site
@ -497,16 +491,16 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
      f.x += frcx;
      f.y += frcy;
      f.z += frcz;
-      tq.x += tix;
-      tq.y += tiy;
-      tq.z += tiz;
+      tq.x += ttmix;
+      tq.y += ttmiy;
+      tq.z += ttmiz;

      if (EVFLAG && vflag) {
        numtyp vxx = -xr * frcx;
-        numtyp vxy = (numtyp )-0.5 * (yr*frcx+xr*frcy);
-        numtyp vxz = (numtyp )-0.5 * (zr*frcx+xr*frcz);
+        numtyp vxy = (numtyp)-0.5 * (yr*frcx+xr*frcy);
+        numtyp vxz = (numtyp)-0.5 * (zr*frcx+xr*frcz);
        numtyp vyy = -yr * frcy;
-        numtyp vyz = (numtyp )-0.5 * (zr*frcy+yr*frcz);
+        numtyp vyz = (numtyp)-0.5 * (zr*frcy+yr*frcz);
        numtyp vzz = -zr * frcz;

        virial[0] += vxx;
@ -520,7 +514,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
    
  } // ii<inum

-  // accumulate ufld and dufld to compute tep
+  // accumulate tq
  store_answers_amoeba_tq(tq,ii,inum,tid,t_per_atom,offset,i,tep);
  
  // accumate force, energy and virial
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@ -46,8 +46,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
           const int nlocal, const int nall, const int max_nbors,
           const int maxspecial, const int maxspecial15, const double cell_size,
           const double gpu_split, FILE *_screen,
-           const double aewald, const double polar_dscale,
-           const double polar_uscale);
+           const double polar_dscale, const double polar_uscale);

  /// Clear all host and device data
  /** \note This is called at the beginning of the init() routine **/
@ -75,7 +74,7 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
  /// Number of atom types
  int _lj_types;

-  numtyp _aewald, _polar_dscale, _polar_uscale;
+  numtyp _polar_dscale, _polar_uscale;
  numtyp _qqrd2e;

 protected:
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@ -37,8 +37,8 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
-                    const double aewald, const double polar_dscale,
-                    const double polar_uscale, int& tep_size) {
+                    const double polar_dscale, const double polar_uscale,
+                    int& tep_size) {
  AMOEBAMF.clear();
  gpu_mode=AMOEBAMF.device->gpu_mode();
  double gpu_split=AMOEBAMF.device->particle_split();
@ -67,7 +67,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                          host_special_mpole, host_special_polar_wscale,
                          host_special_polar_piscale, host_special_polar_pscale,
                          nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                          cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
+                          cell_size, gpu_split, screen, polar_dscale, polar_uscale);

  AMOEBAMF.device->world_barrier();
  if (message)
@ -87,7 +87,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                            host_special_mpole, host_special_polar_wscale,
                            host_special_polar_piscale, host_special_polar_pscale,
                            nlocal, nall, max_nbors, maxspecial, maxspecial15,
-                            cell_size, gpu_split, screen, aewald, polar_dscale, polar_uscale);
+                            cell_size, gpu_split, screen, polar_dscale, polar_uscale);

    AMOEBAMF.device->gpu_barrier();
    if (message)
@ -113,13 +113,13 @@ int** amoeba_gpu_compute_multipole_real(const int ago, const int inum_full,
                           const bool eflag, const bool vflag, const bool eatom,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double felec, const double off2,
+                           bool &success, const double aewald, const double felec, const double off2,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  return AMOEBAMF.compute_multipole_real(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, sublo, subhi,
                          tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                          cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr);
+                          cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
 }

 int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
@ -131,13 +131,13 @@ int** amoeba_gpu_compute_udirect2b(const int ago, const int inum_full,
                           const bool eflag, const bool vflag, const bool eatom,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double off2, double *host_q,
+                           bool &success,  const double aewald, const double off2, double *host_q,
                           double *boxlo, double *prd, void **fieldp_ptr) {
  return AMOEBAMF.compute_udirect2b(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
                          sublo, subhi, tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                          cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr);
+                          cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
 }

 int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
@ -149,13 +149,13 @@ int** amoeba_gpu_compute_umutual2b(const int ago, const int inum_full,
                           const bool eflag, const bool vflag,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double off2, double *host_q,
+                           bool &success, const double aewald, const double off2, double *host_q,
                           double *boxlo, double *prd, void **fieldp_ptr) {
  return AMOEBAMF.compute_umutual2b(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
                          sublo, subhi, tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                          cpu_time, success, off2, host_q, boxlo, prd, fieldp_ptr);
+                          cpu_time, success, aewald, off2, host_q, boxlo, prd, fieldp_ptr);
 }

 int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
@ -167,13 +167,13 @@ int** amoeba_gpu_compute_polar_real(const int ago, const int inum_full,
                           const bool eflag, const bool vflag, const bool eatom,
                           const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double felec, const double off2,
+                           bool &success, const double aewald, const double felec, const double off2,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  return AMOEBAMF.compute_polar_real(ago, inum_full, nall, host_x, host_type,
                          host_amtype, host_amgroup, host_rpole, host_uind, host_uinp,
                          sublo, subhi, tag, nspecial, special, nspecial15, special15,
                          eflag, vflag, eatom, vatom, host_start, ilist, jnum,
-                          cpu_time, success, felec, off2, host_q, boxlo, prd, tep_ptr);
+                          cpu_time, success, aewald, felec, off2, host_q, boxlo, prd, tep_ptr);
 }

 double amoeba_gpu_bytes() {
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -252,8 +252,8 @@ void BaseAmoebaT::compute_polar_real_host_nbor(const int f_ago, const int inum_f
                          const bool eflag_in, const bool vflag_in,
                          const bool eatom, const bool vatom,
                          int &host_start, const double cpu_time,
-                          bool &success, const double off2_polar, const double felec,
-                          double *host_q, const int nlocal,
+                          bool &success, const double aewald, const double felec, 
+                          const double off2_polar, double *host_q, const int nlocal,
                          double *boxlo, double *prd, void **tep_ptr) {
  acc_timers();
  int eflag, vflag;
@ -440,7 +440,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double felec, const double off2_mpole,
+                           bool &success, const double aewald, const double felec, const double off2_mpole,
                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
  acc_timers();
  int eflag, vflag;
@ -488,6 +488,7 @@ int** BaseAmoebaT::compute_multipole_real(const int ago, const int inum_full, co

  _off2_mpole = off2_mpole;
  _felec = felec;
+  _aewald = aewald;
  const int red_blocks=multipole_real(eflag,vflag);
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);
@ -521,8 +522,8 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double off2_polar, double *host_q,
-                           double *boxlo, double *prd, void** fieldp_ptr) {
+                           bool &success, const double aewald, const double off2_polar,
+                           double *host_q, double *boxlo, double *prd, void** fieldp_ptr) {
  acc_timers();
  int eflag, vflag;
  if (eatom) eflag=2;
@ -560,6 +561,7 @@ int** BaseAmoebaT::compute_udirect2b(const int ago, const int inum_full, const i
  *fieldp_ptr=_fieldp.host.begin();

  _off2_polar = off2_polar;
+  _aewald = aewald;
  const int red_blocks=udirect2b(eflag,vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
@ -591,8 +593,8 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double off2_polar, double *host_q,
-                           double *boxlo, double *prd, void** fieldp_ptr) {
+                           bool &success, const double aewald, const double off2_polar,
+                           double *host_q, double *boxlo, double *prd, void** fieldp_ptr) {
  acc_timers();
  int eflag, vflag;
  if (eatom) eflag=2;
@ -630,6 +632,7 @@ int** BaseAmoebaT::compute_umutual2b(const int ago, const int inum_full, const i
  *fieldp_ptr=_fieldp.host.begin();

  _off2_polar = off2_polar;
+  _aewald = aewald;
  const int red_blocks=umutual2b(eflag,vflag);

  // copy field and fieldp from device to host (_fieldp store both arrays, one after another)
@ -660,8 +663,9 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const
                           const bool eflag_in, const bool vflag_in,
                           const bool eatom, const bool vatom, int &host_start,
                           int **ilist, int **jnum, const double cpu_time,
-                           bool &success, const double felec, const double off2_polar,
-                           double *host_q, double *boxlo, double *prd, void **tep_ptr) {
+                           bool &success, const double aewald, const double felec,
+                           const double off2_polar, double *host_q, double *boxlo,
+                           double *prd, void **tep_ptr) {
  acc_timers();
  int eflag, vflag;
  if (eatom) eflag=2;
@ -708,6 +712,7 @@ int** BaseAmoebaT::compute_polar_real(const int ago, const int inum_full, const

  _off2_polar = off2_polar;
  _felec = felec;
+  _aewald = aewald;
  const int red_blocks=polar_real(eflag,vflag);
  ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
  device->add_ans_object(ans);
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -151,7 +151,7 @@ class BaseAmoeba {
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                const double felec, const double off2_mpole, double *charge,
+                const double aewald, const double felec, const double off2_mpole, double *charge,
                double *boxlo, double *prd, void **tep_ptr);

  /// Compute the real space part of the permanent field (udirect2b) with device neighboring
@ -165,7 +165,8 @@ class BaseAmoeba {
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr);
+                const double aewald, const double off2_polar, double *charge,
+                double *boxlo, double *prd, void **fieldp_ptr);

  /// Compute the real space part of the induced field (umutual2b) with device neighboring
  int** compute_umutual2b(const int ago, const int inum_full, const int nall,
@ -178,7 +179,8 @@ class BaseAmoeba {
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                const double off2_polar, double *charge, double *boxlo, double *prd, void **fieldp_ptr);
+                const double aewald, const double off2_polar, double *charge,
+                double *boxlo, double *prd, void **fieldp_ptr);

  /// Compute polar real-space with device neighboring
  int** compute_polar_real(const int ago, const int inum_full, const int nall,
@ -190,7 +192,7 @@ class BaseAmoeba {
                const bool eflag, const bool vflag,
                const bool eatom, const bool vatom, int &host_start,
                int **ilist, int **numj, const double cpu_time, bool &success,
-                const double felec, const double off2_polar, double *charge,
+                const double aewald, const double felec, const double off2_polar, double *charge,
                double *boxlo, double *prd, void **tep_ptr);

  /// Compute polar real-space with host neighboring (not active for now)
@ -200,7 +202,7 @@ class BaseAmoeba {
               double **host_uinp, int *ilist, int *numj,
               int **firstneigh, const bool eflag, const bool vflag,
               const bool eatom, const bool vatom, int &host_start,
-               const double cpu_time, bool &success, const double felec, const double off2_polar,
+               const double cpu_time, bool &success, const double aewald, const double felec, const double off2_polar,
               double *charge, const int nlocal, double *boxlo, double *prd, void **tep_ptr);

  // -------------------------- DEVICE DATA -------------------------
@ -272,7 +274,8 @@ class BaseAmoeba {
  bool short_nbor_avail;
  UCL_D_Vec<int> *_nbor_data;

-  numtyp _felec,_off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar;
+  numtyp _aewald,_felec;
+  numtyp _off2_hal,_off2_repulse,_off2_dispersion,_off2_mpole,_off2_polar;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
     const char *kname_multipole, const char *kname_udirect2b,
--- a/src/AMOEBA/amoeba_multipole.cpp
+++ b/src/AMOEBA/amoeba_multipole.cpp
@ -369,6 +369,9 @@ void PairAmoeba::multipole_real()
        bn[k] = (bfac*bn[k-1]+alsq2n*exp2a) / r2;
      }
      for (k = 0; k < 6; k++) bn[k] *= felec;
+      //if (i == 0 && j < 10) {
+      //  printf("j = %d: aewald = %f; rr1 = %f; bn: %f %f %f %f %f %f\n", j, aewald, rr1, bn[0], bn[1], bn[2], bn[3], bn[4], bn[5]);
+      //}

      // find damped multipole intermediates and energy value

@ -447,6 +450,10 @@ void PairAmoeba::multipole_real()
        rr9 = bn[4] - scalek*rr9;
        rr11 = bn[5] - scalek*rr11;
        e = term1*rr1 + term2*rr3 + term3*rr5 + term4*rr7 + term5*rr9;
+        if (i == 0  && j < 10) {
+          //printf("j = %d: scalek = %f; rr11 = %f; terms: %f %f %f %f %f\n", j, scalek, rr11, term1, term2, term3, term4, term5);
+          //printf("j = %d: felec = %f; rr1 = %f; bn0 = %f\n", j, felec, rr1, bn[0]);
+        }

        // find standard multipole intermediates for force and torque

@ -457,6 +464,7 @@ void PairAmoeba::multipole_real()
        term4 = 2.0 * (-ck*rr5+dkr*rr7-qkr*rr9);
        term5 = 2.0 * (-ci*rr5-dir*rr7-qir*rr9);
        term6 = 4.0 * rr7;
+        
      }

      empole += e;
@ -515,16 +523,20 @@ void PairAmoeba::multipole_real()
      tq[i][0] += ttmi[0];
      tq[i][1] += ttmi[1];
      tq[i][2] += ttmi[2];
-
+      //if (i == 0  && j < 10) {
+      //  printf("j = %d: erfc = %f; f: %f %f %f; tq =  %f %f %f\n", j, erfc(ralpha), frcx, frcy, frcz, ttmi[0], ttmi[1], ttmi[2]);
+        //printf("j = %d: terms: %f %f %f; tq =  %f %f %f\n", j, term1, term2, term3, qikrx, qikry, qikrz);
+      //}
      // increment force-based gradient and torque on second site
-
+      // commenting out j parts for DEBUGGING
+      
      fmpole[j][0] -= frcx;
      fmpole[j][1] -= frcy;
      fmpole[j][2] -= frcz;
      tq[j][0] += ttmk[0];
      tq[j][1] += ttmk[1];
      tq[j][2] += ttmk[2];
-
+      
      // increment the virial due to pairwise Cartesian forces

      vxx = -xr * frcx;
@ -556,10 +568,11 @@ void PairAmoeba::multipole_real()
  comm->reverse_comm_pair(this);

  // resolve site torques then increment forces and virial
-
+  printf("compute multipole real\n");
  for (i = 0; i < nlocal; i++) {
-    torque2force(i,tq[i],fix,fiy,fiz,fmpole);  
-
+    if (i == 0) printf("before fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]);
+    torque2force(i,tq[i],fix,fiy,fiz,fmpole);
+    if (i == 0) printf("after fmpole = %f %f %f\n", fmpole[i][0], fmpole[i][1], fmpole[i][2]);
    iz = zaxis2local[i];
    ix = xaxis2local[i];
    iy = yaxis2local[i];
@ -575,15 +588,16 @@ void PairAmoeba::multipole_real()
    ziy = x[iy][2] - x[i][2];

    vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0];
+    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
+    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + 
                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + 
                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
-    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
-    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];

+    //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
    virmpole[0] += vxx;
    virmpole[1] += vyy;
    virmpole[2] += vzz;
--- a/src/AMOEBA/amoeba_polar.cpp
+++ b/src/AMOEBA/amoeba_polar.cpp
@ -1176,7 +1176,7 @@ void PairAmoeba::polar_real()
  comm->reverse_comm_pair(this);

  // torque is induced field and gradient cross permanent moments
-  
+  printf("compute polar real\n");
  for (i = 0; i < nlocal; i++) {
    dix = rpole[i][1];
    diy = rpole[i][2];
@ -1197,8 +1197,10 @@ void PairAmoeba::polar_real()
      qiyz*dufld[i][3] - qixz*dufld[i][4] + 
      2.0*qixy*(dufld[i][0]-dufld[i][2]) + (qiyy-qixx)*dufld[i][1];

+    if (i == 0) printf("before fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]);
    torque2force(i,tep,fix,fiy,fiz,fpolar);
-    
+    if (i == 0) printf("after fpolar = %f %f %f\n", fpolar[i][0], fpolar[i][1], fpolar[i][2]);
+
    iz = zaxis2local[i];
    ix = xaxis2local[i];
    iy = yaxis2local[i];
@ -1222,7 +1224,7 @@ void PairAmoeba::polar_real()
                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
-
+    //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
    virpolar[0] += vxx;
    virpolar[1] += vyy;
    virpolar[2] += vzz;
--- a/src/AMOEBA/pair_amoeba.cpp
+++ b/src/AMOEBA/pair_amoeba.cpp
@ -972,6 +972,9 @@ void PairAmoeba::init_style()
  // request neighbor lists

  int irequest = neighbor->request(this,instance_me);
+  // for DEBUGGING with GPU
+  //neighbor->requests[irequest]->half = 0;
+  //neighbor->requests[irequest]->full = 1;

  // open debug output files
  // names are hard-coded
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@ -60,8 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                    const int nlocal, const int nall, const int max_nbors,
                    const int maxspecial, const int maxspecial15,
                    const double cell_size, int &gpu_mode, FILE *screen,
-                    const double aewald, const double polar_dscale,
-                    const double polar_uscale, int& tep_size);
+                    const double polar_dscale, const double polar_uscale, int& tep_size);
 void amoeba_gpu_clear();

 int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
@ -70,8 +69,8 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in
              int **nspecial, tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
-              bool &success, const double felec, const double off2, double *host_q,
-              double *boxlo, double *prd, void **tep_ptr);
+              bool &success, const double aewald, const double felec, const double off2,
+              double *host_q, double *boxlo, double *prd, void **tep_ptr);

 int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
              double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
@ -80,7 +79,7 @@ int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nal
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
-              bool &success, const double off2, double *host_q,
+              bool &success, const double aewald, const double off2, double *host_q,
              double *boxlo, double *prd, void **fieldp_ptr);

 int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nall,
@ -90,7 +89,7 @@ int ** amoeba_gpu_compute_umutual2b(const int ago, const int inum, const int nal
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
-              bool &success, const double off2, double *host_q,
+              bool &success, const double aewald, const double off2, double *host_q,
              double *boxlo, double *prd, void **fieldp_ptr);

 int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int nall,
@ -100,8 +99,8 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
              tagint **special, int* nspecial15, tagint** special15,
              const bool eflag, const bool vflag, const bool eatom, const bool vatom,
              int &host_start, int **ilist, int **jnum, const double cpu_time,
-              bool &success, const double felec, const double off2, double *host_q,
-              double *boxlo, double *prd, void **tep_ptr);
+              bool &success, const double aewald, const double felec, const double off2,
+              double *host_q, double *boxlo, double *prd, void **tep_ptr);

 double amoeba_gpu_bytes();

@ -119,7 +118,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
  gpu_multipole_real_ready = true;
  gpu_udirect2b_ready = true;
  gpu_umutual2b_ready = true;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = false;

  GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@ -174,7 +173,7 @@ void PairAmoebaGPU::init_style()
                                special_polar_pscale, atom->nlocal,
                                atom->nlocal+atom->nghost, mnf, maxspecial,
                                maxspecial15, cell_size, gpu_mode, screen,
-                                aewald, polar_dscale, polar_uscale, tep_size);
+                                polar_dscale, polar_uscale, tep_size);
  GPU_EXTRA::check_flag(success,error,world);

  if (gpu_mode == GPU_FORCE)
@ -231,14 +230,14 @@ void PairAmoebaGPU::multipole_real()
                                                 atom->nspecial15, atom->special15,
                                                 eflag, vflag, eflag_atom, vflag_atom,
                                                 host_start, &ilist, &numneigh, cpu_time,
-                                                 success, felec, off2, atom->q, domain->boxlo,
-                                                 domain->prd, &tep_pinned);
+                                                 success, aewald, felec, off2, atom->q,
+                                                 domain->boxlo, domain->prd, &tep_pinned);
  
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

  // reference to the tep array from GPU lib
-
+  printf("compute multipole real\n");
  if (tep_single) {
    float *tep_ptr = (float *)tep_pinned;
    compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
@ -727,7 +726,7 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
                                        atom->nspecial15, atom->special15,
                                        eflag, vflag, eflag_atom, vflag_atom,
                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, off2, atom->q, domain->boxlo,
+                                        success, aewald, off2, atom->q, domain->boxlo,
                                        domain->prd, &fieldp_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
@ -951,7 +950,7 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
                                        atom->nspecial15, atom->special15,
                                        eflag, vflag, eflag_atom, vflag_atom,
                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, off2, atom->q, domain->boxlo,
+                                        success,aewald, off2, atom->q, domain->boxlo,
                                        domain->prd, &fieldp_pinned);
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");
@ -1008,7 +1007,7 @@ void PairAmoebaGPU::polar_real()
  }
  inum = atom->nlocal;

-  // select the correct cutoff for the term
+  // select the correct cutoff and aewald for the term

  if (use_ewald) choose(POLAR_LONG);
  else choose(POLAR);
@ -1024,14 +1023,14 @@ void PairAmoebaGPU::polar_real()
                                        atom->nspecial15, atom->special15,
                                        eflag, vflag, eflag_atom, vflag_atom,
                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, felec, off2, atom->q, domain->boxlo,
+                                        success, aewald, felec, off2, atom->q, domain->boxlo,
                                        domain->prd, &tep_pinned);
  
  if (!success)
    error->one(FLERR,"Insufficient memory on accelerator");

  // reference to the tep array from GPU lib
-
+  printf("compute polar real\n");
  if (tep_single) {
    float *tep_ptr = (float *)tep_pinned;
    compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
@ -1066,7 +1065,9 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
    tep[1] = tep_ptr[4*i+1];
    tep[2] = tep_ptr[4*i+2];

+    if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); // DEBUG: force_comp[0] prior to the torque2force call
     torque2force(i,tep,fix,fiy,fiz,force_comp);
+    if (i == 0) printf("after fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]); // DEBUG: force_comp[0] following the torque2force call

    iz = zaxis2local[i];
    ix = xaxis2local[i];
@ -1086,12 +1087,12 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
    vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1];
    vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2];
    vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + 
-                xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
+                 xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]);
    vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + 
-                xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
+                 xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
    vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
-                yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
-
+                 yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
+    //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
    virial_comp[0] += vxx;
    virial_comp[1] += vyy;
    virial_comp[2] += vzz;