diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 49c0d78d7f..41185f30e3 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -62,7 +62,7 @@ _texture( q_tex,int2);
     tq.z=red_acc[2][tid];                                                   \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    tep[i]=tq;                                                               \
+    tep[i]=tq;                                                              \
   }
 
 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
@@ -147,14 +147,14 @@ _texture( q_tex,int2);
     fieldp[ii+inum] = fp;                                                   \
   }
 
-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom  \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1) {                                                       \
     simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z);      \
     if (EVFLAG && (vflag==2 || eflag==2)) {                                 \
       if (eflag) {                                                          \
         simdsync();                                                         \
-        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul);         \
+        simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
       }                                                                     \
       if (vflag) {                                                          \
         simdsync();                                                         \
@@ -174,7 +174,7 @@ _texture( q_tex,int2);
     if (eflag!=2 && vflag!=2) {                                             \
       if (eflag) {                                                          \
         simdsync();                                                         \
-        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);               \
+        block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul);       \
         if (vflag) __syncthreads();                                         \
         if (tid==0) {                                                       \
           engv[ei]+=energy*(acctyp)0.5;                                     \
@@ -225,7 +225,7 @@ _texture( q_tex,int2);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    tep[i]=tq;                                                               \
+    tep[i]=tq;                                                              \
   }
 
 #define store_answers_tep(ufld, dufld, ii, inum,tid, t_per_atom, offset,    \
@@ -280,25 +280,23 @@ _texture( q_tex,int2);
 
 #if (EVFLAG == 1)
 
-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1) {                                                       \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
     if (vflag==2 || eflag==2) {                                             \
       if (eflag)                                                            \
-        simd_reduce_add2(t_per_atom,energy,e_coul);                                \
+        simd_reduce_add2(t_per_atom,energy,e_coul);                         \
       if (vflag)                                                            \
         simd_reduce_arr(6, t_per_atom,virial);                              \
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
     acctyp4 old=ans[ii];                                                    \
-    if (ii == 0) printf("old = %f %f %f\n", old.x, old.y, old.z);           \
     old.x+=f.x;                                                             \
     old.y+=f.y;                                                             \
     old.z+=f.z;                                                             \
     ans[ii]=old;                                                            \
-    if (ii == 0) printf("new = %f %f %f\n", old.x, old.y, old.z);           \
   }                                                                         \
   if (eflag || vflag) {                                                     \
     if (eflag!=2 && vflag!=2) {                                             \
@@ -378,7 +376,7 @@ _texture( q_tex,int2);
 // EVFLAG == 0
 #else
 
-#define store_answers_p(f, energy, e_coul, virial, ii, inum, tid, t_per_atom,       \
+#define store_answers_p(f,energy,e_coul, virial, ii, inum, tid, t_per_atom, \
                         offset, eflag, vflag, ans, engv, ev_stride)         \
   if (t_per_atom>1)                                                         \
     simd_reduce_add3(t_per_atom, f.x, f.y, f.z);                            \
@@ -402,20 +400,20 @@ _texture( q_tex,int2);
 ------------------------------------------------------------------------- */
 
 __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
-                                const __global numtyp *restrict extra,
-                                const __global numtyp4 *restrict damping,
-                                const __global numtyp4 *restrict sp_polar,
-                                const __global int *dev_nbor,
-                                const __global int *dev_packed,
-                                const __global int *dev_short_nbor,
-                                __global acctyp4 *restrict ans,
-                                __global acctyp *restrict engv,
-                                __global numtyp4 *restrict tep,
-                                const int eflag, const int vflag, const int inum,
-                                const int nall, const int nbor_pitch, const int t_per_atom,
-                                const numtyp aewald, const numtyp felec,
-                                const numtyp off2, const numtyp polar_dscale,
-                                const numtyp polar_uscale)
+                                 const __global numtyp *restrict extra,
+                                 const __global numtyp4 *restrict damping,
+                                 const __global numtyp4 *restrict sp_polar,
+                                 const __global int *dev_nbor,
+                                 const __global int *dev_packed,
+                                 const __global int *dev_short_nbor,
+                                 __global acctyp4 *restrict ans,
+                                 __global acctyp *restrict engv,
+                                 __global numtyp4 *restrict tep,
+                                 const int eflag, const int vflag, const int inum,
+                                 const int nall, const int nbor_pitch,
+                                 const int t_per_atom, const numtyp aewald,
+                                 const numtyp felec, const numtyp off2,
+                                 const numtyp polar_dscale, const numtyp polar_uscale)
 {
   int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
@@ -439,14 +437,11 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
   numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
   numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
 
-  //numtyp4 xi__;
-
   if (ii<inum) {
-    int k,m,itype,igroup;
+    int m,itype,igroup;
     numtyp bfac;
     numtyp term1,term2,term3;
-    numtyp term4,term5;
-    numtyp term6;
+    numtyp term4,term5,term6;
     numtyp bn[6];
     numtyp ci,dix,diy,diz,qixx,qixy,qixz,qiyy,qiyz,qizz;
 
@@ -480,9 +475,6 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
     itype  = polar3[i].z; // amtype[i];
     igroup = polar3[i].w; // amgroup[i];
 
-    // debug:
-    // xi__ = ix; xi__.w = itype;
-
     for ( ; nbor<nbor_end; nbor+=n_stride) {
 
       int jextra=nbor_mem[nbor];
@@ -497,7 +489,7 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       numtyp zr = jx.z - ix.z;
       numtyp r2 = xr*xr + yr*yr + zr*zr;
 
-      if (r2>off2) continue;
+      //if (r2>off2) continue;
   
       numtyp r = ucl_sqrt(r2);
       numtyp ck = polar1[j].x;   // rpole[j][0];
@@ -613,14 +605,14 @@ __kernel void k_amoeba_multipole(const __global numtyp4 *restrict x_,
       for (m = 1; m < 6; m++) {
         bfac = (numtyp) (m+m-1);
         alsq2n = alsq2 * alsq2n;
-        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
       }
       for (m = 0; m < 6; m++) bn[m] *= felec;
 
       term1 = ci*ck;
       term2 = ck*dir - ci*dkr + dik;
-      term3 = ci*qkr + ck*qir - dir*dkr + 2.0*(dkqi-diqk+qiqk);
-      term4 = dir*qkr - dkr*qir - 4.0*qik;
+      term3 = ci*qkr + ck*qir - dir*dkr + (numtyp)2.0*(dkqi-diqk+qiqk);
+      term4 = dir*qkr - dkr*qir - (numtyp)4.0*qik;
       term5 = qir*qkr;
       numtyp scalek = (numtyp)1.0 - factor_mpole;
       rr1 = bn[0] - scalek*rr1;
@@ -730,8 +722,6 @@ __kernel void k_amoeba_udirect2b(const __global numtyp4 *restrict x_,
   numtyp4* polar2 = (numtyp4*)(&extra[4*nall]);
   numtyp4* polar3 = (numtyp4*)(&extra[8*nall]);
 
-  //numtyp4 xi__;
-
   if (ii<inum) {
     int numj, nbor, nbor_end;
     const __global int* nbor_mem=dev_packed;
@@ -1337,7 +1327,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
       for (m = 1; m <= 4; m++) {
         bfac = (numtyp) (m+m-1);
         alsq2n = alsq2 * alsq2n;
-        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) / r2;
+        bn[m] = (bfac*bn[m-1]+alsq2n*exp2a) * r2inv;
       }
       for (m = 0; m < 5; m++) bn[m] *= felec;
 
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index dcf7d95047..f932f05e25 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -60,7 +60,7 @@ int amoeba_gpu_init(const int ntypes, const int max_amtype,
                     const int nlocal, const int nall, const int max_nbors,
                     const int maxspecial, const int maxspecial15,
                     const double cell_size, int &gpu_mode, FILE *screen,
-                    const double polar_dscale, const double polar_uscale, int& tep_size);
+                    const double polar_dscale, const double polar_uscale, int& tq_size);
 void amoeba_gpu_clear();
 
 int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall,
@@ -70,7 +70,7 @@ int ** amoeba_gpu_compute_multipole_real(const int ago, const int inum, const in
               const bool eflag, const bool vflag, const bool eatom, const bool vatom,
               int &host_start, int **ilist, int **jnum, const double cpu_time,
               bool &success, const double aewald, const double felec, const double off2,
-              double *host_q, double *boxlo, double *prd, void **tep_ptr);
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);
 
 int ** amoeba_gpu_compute_udirect2b(const int ago, const int inum, const int nall,
               double **host_x, int *host_type, int *host_amtype, int *host_amgroup,
@@ -100,7 +100,7 @@ int ** amoeba_gpu_compute_polar_real(const int ago, const int inum, const int na
               const bool eflag, const bool vflag, const bool eatom, const bool vatom,
               int &host_start, int **ilist, int **jnum, const double cpu_time,
               bool &success, const double aewald, const double felec, const double off2,
-              double *host_q, double *boxlo, double *prd, void **tep_ptr);
+              double *host_q, double *boxlo, double *prd, void **tq_ptr);
 
 double amoeba_gpu_bytes();
 
@@ -113,7 +113,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   cpu_time = 0.0;
   suffix_flag |= Suffix::GPU;
   fieldp_pinned = nullptr;
-  tep_pinned = nullptr;
+  tq_pinned = nullptr;
 
   gpu_multipole_real_ready = true;
   gpu_udirect2b_ready = true;
@@ -166,23 +166,23 @@ void PairAmoebaGPU::init_style()
     maxspecial15=atom->maxspecial15;
   }
     
-  int tep_size;
+  int tq_size;
   int mnf = 5e-2 * neighbor->oneatom;
   int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, pdamp, thole, dirdamp,
                                 special_mpole, special_polar_wscale, special_polar_piscale,
                                 special_polar_pscale, atom->nlocal,
                                 atom->nlocal+atom->nghost, mnf, maxspecial,
                                 maxspecial15, cell_size, gpu_mode, screen,
-                                polar_dscale, polar_uscale, tep_size);
+                                polar_dscale, polar_uscale, tq_size);
   GPU_EXTRA::check_flag(success,error,world);
 
   if (gpu_mode == GPU_FORCE)
     error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now");
 
-  if (tep_size == sizeof(double))
-    tep_single = false;
+  if (tq_size == sizeof(double))
+    tq_single = false;
   else
-    tep_single = true;
+    tq_single = true;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -231,19 +231,19 @@ void PairAmoebaGPU::multipole_real()
                                                  eflag, vflag, eflag_atom, vflag_atom,
                                                  host_start, &ilist, &numneigh, cpu_time,
                                                  success, aewald, felec, off2, atom->q,
-                                                 domain->boxlo, domain->prd, &tep_pinned);
+                                                 domain->boxlo, domain->prd, &tq_pinned);
   
   if (!success)
     error->one(FLERR,"Insufficient memory on accelerator");
 
   // reference to the tep array from GPU lib
-  printf("compute multipole real\n");
-  if (tep_single) {
-    float *tep_ptr = (float *)tep_pinned;
-    compute_force_from_tep<float>(tep_ptr, fmpole, virmpole);
+
+  if (tq_single) {
+    float *tq_ptr = (float *)tq_pinned;
+    compute_force_from_torque<float>(tq_ptr, fmpole, virmpole);
   } else {
-    double *tep_ptr = (double *)tep_pinned;
-    compute_force_from_tep<double>(tep_ptr, fmpole, virmpole);
+    double *tq_ptr = (double *)tq_pinned;
+    compute_force_from_torque<double>(tq_ptr, fmpole, virmpole);
   }
 }
 
@@ -681,7 +681,6 @@ void PairAmoebaGPU::induce()
   }
 }
 
-
 /* ----------------------------------------------------------------------
    udirect2b = Ewald real direct field via list
    udirect2b computes the real space contribution of the permanent
@@ -721,19 +720,20 @@ void PairAmoebaGPU::udirect2b(double **field, double **fieldp)
   else choose(POLAR);
 
   firstneigh = amoeba_gpu_compute_udirect2b(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup, rpole, uind, uinp,
-                                        sublo, subhi, atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, aewald, off2, atom->q, domain->boxlo,
-                                        domain->prd, &fieldp_pinned);
+                                            atom->type, amtype, amgroup, rpole,
+                                            uind, uinp, sublo, subhi, atom->tag,
+                                            atom->nspecial, atom->special,
+                                            atom->nspecial15, atom->special15,
+                                            eflag, vflag, eflag_atom, vflag_atom,
+                                            host_start, &ilist, &numneigh, cpu_time,
+                                            success, aewald, off2, atom->q,
+                                            domain->boxlo, domain->prd, &fieldp_pinned);
   if (!success)
     error->one(FLERR,"Insufficient memory on accelerator");
 
   // rebuild dipole-dipole pair list and store pairwise dipole matrices
   // done one atom at a time in real-space double loop over atoms & neighs
-
+  // NOTE: for the moment the tdipdip values are computed just in time in umutual2b()
   // udirect2b_cpu();
 
   // accumulate the field and fieldp values from the GPU lib
@@ -945,13 +945,14 @@ void PairAmoebaGPU::umutual2b(double **field, double **fieldp)
   else choose(POLAR);
 
   firstneigh = amoeba_gpu_compute_umutual2b(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup, rpole, uind, uinp,
-                                        sublo, subhi, atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success,aewald, off2, atom->q, domain->boxlo,
-                                        domain->prd, &fieldp_pinned);
+                                            atom->type, amtype, amgroup, rpole,
+                                            uind, uinp, sublo, subhi, atom->tag,
+                                            atom->nspecial, atom->special,
+                                            atom->nspecial15, atom->special15,
+                                            eflag, vflag, eflag_atom, vflag_atom,
+                                            host_start, &ilist, &numneigh, cpu_time,
+                                            success,aewald, off2, atom->q,
+                                            domain->boxlo, domain->prd, &fieldp_pinned);
   if (!success)
     error->one(FLERR,"Insufficient memory on accelerator");
 
@@ -1017,37 +1018,37 @@ void PairAmoebaGPU::polar_real()
   double felec = 0.5 * electric / am_dielectric;
 
   firstneigh = amoeba_gpu_compute_polar_real(neighbor->ago, inum, nall, atom->x,
-                                        atom->type, amtype, amgroup,
-                                        rpole, uind, uinp, sublo, subhi,
-                                        atom->tag, atom->nspecial, atom->special,
-                                        atom->nspecial15, atom->special15,
-                                        eflag, vflag, eflag_atom, vflag_atom,
-                                        host_start, &ilist, &numneigh, cpu_time,
-                                        success, aewald, felec, off2, atom->q, domain->boxlo,
-                                        domain->prd, &tep_pinned);
+                                             atom->type, amtype, amgroup,
+                                             rpole, uind, uinp, sublo, subhi,
+                                             atom->tag, atom->nspecial, atom->special,
+                                             atom->nspecial15, atom->special15,
+                                             eflag, vflag, eflag_atom, vflag_atom,
+                                             host_start, &ilist, &numneigh, cpu_time,
+                                             success, aewald, felec, off2, atom->q,
+                                             domain->boxlo, domain->prd, &tq_pinned);
   
   if (!success)
     error->one(FLERR,"Insufficient memory on accelerator");
 
   // reference to the tep array from GPU lib
-  printf("compute polar real\n");
-  if (tep_single) {
-    float *tep_ptr = (float *)tep_pinned;
-    compute_force_from_tep<float>(tep_ptr, fpolar, virpolar);
+
+  if (tq_single) {
+    float *tep_ptr = (float *)tq_pinned;
+    compute_force_from_torque<float>(tep_ptr, fpolar, virpolar);
   } else {
-    double *tep_ptr = (double *)tep_pinned;
-    compute_force_from_tep<double>(tep_ptr, fpolar, virpolar);
+    double *tep_ptr = (double *)tq_pinned;
+    compute_force_from_torque<double>(tep_ptr, fpolar, virpolar);
   }
 }
 
 /* ----------------------------------------------------------------------
-   init specific to this pair style
+   compute atom forces from torques
 ------------------------------------------------------------------------- */
 
 template <class numtyp>
-void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
-                                           double** force_comp,
-                                           double* virial_comp)
+void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr,
+                                              double** force_comp,
+                                              double* virial_comp)
 {
   int i,ix,iy,iz;
   double xix,yix,zix;
@@ -1055,19 +1056,16 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
   double xiz,yiz,ziz;
   double vxx,vyy,vzz;
   double vxy,vxz,vyz;
-  double fix[3],fiy[3],fiz[3],tep[4];
+  double fix[3],fiy[3],fiz[3],_tq[4];
 
   double** x = atom->x;
   int nlocal = atom->nlocal;
 
   for (i = 0; i < nlocal; i++) {
-    tep[0] = tep_ptr[4*i];
-    tep[1] = tep_ptr[4*i+1];
-    tep[2] = tep_ptr[4*i+2];
-
-    if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
-    torque2force(i,tep,fix,fiy,fiz,force_comp);
-    if (i == 0) printf("before fcomp = %f %f %f\n", force_comp[i][0], force_comp[i][1], force_comp[i][2]);
+    _tq[0] = tq_ptr[4*i];
+    _tq[1] = tq_ptr[4*i+1];
+    _tq[2] = tq_ptr[4*i+2];
+    torque2force(i,_tq,fix,fiy,fiz,force_comp);
 
     iz = zaxis2local[i];
     ix = xaxis2local[i];
@@ -1092,7 +1090,7 @@ void PairAmoebaGPU::compute_force_from_tep(const numtyp* tep_ptr,
                  xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]);
     vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + 
                  yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]);
-    //if (i < 10) printf("fix = %f %f %f; v %f %f %f %f %f %f\n", fix[0], fix[1], fix[2], vxx, vyy, vzz, vxy, vxz,vyz);
+
     virial_comp[0] += vxx;
     virial_comp[1] += vyy;
     virial_comp[2] += vzz;
diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h
index a913449a62..d9a3fc5904 100644
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@@ -43,9 +43,9 @@ class PairAmoebaGPU : public PairAmoeba {
  private:
   int gpu_mode;
   double cpu_time;
-  void *tep_pinned;
+  void *tq_pinned;
   void *fieldp_pinned;
-  bool tep_single;
+  bool tq_single;
 
   bool gpu_multipole_real_ready;
   bool gpu_udirect2b_ready;
@@ -55,7 +55,7 @@ class PairAmoebaGPU : public PairAmoeba {
   void udirect2b_cpu();
 
   template<class numtyp>
-  void compute_force_from_tep(const numtyp*, double**, double*);
+  void compute_force_from_torque(const numtyp*, double**, double*);
 };
 
 }    // namespace LAMMPS_NS