diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 4a26f7f98d..b0013f0b9b 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -14,7 +14,7 @@
 // ***************************************************************************
 
 #if defined(NV_KERNEL) || defined(USE_HIP)
-//#include <stdio.h>
+#include <stdio.h>
 #include "lal_aux_fun1.h"
 #ifdef LAMMPS_SMALLBIG
 #define tagint int
@@ -1630,14 +1630,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
                           __global numtyp *restrict fdip_phi2,
                           __global numtyp *restrict fdip_sum_phi,
                           const int bsorder, const int inum,
-                          const int nyzgrid, const int nygrid,
-                          const int t_per_atom)
+                          const int nzlo_out, const int nzhi_out,
+                          const int nylo_out, const int nyhi_out,
+                          const int nxlo_out, const int nxhi_out,
+                          const int ngridxy, const int ngridx)
 {
-  int tid, ii, offset, i, n_stride;
-  atom_info(t_per_atom,ii,tid,offset);
+  //int tid, ii, offset, i, n_stride;
+  //atom_info(t_per_atom,ii,tid,offset);
+
+  int tid=THREAD_ID_X;
+  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
 
   if (ii<inum) {
-    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp4 ix; fetch4(ix,ii,pos_tex); //x_[ii];
 
     int j,k,m;
     numtyp v0,v1,v2,v3;
@@ -1706,7 +1711,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     tuv012 = (numtyp)0.0;
     tuv111 = (numtyp)0.0;
 
-    k = igrid[4*i+2] - nlpts;
+    k = igrid[4*ii+2] - nzlo_out - nlpts;
     for (int kb = 0; kb < bsorder; kb++) {
       /*
       v0 = thetai3[m][kb][0];
@@ -1714,9 +1719,9 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       v2 = thetai3[m][kb][2];
       v3 = thetai3[m][kb][3];
       */
-      int i3 = i*4*bsorder + 4*kb;
+      int i3 = ii*4*bsorder + 4*kb;
       v0 = thetai3[i3];
-      v1 = thetai3[i3]+1;
+      v1 = thetai3[i3+1];
       v2 = thetai3[i3+2];
       v3 = thetai3[i3+3];
       tu00_1 = (numtyp)0.0;
@@ -1742,7 +1747,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       tu12 = (numtyp)0.0;
       tu03 = (numtyp)0.0;
 
-      j = igrid[4*i+1] - nlpts;
+      j = igrid[4*ii+1] - nylo_out - nlpts;
       for (int jb = 0; jb < bsorder; jb++) {
         /*
         u0 = thetai2[m][jb][0];
@@ -1750,7 +1755,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
         u2 = thetai2[m][jb][2];
         u3 = thetai2[m][jb][3];
         */
-        int i2 = i*4*bsorder+4*jb;
+        int i2 = ii*4*bsorder+4*jb;
         u0 = thetai2[i2];
         u1 = thetai2[i2+1];
         u2 = thetai2[i2+2];
@@ -1763,11 +1768,11 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
         t2_2 = (numtyp)0.0;
         t3 = (numtyp)0.0;
 
-        int ii = igrid[4*i] - nlpts;
+        int i = igrid[4*ii] - nxlo_out - nlpts;
         for (int ib = 0; ib < bsorder; ib++) {
           /*
-          tq_1 = grid[k][j][ii][0];
-          tq_2 = grid[k][j][ii][1];
+          tq_1 = grid[k][j][i][0];
+          tq_2 = grid[k][j][i][1];
           t0_1 += tq_1*thetai1[m][ib][0];
           t1_1 += tq_1*thetai1[m][ib][1];
           t2_1 += tq_1*thetai1[m][ib][2];
@@ -1776,14 +1781,19 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
           t2_2 += tq_2*thetai1[m][ib][2];
           t3 += (tq_1+tq_2)*thetai1[m][ib][3];
           */
-          int i1 = i*4*bsorder+4*ib;
+          int i1 = ii*4*bsorder+4*ib;
           numtyp w0 = thetai1[i1];
           numtyp w1 = thetai1[i1+1];
           numtyp w2 = thetai1[i1+2];
           numtyp w3 = thetai1[i1+3];
-          int gidx = 2*(k*nyzgrid + j*nygrid + ii);
+          int gidx = 2*(k*ngridxy + j*ngridx + i);
           tq_1 = grid[gidx];
           tq_2 = grid[gidx+1];
+/*
+          if (ii == 0 && jb == 0 && kb == 0)
+            printf("ii = 0: igrid %d %d %d; grid %f %f; k = %d j = %d; i = %d; origin = %f %f; gidx = %d\n",
+              igrid[4*ii+0], igrid[4*ii+1], igrid[4*ii+2], tq_1, tq_2, k, j, i, grid[0], grid[1], gidx);
+*/          
           t0_1 += tq_1*w0;
           t1_1 += tq_1*w1;
           t2_1 += tq_1*w2;
@@ -1791,7 +1801,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
           t1_2 += tq_2*w1;
           t2_2 += tq_2*w2;
           t3 += (tq_1+tq_2)*w3;
-          ii++;
+          i++;
         }
 
         tu00_1 += t0_1*u0;
@@ -1875,7 +1885,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     fdip_phi1[m][8] = tuv101_1;
     fdip_phi1[m][9] = tuv011_1;
 */
-    int idx = 10*m;
+    int idx = 10*ii;
     fdip_phi1[idx+0] = (numtyp)0.0;
     fdip_phi1[idx+1] = tuv100_1;
     fdip_phi1[idx+2] = tuv010_1;
@@ -1886,7 +1896,18 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     fdip_phi1[idx+7] = tuv110_1;
     fdip_phi1[idx+8] = tuv101_1;
     fdip_phi1[idx+9] = tuv011_1;
-
+/*
+    fdip_phi2[m][0] = 0.0;
+    fdip_phi2[m][1] = tuv100_2;
+    fdip_phi2[m][2] = tuv010_2;
+    fdip_phi2[m][3] = tuv001_2;
+    fdip_phi2[m][4] = tuv200_2;
+    fdip_phi2[m][5] = tuv020_2;
+    fdip_phi2[m][6] = tuv002_2;
+    fdip_phi2[m][7] = tuv110_2;
+    fdip_phi2[m][8] = tuv101_2;
+    fdip_phi2[m][9] = tuv011_2;
+*/    
     fdip_phi2[idx+0] = (numtyp)0.0;
     fdip_phi2[idx+1] = tuv100_2;
     fdip_phi2[idx+2] = tuv010_2;
@@ -1898,7 +1919,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     fdip_phi2[idx+8] = tuv101_2;
     fdip_phi2[idx+9] = tuv011_2;
 
-    idx = 20*m;
+    idx = 20*ii;
     fdip_sum_phi[idx+0] = tuv000;
     fdip_sum_phi[idx+1] = tuv100;
     fdip_sum_phi[idx+2] = tuv010;
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 151c38c9c4..95b7237e46 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -165,12 +165,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
 void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
                           double ***host_thetai1, double ***host_thetai2,
                           double ***host_thetai3, int** igrid,
-                          double *host_grid_brick_start, void **host_fdip_phi1,
-                          void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          double *host_grid_brick_start, double ****host_grid_brick,
+                          void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
                           int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                           int nxlo_out, int nxhi_out, bool& first_iteration) {
    AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
-                          host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1,
+                          host_thetai3, igrid, host_grid_brick_start, host_grid_brick, host_fdip_phi1,
                           host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
                           nylo_out, nyhi_out,  nxlo_out, nxhi_out, first_iteration);
 }
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index af8d5ca481..21a97a0852 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -555,7 +555,7 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 // Prepare for umutual1() after bspline_fill() is done on host
 //   - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
 //     host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
-//     host_igrid is allocated with nmax by by 4
+//     host_igrid is allocated with nmax by 4
 //   - transfer extra data from host to device
 // ---------------------------------------------------------------------------
 
@@ -563,8 +563,9 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
                                     double ***host_thetai1, double ***host_thetai2,
                                     double ***host_thetai3, int** host_igrid,
-                                    double* host_grid_brick_start, int nzlo_out,
-                                    int nzhi_out, int nylo_out, int nyhi_out,
+                                    double* host_grid_brick_start, double**** host_grid_brick,
+                                    int nzlo_out, int nzhi_out,
+                                    int nylo_out, int nyhi_out,
                                     int nxlo_out, int nxhi_out) {
   
   _bsorder = bsorder;
@@ -599,7 +600,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   }
 
   UCL_H_Vec<double> dview;
-  dview.alloc(inum_full*bsorder*4,*(this->ucl_device));
+  dview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));
 
   // pack host data to device
 
@@ -634,7 +635,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   ucl_copy(_thetai3,dview,false);
 
   UCL_H_Vec<int> dview_int;
-  dview_int.alloc(inum_full*4, *(this->ucl_device));
+  dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
   for (int i = 0; i < inum_full; i++) {
     int idx = i*4;
     dview_int[idx+0] = host_igrid[i][0];
@@ -643,6 +644,33 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   }
   ucl_copy(_igrid, dview_int, false);
 
+  
+  
+}
+
+// ---------------------------------------------------------------------------
+// fphi_uind = induced potential from grid
+// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
+                                    double ***host_thetai1, double ***host_thetai2,
+                                    double ***host_thetai3, int** igrid,
+                                    double *host_grid_brick_start, double ****host_grid_brick,
+                                    void** host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
+                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out, bool& first_iteration)
+{
+  // allocation/resize and transfers before the first iteration
+  
+  if (first_iteration) {
+    precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
+                      igrid, host_grid_brick_start, host_grid_brick, nzlo_out, nzhi_out,
+                      nylo_out, nyhi_out, nxlo_out, nxhi_out);
+    if (first_iteration) first_iteration = false;
+  }
+
   // update the cgrid_brick with data host
   
   _nzlo_out = nzlo_out;
@@ -656,36 +684,27 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   _ngridx = nxhi_out - nxlo_out + 1;
   _num_grid_points = _ngridx * _ngridy * _ngridz;
 
-  UCL_H_Vec<double> dview_cgrid;
-  dview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device));
+  UCL_H_Vec<double> hview_cgrid;
+  hview_cgrid.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_WRITE);
+  int n = 0;
+  for (int iz = nzlo_out; iz <= nzhi_out; iz++)
+    for (int iy = nylo_out; iy <= nyhi_out; iy++)
+      for (int ix = nxlo_out; ix <= nxhi_out; ix++) {
+/*
+        if (iz == nzlo_out && iy == nylo_out && ix == nxlo_out) {
+          printf("origin = %d %d %d: grid = %f %f\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1]);
+        }
+        if (iz == -2 && iy == 4 && ix == 8) printf("ixyz = %d %d %d: grid = %f %f; n = %d\n", iz, iy, ix, host_grid_brick[iz][iy][ix][0], host_grid_brick[iz][iy][ix][1], n);
+*/
+        hview_cgrid[n] = host_grid_brick[iz][iy][ix][0];
+        hview_cgrid[n+1] = host_grid_brick[iz][iy][ix][1];
+        n += 2;
+      }
+  //hview_cgrid.view(host_grid_brick_start, _num_grid_points*2, *(this->ucl_device));
   _cgrid_brick.alloc(_num_grid_points*2, *(this->ucl_device), UCL_READ_ONLY);
-  ucl_copy(_cgrid_brick,dview_cgrid,false);
+  ucl_copy(_cgrid_brick,hview_cgrid,false);
 
-}
 
-// ---------------------------------------------------------------------------
-// fphi_uind = induced potential from grid
-// fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
-// ---------------------------------------------------------------------------
-
-template <class numtyp, class acctyp>
-void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
-                                    double ***host_thetai1, double ***host_thetai2,
-                                    double ***host_thetai3, int** igrid,
-                                    double *host_grid_brick_start, void** host_fdip_phi1,
-                                    void **host_fdip_phi2, void **host_fdip_sum_phi,
-                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
-                                    int nxlo_out, int nxhi_out, bool& first_iteration)
-{
-  // allocation/resize and transfers before the first iteration
-  
-  if (first_iteration) {
-    precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
-                      igrid, host_grid_brick_start, nzlo_out, nzhi_out,
-                      nylo_out, nyhi_out, nxlo_out, nxhi_out);
-    if (first_iteration) first_iteration = false;
-  }
-    
   const int red_blocks = fphi_uind();
 
   _fdip_phi1.update_host(_max_thetai_size*10);
@@ -711,16 +730,16 @@ int BaseAmoebaT::fphi_uind() {
 
   // Compute the block size and grid size to keep all cores busy
   const int BX=block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
-                               (BX/_threads_per_atom)));
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
 
   time_pair.start();
-  int ngridyz = _ngridy * _ngridz;
+  int ngridxy = _ngridx * _ngridy;
   k_fphi_uind.set_size(GX,BX);
   k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3,
                   &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2,
-                  &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy,
-                  &_threads_per_atom);
+                  &_fdip_sum_phi, &_bsorder, &ainum, 
+                  &_nzlo_out, &_nzhi_out, &_nylo_out, &_nyhi_out, 
+                  &_nxlo_out, &_nxhi_out, &ngridxy, &_ngridx);
   time_pair.stop();
 
   return GX;
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index a001423812..c2c2a2d93d 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -153,8 +153,9 @@ class BaseAmoeba {
   virtual void precompute_induce(const int inum_full, const int bsorder,
                                  double ***host_thetai1, double ***host_thetai2,
                                  double ***host_thetai3, int** igrid,
-                                 double* grid_brick_start, int nzlo_out,
-                                 int nzhi_out, int nylo_out, int nyhi_out,
+                                 double *host_grid_brick_start, double ****host_grid_brick,
+                                 int nzlo_out, int nzhi_out,
+                                 int nylo_out, int nyhi_out,
                                  int nxlo_out, int nxhi_out);
 
   /// Compute multipole real-space with device neighboring
@@ -182,8 +183,8 @@ class BaseAmoeba {
   virtual void compute_fphi_uind(const int inum_full, const int bsorder,
                                  double ***host_thetai1, double ***host_thetai2,
                                  double ***host_thetai3, int** igrid,
-                                 double *host_grid_brick_start, void **host_fdip_phi1,
-                                 void **host_fdip_phi2, void **host_fdip_sum_phi,
+                                 double *host_grid_brick_start, double ****host_grid_brick,
+                                 void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
                                  int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                                  int nxlo_out, int nxhi_out, bool& first_iteration);
 
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index bf6db3472d..936cf8afbc 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -91,7 +91,7 @@ void amoeba_gpu_update_fieldp(void **fieldp_ptr);
 void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
                           double ***host_thetai1, double ***host_thetai2,
                           double ***host_thetai3, int** igrid,
-                          double *host_grid_brick_start, void **host_fdip_phi1,
+                          double *host_grid_brick_start, double ****host_grid_brick, void **host_fdip_phi1,
                           void **host_fdip_phi2, void **host_fdip_sum_phi,
                           int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
                           int nxlo_out, int nxhi_out, bool& first_iteration);
@@ -121,7 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   gpu_multipole_real_ready = true;     // need to be true for precompute()
   gpu_udirect2b_ready = true;
   gpu_umutual1_ready = true;
-  gpu_fphi_uind_ready = false;
+  gpu_fphi_uind_ready = true;
   gpu_umutual2b_ready = true;
   gpu_polar_real_ready = true;         // need to be true for copying data from device back to host
 
@@ -1139,7 +1139,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
   void* fdip_phi2_pinned = nullptr;
   void* fdip_sum_phi_pinned = nullptr;
   amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3,
-                        igrid, ic_kspace->grid_brick_start,
+                        igrid, ic_kspace->grid_brick_start, grid,
                         &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned,
                         ic_kspace->nzlo_out, ic_kspace->nzhi_out,
                         ic_kspace->nylo_out, ic_kspace->nyhi_out,
@@ -1150,8 +1150,10 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
   double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
   for (int i = 0; i < nlocal; i++) {
     int idx = 10 * i;
-    for (int m = 0; m < 10; m++)
-      fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
+    for (int m = 0; m < 10; m++) {
+       fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
+    }
+    if (i == 0) printf("gpu fdip phi1 = %f %f %f\n", fdip_phi1[i][0], fdip_phi1[i][1], fdip_phi1[i][2]);      
   }
 
   double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
@@ -1159,6 +1161,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
     int idx = 10 * i;
     for (int m = 0; m < 10; m++)
       fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m];
+    if (i == 0) printf("gpu fdip phi2 = %f %f %f\n", fdip_phi2[i][0], fdip_phi2[i][1], fdip_phi2[i][2]);      
   }
 
   double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
@@ -1166,6 +1169,7 @@ void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
     int idx = 20 * i;
     for (int m = 0; m < 20; m++)
       fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m];
+    if (i == 0) printf("gpu fdip sum phi = %f %f %f\n", fdip_sum_phi[i][0], fdip_sum_phi[i][1], fdip_sum_phi[i][2]);            
   }
 }