diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 5f50486e28..c52246b06b 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
 
 # host code compilation
 
-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
+$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+	
+$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
 
 #ifdef CUDPP_OPT
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 498c55ceba..38058bab55 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() {
   const int BX=this->block_size();
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));
-/*
-  this->time_pair.start();
 
+  this->time_pair.start();
+  int ngridyz = this->_ngridy * this->_ngridz;
   this->k_fphi_uind.set_size(GX,BX);
-  this->k_fphi_uind.run();
+  this->k_fphi_uind.run(&this->atom->x, &this->_thetai1,
+                        &this->_thetai2, &this->_thetai3,
+                        &this->_igrid, &this->_cgrid_brick,
+                        &this->_fdip_phi1, &this->_fdip_phi2,
+                        &this->_fdip_sum_phi, &this->_bsorder,
+                        &ainum, &ngridyz, &this->_ngridy,
+                        &this->_threads_per_atom);
   this->time_pair.stop();
-*/
 
   return GX;
 }
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 1239764108..984154f16e 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 ------------------------------------------------------------------------- */
 
 __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
-                          const __global numtyp4 *restrict thetai1,
-                          const __global numtyp4 *restrict thetai2,
-                          const __global numtyp4 *restrict thetai3,
-                          const __global int4 *restrict igrid,
-                          const __global numtyp4 *restrict grid,
-                          __global numtyp4 *restrict fdip_phi1,
-                          __global numtyp4 *restrict fdip_phi2,
-                          __global numtyp4 *restrict fdip_sum_phi,
+                          const __global numtyp *restrict thetai1,
+                          const __global numtyp *restrict thetai2,
+                          const __global numtyp *restrict thetai3,
+                          const __global int *restrict igrid,
+                          const __global numtyp *restrict grid,
+                          __global numtyp *restrict fdip_phi1,
+                          __global numtyp *restrict fdip_phi2,
+                          __global numtyp *restrict fdip_sum_phi,
                           const int bsorder, const int inum,
+                          const int nyzgrid, const int nygrid,
                           const int t_per_atom)
 {
   int tid, ii, offset, i, n_stride;
@@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
 
     // extract the permanent multipole field at each site
 
-    tuv100_1 = 0.0;
-    tuv010_1 = 0.0;
-    tuv001_1 = 0.0;
-    tuv200_1 = 0.0;
-    tuv020_1 = 0.0;
-    tuv002_1 = 0.0;
-    tuv110_1 = 0.0;
-    tuv101_1 = 0.0;
-    tuv011_1 = 0.0;
-    tuv100_2 = 0.0;
-    tuv010_2 = 0.0;
-    tuv001_2 = 0.0;
-    tuv200_2 = 0.0;
-    tuv020_2 = 0.0;
-    tuv002_2 = 0.0;
-    tuv110_2 = 0.0;
-    tuv101_2 = 0.0;
-    tuv011_2 = 0.0;
-    tuv000 = 0.0;
-    tuv001 = 0.0;
-    tuv010 = 0.0;
-    tuv100 = 0.0;
-    tuv200 = 0.0;
-    tuv020 = 0.0;
-    tuv002 = 0.0;
-    tuv110 = 0.0;
-    tuv101 = 0.0;
-    tuv011 = 0.0;
-    tuv300 = 0.0;
-    tuv030 = 0.0;
-    tuv003 = 0.0;
-    tuv210 = 0.0;
-    tuv201 = 0.0;
-    tuv120 = 0.0;
-    tuv021 = 0.0;
-    tuv102 = 0.0;
-    tuv012 = 0.0;
-    tuv111 = 0.0;
+    tuv100_1 = (numtyp)0.0;
+    tuv010_1 = (numtyp)0.0;
+    tuv001_1 = (numtyp)0.0;
+    tuv200_1 = (numtyp)0.0;
+    tuv020_1 = (numtyp)0.0;
+    tuv002_1 = (numtyp)0.0;
+    tuv110_1 = (numtyp)0.0;
+    tuv101_1 = (numtyp)0.0;
+    tuv011_1 = (numtyp)0.0;
+    tuv100_2 = (numtyp)0.0;
+    tuv010_2 = (numtyp)0.0;
+    tuv001_2 = (numtyp)0.0;
+    tuv200_2 = (numtyp)0.0;
+    tuv020_2 = (numtyp)0.0;
+    tuv002_2 = (numtyp)0.0;
+    tuv110_2 = (numtyp)0.0;
+    tuv101_2 = (numtyp)0.0;
+    tuv011_2 = (numtyp)0.0;
+    tuv000 = (numtyp)0.0;
+    tuv001 = (numtyp)0.0;
+    tuv010 = (numtyp)0.0;
+    tuv100 = (numtyp)0.0;
+    tuv200 = (numtyp)0.0;
+    tuv020 = (numtyp)0.0;
+    tuv002 = (numtyp)0.0;
+    tuv110 = (numtyp)0.0;
+    tuv101 = (numtyp)0.0;
+    tuv011 = (numtyp)0.0;
+    tuv300 = (numtyp)0.0;
+    tuv030 = (numtyp)0.0;
+    tuv003 = (numtyp)0.0;
+    tuv210 = (numtyp)0.0;
+    tuv201 = (numtyp)0.0;
+    tuv120 = (numtyp)0.0;
+    tuv021 = (numtyp)0.0;
+    tuv102 = (numtyp)0.0;
+    tuv012 = (numtyp)0.0;
+    tuv111 = (numtyp)0.0;
 
-    k = igrid[i].z - nlpts;
+    k = igrid[3*i+2] - nlpts;
     for (int kb = 0; kb < bsorder; kb++) {
       /*
       v0 = thetai3[m][kb][0];
@@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       v2 = thetai3[m][kb][2];
       v3 = thetai3[m][kb][3];
       */
-      tu00_1 = 0.0;
-      tu01_1 = 0.0;
-      tu10_1 = 0.0;
-      tu20_1 = 0.0;
-      tu11_1 = 0.0;
-      tu02_1 = 0.0;
-      tu00_2 = 0.0;
-      tu01_2 = 0.0;
-      tu10_2 = 0.0;
-      tu20_2 = 0.0;
-      tu11_2 = 0.0;
-      tu02_2 = 0.0;
-      tu00 = 0.0;
-      tu10 = 0.0;
-      tu01 = 0.0;
-      tu20 = 0.0;
-      tu11 = 0.0;
-      tu02 = 0.0;
-      tu30 = 0.0;
-      tu21 = 0.0;
-      tu12 = 0.0;
-      tu03 = 0.0;
+      int i3 = m*4*bsorder + 4*kb;
+      v0 = thetai3[i3];
+      v1 = thetai3[i3]+1;
+      v2 = thetai3[i3+2];
+      v3 = thetai3[i3+3];
+      tu00_1 = (numtyp)0.0;
+      tu01_1 = (numtyp)0.0;
+      tu10_1 = (numtyp)0.0;
+      tu20_1 = (numtyp)0.0;
+      tu11_1 = (numtyp)0.0;
+      tu02_1 = (numtyp)0.0;
+      tu00_2 = (numtyp)0.0;
+      tu01_2 = (numtyp)0.0;
+      tu10_2 = (numtyp)0.0;
+      tu20_2 = (numtyp)0.0;
+      tu11_2 = (numtyp)0.0;
+      tu02_2 = (numtyp)0.0;
+      tu00 = (numtyp)0.0;
+      tu10 = (numtyp)0.0;
+      tu01 = (numtyp)0.0;
+      tu20 = (numtyp)0.0;
+      tu11 = (numtyp)0.0;
+      tu02 = (numtyp)0.0;
+      tu30 = (numtyp)0.0;
+      tu21 = (numtyp)0.0;
+      tu12 = (numtyp)0.0;
+      tu03 = (numtyp)0.0;
 
-      j = igrid[i].y - nlpts;
+      j = igrid[3*i+1] - nlpts;
       for (int jb = 0; jb < bsorder; jb++) {
         /*
         u0 = thetai2[m][jb][0];
@@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
         u2 = thetai2[m][jb][2];
         u3 = thetai2[m][jb][3];
         */
-        t0_1 = 0.0;
-        t1_1 = 0.0;
-        t2_1 = 0.0;
-        t0_2 = 0.0;
-        t1_2 = 0.0;
-        t2_2 = 0.0;
-        t3 = 0.0;
+        int i2 = m*4*bsorder+4*jb;
+        u0 = thetai2[i2];
+        u1 = thetai2[i2+1];
+        u2 = thetai2[i2+2];
+        u3 = thetai2[i2+3];
+        t0_1 = (numtyp)0.0;
+        t1_1 = (numtyp)0.0;
+        t2_1 = (numtyp)0.0;
+        t0_2 = (numtyp)0.0;
+        t1_2 = (numtyp)0.0;
+        t2_2 = (numtyp)0.0;
+        t3 = (numtyp)0.0;
 
-        i = igrid[m].x - nlpts;
+        int ii = igrid[3*i] - nlpts;
         for (int ib = 0; ib < bsorder; ib++) {
           /*
-          tq_1 = grid[k][j][i][0];
-          tq_2 = grid[k][j][i][1];
+          tq_1 = grid[k][j][ii][0];
+          tq_2 = grid[k][j][ii][1];
           t0_1 += tq_1*thetai1[m][ib][0];
           t1_1 += tq_1*thetai1[m][ib][1];
           t2_1 += tq_1*thetai1[m][ib][2];
@@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
           t2_2 += tq_2*thetai1[m][ib][2];
           t3 += (tq_1+tq_2)*thetai1[m][ib][3];
           */
-          i++;
+          int i1 = m*4*bsorder+4*ib;
+          numtyp w0 = thetai1[i1];
+          numtyp w1 = thetai1[i1+1];
+          numtyp w2 = thetai1[i1+2];
+          numtyp w3 = thetai1[i1+3];
+          int gidx = 2*(k*nyzgrid + j*nygrid + ii);
+          tq_1 = grid[gidx];
+          tq_2 = grid[gidx+1];
+          t0_1 += tq_1*w0;
+          t1_1 += tq_1*w1;
+          t2_1 += tq_1*w2;
+          t0_2 += tq_2*w0;
+          t1_2 += tq_2*w1;
+          t2_2 += tq_2*w2;
+          t3 += (tq_1+tq_2)*w3;
+          ii++;
         }
 
         tu00_1 += t0_1*u0;
@@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       tuv111 += tu11*v1;
       k++;
     }
+
 /*
     fdip_phi1[m][0] = 0.0;
     fdip_phi1[m][1] = tuv100_1;
@@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     fdip_phi1[m][7] = tuv110_1;
     fdip_phi1[m][8] = tuv101_1;
     fdip_phi1[m][9] = tuv011_1;
-
-    fdip_phi2[m][0] = 0.0;
-    fdip_phi2[m][1] = tuv100_2;
-    fdip_phi2[m][2] = tuv010_2;
-    fdip_phi2[m][3] = tuv001_2;
-    fdip_phi2[m][4] = tuv200_2;
-    fdip_phi2[m][5] = tuv020_2;
-    fdip_phi2[m][6] = tuv002_2;
-    fdip_phi2[m][7] = tuv110_2;
-    fdip_phi2[m][8] = tuv101_2;
-    fdip_phi2[m][9] = tuv011_2;
-
-    fdip_sum_phi[m][0] = tuv000;
-    fdip_sum_phi[m][1] = tuv100;
-    fdip_sum_phi[m][2] = tuv010;
-    fdip_sum_phi[m][3] = tuv001;
-    fdip_sum_phi[m][4] = tuv200;
-    fdip_sum_phi[m][5] = tuv020;
-    fdip_sum_phi[m][6] = tuv002;
-    fdip_sum_phi[m][7] = tuv110;
-    fdip_sum_phi[m][8] = tuv101;
-    fdip_sum_phi[m][9] = tuv011;
-    fdip_sum_phi[m][10] = tuv300;
-    fdip_sum_phi[m][11] = tuv030;
-    fdip_sum_phi[m][12] = tuv003;
-    fdip_sum_phi[m][13] = tuv210;
-    fdip_sum_phi[m][14] = tuv201;
-    fdip_sum_phi[m][15] = tuv120;
-    fdip_sum_phi[m][16] = tuv021;
-    fdip_sum_phi[m][17] = tuv102;
-    fdip_sum_phi[m][18] = tuv012;
-    fdip_sum_phi[m][19] = tuv111;
 */
+    int idx = 10*m;
+    fdip_phi1[idx+0] = (numtyp)0.0;
+    fdip_phi1[idx+1] = tuv100_1;
+    fdip_phi1[idx+2] = tuv010_1;
+    fdip_phi1[idx+3] = tuv001_1;
+    fdip_phi1[idx+4] = tuv200_1;
+    fdip_phi1[idx+5] = tuv020_1;
+    fdip_phi1[idx+6] = tuv002_1;
+    fdip_phi1[idx+7] = tuv110_1;
+    fdip_phi1[idx+8] = tuv101_1;
+    fdip_phi1[idx+9] = tuv011_1;
+
+    fdip_phi2[idx+0] = (numtyp)0.0;
+    fdip_phi2[idx+1] = tuv100_2;
+    fdip_phi2[idx+2] = tuv010_2;
+    fdip_phi2[idx+3] = tuv001_2;
+    fdip_phi2[idx+4] = tuv200_2;
+    fdip_phi2[idx+5] = tuv020_2;
+    fdip_phi2[idx+6] = tuv002_2;
+    fdip_phi2[idx+7] = tuv110_2;
+    fdip_phi2[idx+8] = tuv101_2;
+    fdip_phi2[idx+9] = tuv011_2;
+
+    idx = 20*m;
+    fdip_sum_phi[idx+0] = tuv000;
+    fdip_sum_phi[idx+1] = tuv100;
+    fdip_sum_phi[idx+2] = tuv010;
+    fdip_sum_phi[idx+3] = tuv001;
+    fdip_sum_phi[idx+4] = tuv200;
+    fdip_sum_phi[idx+5] = tuv020;
+    fdip_sum_phi[idx+6] = tuv002;
+    fdip_sum_phi[idx+7] = tuv110;
+    fdip_sum_phi[idx+8] = tuv101;
+    fdip_sum_phi[idx+9] = tuv011;
+    fdip_sum_phi[idx+10] = tuv300;
+    fdip_sum_phi[idx+11] = tuv030;
+    fdip_sum_phi[idx+12] = tuv003;
+    fdip_sum_phi[idx+13] = tuv210;
+    fdip_sum_phi[idx+14] = tuv201;
+    fdip_sum_phi[idx+15] = tuv120;
+    fdip_sum_phi[idx+16] = tuv021;
+    fdip_sum_phi[idx+17] = tuv102;
+    fdip_sum_phi[idx+18] = tuv012;
+    fdip_sum_phi[idx+19] = tuv111;
   }
 }
 
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index c18b10675b..cd5a9abf81 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
   k_multipole.clear();
   k_udirect2b.clear();
   k_umutual2b.clear();
+  k_fphi_uind.clear();
   k_polar.clear();
   k_special15.clear();
   k_short_nbor.clear();
@@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() {
   _thetai1.clear();
   _thetai2.clear();
   _thetai3.clear();
+  _igrid.clear();
+  _fdip_phi1.clear();
+  _fdip_phi2.clear();
+  _cgrid_brick.clear();
+  _fdip_sum_phi.clear();
   dev_nspecial15.clear();
   dev_special15.clear();
   dev_special15_t.clear();
@@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
 template <class numtyp, class acctyp>
 void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
                                     double **host_thetai1, double **host_thetai2,
-                                    double **host_thetai3, int** host_igrid) {
+                                    double **host_thetai3, int** host_igrid,
+                                    double* grid_brick_start, int nzlo_out,
+                                    int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out) {
   
   _bsorder = bsorder;
 
+  // allocate or resize per-atom arrays
+  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
+  //   will be consolidated once all terms are ready
+
   if (_max_thetai_size == 0) {
     _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY);
+
+    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
+
   } else {
     if (inum_full>_max_thetai_size) {
       _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
       _thetai1.resize(_max_thetai_size*bsorder*4);
       _thetai2.resize(_max_thetai_size*bsorder*4);
       _thetai3.resize(_max_thetai_size*bsorder*4);
-      _igrid.resize(_max_thetai_size);
+      _igrid.resize(_max_thetai_size*3);
+
+      _fdip_phi1.resize(_max_thetai_size*10);
+      _fdip_phi2.resize(_max_thetai_size*10);
+      _fdip_sum_phi.resize(_max_thetai_size*20);
     }
   }
 
-  memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int));
-  _thetai1.update_device(inum_full*bsorder*4,true);
-  _thetai2.update_device(inum_full*bsorder*4,true);
-  _thetai3.update_device(inum_full*bsorder*4,true);
-  _igrid.update_device(inum_full,true);
+  UCL_H_Vec<double> dview;
+
+  // copy from host to device
+  
+  dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai1,dview,false);
+  dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai2,dview,false);
+  dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai3,dview,false);
+
+  UCL_H_Vec<int> dview_int;
+  dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device));
+  ucl_copy(_igrid,dview_int,false);
+
+  _nzlo_out = nzlo_out;
+  _nzhi_out = nzhi_out;
+  _nylo_out = nylo_out;
+  _nyhi_out = nyhi_out;
+  _nxlo_out = nxlo_out;
+  _nxhi_out = nxhi_out;
+  _ngridz = nzhi_out - nzlo_out + 1;
+  _ngridy = nyhi_out - nylo_out + 1;
+  _ngridx = nxhi_out - nxlo_out + 1;
+  _num_grid_points = _ngridx*_ngridy*_ngridz*2;
+  dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
+  ucl_copy(_cgrid_brick,dview,false);
+
 }
 
 // ---------------------------------------------------------------------------
@@ -593,18 +635,35 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
                                     double **host_thetai1, double **host_thetai2,
                                     double **host_thetai3, int** igrid,
-                                    double ****host_cgrid_brick, double **host_fdip_phi1,
-                                    double **host_fdip_phi2, double **host_fdip_sum_phi)
+                                    double *host_grid_brick_start, double **host_fdip_phi1,
+                                    double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out)
 {
-  // once allocation and transfers
-  precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid);
+  // allocation/resize and transfers (do this right after udirect?)
   
-  // resize grid if needed, then copy from host to device
-  // cgrid_brick.alloc()/resize()
-  // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0];
-  //
+  precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
+                    igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
+                    nxlo_out, nxhi_out);
 
-  const int red_bllocks = fphi_uind();
+  // update the cgrid_brick with data host
+  
+  _nzlo_out = nzlo_out;
+  _nzhi_out = nzhi_out;
+  _nylo_out = nylo_out;
+  _nyhi_out = nyhi_out;
+  _nxlo_out = nxlo_out;
+  _nxhi_out = nxhi_out;
+  _ngridz = nzhi_out - nzlo_out + 1;
+  _ngridy = nyhi_out - nylo_out + 1;
+  _ngridx = nxhi_out - nxlo_out + 1;
+  _num_grid_points = _ngridx*_ngridy*_ngridz*2;
+
+  UCL_H_Vec<double> dview;
+  dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device));
+  ucl_copy(_cgrid_brick,dview,false);
+
+  const int red_blocks = fphi_uind();
 }
 
 
@@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   k_udirect2b.set_function(*pair_program,kname_udirect2b);
   k_umutual2b.set_function(*pair_program,kname_umutual2b);
   k_polar.set_function(*pair_program,kname_polar);
+  k_fphi_uind.set_function(*pair_program,"kname_fphi_uind");
   k_short_nbor.set_function(*pair_program,kname_short_nbor);
   k_special15.set_function(*pair_program,kname_special15);
   pos_tex.get_texture(*pair_program,"pos_tex");
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index f333bdf9a6..8503e6fba4 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -152,7 +152,10 @@ class BaseAmoeba {
 
   virtual void precompute_induce(const int inum_full, const int bsorder,
                                  double **host_thetai1, double **host_thetai2,
-                                 double **host_thetai3, int** igrid);
+                                 double **host_thetai3, int** igrid,
+                                 double* grid_brick_start, int nzlo_out,
+                                 int nzhi_out, int nylo_out, int nyhi_out,
+                                 int nxlo_out, int nxhi_out);
 
   /// Compute multipole real-space with device neighboring
   virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
@@ -179,8 +182,10 @@ class BaseAmoeba {
   virtual void compute_fphi_uind(const int inum_full, const int bsorder,
                                  double **host_thetai1, double **host_thetai2,
                                  double **host_thetai3, int** igrid,
-                                 double ****host_grid, double **host_fdip_phi1,
-                                 double **host_fdip_phi2, double **host_fdip_sum_phi);
+                                 double *host_grid_brick_start, double **host_fdip_phi1,
+                                 double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                 int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                 int nxlo_out, int nxhi_out);
 
   /// Compute polar real-space with device neighboring
   virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@@ -249,9 +254,12 @@ class BaseAmoeba {
   int _nmax, _max_tep_size, _max_fieldp_size;
 
   int _bsorder;
-  UCL_Vector<numtyp,numtyp> _thetai1, _thetai2, _thetai3;
-  UCL_Vector<int4,int4> _igrid;
+  UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3, _cgrid_brick;
+  UCL_D_Vec<int> _igrid;
+  UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
   int _max_thetai_size;
+  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
+  int _ngridx, _ngridy, _ngridz, _num_grid_points;
 
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
@@ -272,7 +280,7 @@ class BaseAmoeba {
 
   // ------------------------- DEVICE KERNELS -------------------------
   UCL_Program *pair_program;
-  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
+  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind;
   UCL_Kernel k_special15, k_short_nbor;
   inline int block_size() { return _block_size; }
   inline void set_kernel(const int eflag, const int vflag) {}
@@ -305,7 +313,6 @@ class BaseAmoeba {
   virtual int umutual2b(const int eflag, const int vflag) = 0;
   virtual int fphi_uind() = 0;
   virtual int polar_real(const int eflag, const int vflag) = 0;
-
   
 
   #if !defined(USE_OPENCL) && !defined(USE_HIP)