Working on the fphi_uind kernel and array allocations

2022-08-30 23:40:04 -05:00
parent c5c3c697df
commit aac264f2e2
5 changed files with 283 additions and 148 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -68,7 +68,31 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)

 # host code compilation

-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
+$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
+	$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
+	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
+	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
+	
+$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)

 #ifdef CUDPP_OPT
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@ -275,13 +275,18 @@ int AmoebaT::fphi_uind() {
  const int BX=this->block_size();
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
-/*
-  this->time_pair.start();

+  this->time_pair.start();
+  int ngridyz = this->_ngridy * this->_ngridz;
  this->k_fphi_uind.set_size(GX,BX);
-  this->k_fphi_uind.run();
+  this->k_fphi_uind.run(&this->atom->x, &this->_thetai1,
+                        &this->_thetai2, &this->_thetai3,
+                        &this->_igrid, &this->_cgrid_brick,
+                        &this->_fdip_phi1, &this->_fdip_phi2,
+                        &this->_fdip_sum_phi, &this->_bsorder,
+                        &ainum, &ngridyz, &this->_ngridy,
+                        &this->_threads_per_atom);
  this->time_pair.stop();
-*/

  return GX;
 }
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@ -1621,15 +1621,16 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 ------------------------------------------------------------------------- */

 __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
-                          const __global numtyp4 *restrict thetai1,
-                          const __global numtyp4 *restrict thetai2,
-                          const __global numtyp4 *restrict thetai3,
-                          const __global int4 *restrict igrid,
-                          const __global numtyp4 *restrict grid,
-                          __global numtyp4 *restrict fdip_phi1,
-                          __global numtyp4 *restrict fdip_phi2,
-                          __global numtyp4 *restrict fdip_sum_phi,
+                          const __global numtyp *restrict thetai1,
+                          const __global numtyp *restrict thetai2,
+                          const __global numtyp *restrict thetai3,
+                          const __global int *restrict igrid,
+                          const __global numtyp *restrict grid,
+                          __global numtyp *restrict fdip_phi1,
+                          __global numtyp *restrict fdip_phi2,
+                          __global numtyp *restrict fdip_sum_phi,
                          const int bsorder, const int inum,
+                          const int nyzgrid, const int nygrid,
                          const int t_per_atom)
 {
  int tid, ii, offset, i, n_stride;
@ -1666,46 +1667,46 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,

    // extract the permanent multipole field at each site

-    tuv100_1 = 0.0;
-    tuv010_1 = 0.0;
-    tuv001_1 = 0.0;
-    tuv200_1 = 0.0;
-    tuv020_1 = 0.0;
-    tuv002_1 = 0.0;
-    tuv110_1 = 0.0;
-    tuv101_1 = 0.0;
-    tuv011_1 = 0.0;
-    tuv100_2 = 0.0;
-    tuv010_2 = 0.0;
-    tuv001_2 = 0.0;
-    tuv200_2 = 0.0;
-    tuv020_2 = 0.0;
-    tuv002_2 = 0.0;
-    tuv110_2 = 0.0;
-    tuv101_2 = 0.0;
-    tuv011_2 = 0.0;
-    tuv000 = 0.0;
-    tuv001 = 0.0;
-    tuv010 = 0.0;
-    tuv100 = 0.0;
-    tuv200 = 0.0;
-    tuv020 = 0.0;
-    tuv002 = 0.0;
-    tuv110 = 0.0;
-    tuv101 = 0.0;
-    tuv011 = 0.0;
-    tuv300 = 0.0;
-    tuv030 = 0.0;
-    tuv003 = 0.0;
-    tuv210 = 0.0;
-    tuv201 = 0.0;
-    tuv120 = 0.0;
-    tuv021 = 0.0;
-    tuv102 = 0.0;
-    tuv012 = 0.0;
-    tuv111 = 0.0;
+    tuv100_1 = (numtyp)0.0;
+    tuv010_1 = (numtyp)0.0;
+    tuv001_1 = (numtyp)0.0;
+    tuv200_1 = (numtyp)0.0;
+    tuv020_1 = (numtyp)0.0;
+    tuv002_1 = (numtyp)0.0;
+    tuv110_1 = (numtyp)0.0;
+    tuv101_1 = (numtyp)0.0;
+    tuv011_1 = (numtyp)0.0;
+    tuv100_2 = (numtyp)0.0;
+    tuv010_2 = (numtyp)0.0;
+    tuv001_2 = (numtyp)0.0;
+    tuv200_2 = (numtyp)0.0;
+    tuv020_2 = (numtyp)0.0;
+    tuv002_2 = (numtyp)0.0;
+    tuv110_2 = (numtyp)0.0;
+    tuv101_2 = (numtyp)0.0;
+    tuv011_2 = (numtyp)0.0;
+    tuv000 = (numtyp)0.0;
+    tuv001 = (numtyp)0.0;
+    tuv010 = (numtyp)0.0;
+    tuv100 = (numtyp)0.0;
+    tuv200 = (numtyp)0.0;
+    tuv020 = (numtyp)0.0;
+    tuv002 = (numtyp)0.0;
+    tuv110 = (numtyp)0.0;
+    tuv101 = (numtyp)0.0;
+    tuv011 = (numtyp)0.0;
+    tuv300 = (numtyp)0.0;
+    tuv030 = (numtyp)0.0;
+    tuv003 = (numtyp)0.0;
+    tuv210 = (numtyp)0.0;
+    tuv201 = (numtyp)0.0;
+    tuv120 = (numtyp)0.0;
+    tuv021 = (numtyp)0.0;
+    tuv102 = (numtyp)0.0;
+    tuv012 = (numtyp)0.0;
+    tuv111 = (numtyp)0.0;

-    k = igrid[i].z - nlpts;
+    k = igrid[3*i+2] - nlpts;
    for (int kb = 0; kb < bsorder; kb++) {
      /*
      v0 = thetai3[m][kb][0];
@ -1713,30 +1714,35 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
      v2 = thetai3[m][kb][2];
      v3 = thetai3[m][kb][3];
      */
-      tu00_1 = 0.0;
-      tu01_1 = 0.0;
-      tu10_1 = 0.0;
-      tu20_1 = 0.0;
-      tu11_1 = 0.0;
-      tu02_1 = 0.0;
-      tu00_2 = 0.0;
-      tu01_2 = 0.0;
-      tu10_2 = 0.0;
-      tu20_2 = 0.0;
-      tu11_2 = 0.0;
-      tu02_2 = 0.0;
-      tu00 = 0.0;
-      tu10 = 0.0;
-      tu01 = 0.0;
-      tu20 = 0.0;
-      tu11 = 0.0;
-      tu02 = 0.0;
-      tu30 = 0.0;
-      tu21 = 0.0;
-      tu12 = 0.0;
-      tu03 = 0.0;
+      int i3 = m*4*bsorder + 4*kb;
+      v0 = thetai3[i3];
+      v1 = thetai3[i3]+1;
+      v2 = thetai3[i3+2];
+      v3 = thetai3[i3+3];
+      tu00_1 = (numtyp)0.0;
+      tu01_1 = (numtyp)0.0;
+      tu10_1 = (numtyp)0.0;
+      tu20_1 = (numtyp)0.0;
+      tu11_1 = (numtyp)0.0;
+      tu02_1 = (numtyp)0.0;
+      tu00_2 = (numtyp)0.0;
+      tu01_2 = (numtyp)0.0;
+      tu10_2 = (numtyp)0.0;
+      tu20_2 = (numtyp)0.0;
+      tu11_2 = (numtyp)0.0;
+      tu02_2 = (numtyp)0.0;
+      tu00 = (numtyp)0.0;
+      tu10 = (numtyp)0.0;
+      tu01 = (numtyp)0.0;
+      tu20 = (numtyp)0.0;
+      tu11 = (numtyp)0.0;
+      tu02 = (numtyp)0.0;
+      tu30 = (numtyp)0.0;
+      tu21 = (numtyp)0.0;
+      tu12 = (numtyp)0.0;
+      tu03 = (numtyp)0.0;

-      j = igrid[i].y - nlpts;
+      j = igrid[3*i+1] - nlpts;
      for (int jb = 0; jb < bsorder; jb++) {
        /*
        u0 = thetai2[m][jb][0];
@ -1744,19 +1750,24 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
        u2 = thetai2[m][jb][2];
        u3 = thetai2[m][jb][3];
        */
-        t0_1 = 0.0;
-        t1_1 = 0.0;
-        t2_1 = 0.0;
-        t0_2 = 0.0;
-        t1_2 = 0.0;
-        t2_2 = 0.0;
-        t3 = 0.0;
+        int i2 = m*4*bsorder+4*jb;
+        u0 = thetai2[i2];
+        u1 = thetai2[i2+1];
+        u2 = thetai2[i2+2];
+        u3 = thetai2[i2+3];
+        t0_1 = (numtyp)0.0;
+        t1_1 = (numtyp)0.0;
+        t2_1 = (numtyp)0.0;
+        t0_2 = (numtyp)0.0;
+        t1_2 = (numtyp)0.0;
+        t2_2 = (numtyp)0.0;
+        t3 = (numtyp)0.0;

-        i = igrid[m].x - nlpts;
+        int ii = igrid[3*i] - nlpts;
        for (int ib = 0; ib < bsorder; ib++) {
          /*
-          tq_1 = grid[k][j][i][0];
-          tq_2 = grid[k][j][i][1];
+          tq_1 = grid[k][j][ii][0];
+          tq_2 = grid[k][j][ii][1];
          t0_1 += tq_1*thetai1[m][ib][0];
          t1_1 += tq_1*thetai1[m][ib][1];
          t2_1 += tq_1*thetai1[m][ib][2];
@ -1765,7 +1776,22 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
          t2_2 += tq_2*thetai1[m][ib][2];
          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
          */
-          i++;
+          int i1 = m*4*bsorder+4*ib;
+          numtyp w0 = thetai1[i1];
+          numtyp w1 = thetai1[i1+1];
+          numtyp w2 = thetai1[i1+2];
+          numtyp w3 = thetai1[i1+3];
+          int gidx = 2*(k*nyzgrid + j*nygrid + ii);
+          tq_1 = grid[gidx];
+          tq_2 = grid[gidx+1];
+          t0_1 += tq_1*w0;
+          t1_1 += tq_1*w1;
+          t2_1 += tq_1*w2;
+          t0_2 += tq_2*w0;
+          t1_2 += tq_2*w1;
+          t2_2 += tq_2*w2;
+          t3 += (tq_1+tq_2)*w3;
+          ii++;
        }

        tu00_1 += t0_1*u0;
@ -1836,6 +1862,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
      tuv111 += tu11*v1;
      k++;
    }
+
 /*
    fdip_phi1[m][0] = 0.0;
    fdip_phi1[m][1] = tuv100_1;
@ -1847,39 +1874,51 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
    fdip_phi1[m][7] = tuv110_1;
    fdip_phi1[m][8] = tuv101_1;
    fdip_phi1[m][9] = tuv011_1;
-
-    fdip_phi2[m][0] = 0.0;
-    fdip_phi2[m][1] = tuv100_2;
-    fdip_phi2[m][2] = tuv010_2;
-    fdip_phi2[m][3] = tuv001_2;
-    fdip_phi2[m][4] = tuv200_2;
-    fdip_phi2[m][5] = tuv020_2;
-    fdip_phi2[m][6] = tuv002_2;
-    fdip_phi2[m][7] = tuv110_2;
-    fdip_phi2[m][8] = tuv101_2;
-    fdip_phi2[m][9] = tuv011_2;
-
-    fdip_sum_phi[m][0] = tuv000;
-    fdip_sum_phi[m][1] = tuv100;
-    fdip_sum_phi[m][2] = tuv010;
-    fdip_sum_phi[m][3] = tuv001;
-    fdip_sum_phi[m][4] = tuv200;
-    fdip_sum_phi[m][5] = tuv020;
-    fdip_sum_phi[m][6] = tuv002;
-    fdip_sum_phi[m][7] = tuv110;
-    fdip_sum_phi[m][8] = tuv101;
-    fdip_sum_phi[m][9] = tuv011;
-    fdip_sum_phi[m][10] = tuv300;
-    fdip_sum_phi[m][11] = tuv030;
-    fdip_sum_phi[m][12] = tuv003;
-    fdip_sum_phi[m][13] = tuv210;
-    fdip_sum_phi[m][14] = tuv201;
-    fdip_sum_phi[m][15] = tuv120;
-    fdip_sum_phi[m][16] = tuv021;
-    fdip_sum_phi[m][17] = tuv102;
-    fdip_sum_phi[m][18] = tuv012;
-    fdip_sum_phi[m][19] = tuv111;
 */
+    int idx = 10*m;
+    fdip_phi1[idx+0] = (numtyp)0.0;
+    fdip_phi1[idx+1] = tuv100_1;
+    fdip_phi1[idx+2] = tuv010_1;
+    fdip_phi1[idx+3] = tuv001_1;
+    fdip_phi1[idx+4] = tuv200_1;
+    fdip_phi1[idx+5] = tuv020_1;
+    fdip_phi1[idx+6] = tuv002_1;
+    fdip_phi1[idx+7] = tuv110_1;
+    fdip_phi1[idx+8] = tuv101_1;
+    fdip_phi1[idx+9] = tuv011_1;
+
+    fdip_phi2[idx+0] = (numtyp)0.0;
+    fdip_phi2[idx+1] = tuv100_2;
+    fdip_phi2[idx+2] = tuv010_2;
+    fdip_phi2[idx+3] = tuv001_2;
+    fdip_phi2[idx+4] = tuv200_2;
+    fdip_phi2[idx+5] = tuv020_2;
+    fdip_phi2[idx+6] = tuv002_2;
+    fdip_phi2[idx+7] = tuv110_2;
+    fdip_phi2[idx+8] = tuv101_2;
+    fdip_phi2[idx+9] = tuv011_2;
+
+    idx = 20*m;
+    fdip_sum_phi[idx+0] = tuv000;
+    fdip_sum_phi[idx+1] = tuv100;
+    fdip_sum_phi[idx+2] = tuv010;
+    fdip_sum_phi[idx+3] = tuv001;
+    fdip_sum_phi[idx+4] = tuv200;
+    fdip_sum_phi[idx+5] = tuv020;
+    fdip_sum_phi[idx+6] = tuv002;
+    fdip_sum_phi[idx+7] = tuv110;
+    fdip_sum_phi[idx+8] = tuv101;
+    fdip_sum_phi[idx+9] = tuv011;
+    fdip_sum_phi[idx+10] = tuv300;
+    fdip_sum_phi[idx+11] = tuv030;
+    fdip_sum_phi[idx+12] = tuv003;
+    fdip_sum_phi[idx+13] = tuv210;
+    fdip_sum_phi[idx+14] = tuv201;
+    fdip_sum_phi[idx+15] = tuv120;
+    fdip_sum_phi[idx+16] = tuv021;
+    fdip_sum_phi[idx+17] = tuv102;
+    fdip_sum_phi[idx+18] = tuv012;
+    fdip_sum_phi[idx+19] = tuv111;
  }
 }

--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@ -37,6 +37,7 @@ BaseAmoebaT::~BaseAmoeba() {
  k_multipole.clear();
  k_udirect2b.clear();
  k_umutual2b.clear();
+  k_fphi_uind.clear();
  k_polar.clear();
  k_special15.clear();
  k_short_nbor.clear();
@ -182,6 +183,11 @@ void BaseAmoebaT::clear_atomic() {
  _thetai1.clear();
  _thetai2.clear();
  _thetai3.clear();
+  _igrid.clear();
+  _fdip_phi1.clear();
+  _fdip_phi2.clear();
+  _cgrid_brick.clear();
+  _fdip_sum_phi.clear();
  dev_nspecial15.clear();
  dev_special15.clear();
  dev_special15_t.clear();
@ -444,34 +450,70 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
 template <class numtyp, class acctyp>
 void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
                                    double **host_thetai1, double **host_thetai2,
-                                    double **host_thetai3, int** host_igrid) {
+                                    double **host_thetai3, int** host_igrid,
+                                    double* grid_brick_start, int nzlo_out,
+                                    int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out) {
  
  _bsorder = bsorder;

+  // allocate or resize per-atom arrays
+  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
+  //   will be consolidated once all terms are ready
+
  if (_max_thetai_size == 0) {
    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY);
+
+    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
+
  } else {
    if (inum_full>_max_thetai_size) {
      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
      _thetai1.resize(_max_thetai_size*bsorder*4);
      _thetai2.resize(_max_thetai_size*bsorder*4);
      _thetai3.resize(_max_thetai_size*bsorder*4);
-      _igrid.resize(_max_thetai_size);
+      _igrid.resize(_max_thetai_size*3);
+
+      _fdip_phi1.resize(_max_thetai_size*10);
+      _fdip_phi2.resize(_max_thetai_size*10);
+      _fdip_sum_phi.resize(_max_thetai_size*20);
    }
  }

-  memcpy(_thetai1.host.begin(),host_thetai1,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_thetai2.host.begin(),host_thetai2,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_thetai3.host.begin(),host_thetai3,inum_full*bsorder*4*sizeof(numtyp));
-  memcpy(_igrid.host.begin(),host_igrid,inum_full*4*sizeof(int));
-  _thetai1.update_device(inum_full*bsorder*4,true);
-  _thetai2.update_device(inum_full*bsorder*4,true);
-  _thetai3.update_device(inum_full*bsorder*4,true);
-  _igrid.update_device(inum_full,true);
+  UCL_H_Vec<double> dview;
+
+  // copy from host to device
+  
+  dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai1,dview,false);
+  dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai2,dview,false);
+  dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
+  ucl_copy(_thetai3,dview,false);
+
+  UCL_H_Vec<int> dview_int;
+  dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device));
+  ucl_copy(_igrid,dview_int,false);
+
+  _nzlo_out = nzlo_out;
+  _nzhi_out = nzhi_out;
+  _nylo_out = nylo_out;
+  _nyhi_out = nyhi_out;
+  _nxlo_out = nxlo_out;
+  _nxhi_out = nxhi_out;
+  _ngridz = nzhi_out - nzlo_out + 1;
+  _ngridy = nyhi_out - nylo_out + 1;
+  _ngridx = nxhi_out - nxlo_out + 1;
+  _num_grid_points = _ngridx*_ngridy*_ngridz*2;
+  dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
+  ucl_copy(_cgrid_brick,dview,false);
+
 }

 // ---------------------------------------------------------------------------
@ -593,18 +635,35 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
                                    double **host_thetai1, double **host_thetai2,
                                    double **host_thetai3, int** igrid,
-                                    double ****host_cgrid_brick, double **host_fdip_phi1,
-                                    double **host_fdip_phi2, double **host_fdip_sum_phi)
+                                    double *host_grid_brick_start, double **host_fdip_phi1,
+                                    double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                    int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out)
 {
-  // once allocation and transfers
-  precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid);
+  // allocation/resize and transfers (do this right after udirect?)
  
-  // resize grid if needed, then copy from host to device
-  // cgrid_brick.alloc()/resize()
-  // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0];
-  //
+  precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
+                    igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
+                    nxlo_out, nxhi_out);

-  const int red_bllocks = fphi_uind();
+  // update the cgrid_brick with data host
+  
+  _nzlo_out = nzlo_out;
+  _nzhi_out = nzhi_out;
+  _nylo_out = nylo_out;
+  _nyhi_out = nyhi_out;
+  _nxlo_out = nxlo_out;
+  _nxhi_out = nxhi_out;
+  _ngridz = nzhi_out - nzlo_out + 1;
+  _ngridy = nyhi_out - nylo_out + 1;
+  _ngridx = nxhi_out - nxlo_out + 1;
+  _num_grid_points = _ngridx*_ngridy*_ngridz*2;
+
+  UCL_H_Vec<double> dview;
+  dview.view(host_grid_brick_start,_num_grid_points,*(this->ucl_device));
+  ucl_copy(_cgrid_brick,dview,false);
+
+  const int red_blocks = fphi_uind();
 }


@ -814,6 +873,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
  k_udirect2b.set_function(*pair_program,kname_udirect2b);
  k_umutual2b.set_function(*pair_program,kname_umutual2b);
  k_polar.set_function(*pair_program,kname_polar);
+  k_fphi_uind.set_function(*pair_program,"kname_fphi_uind");
  k_short_nbor.set_function(*pair_program,kname_short_nbor);
  k_special15.set_function(*pair_program,kname_special15);
  pos_tex.get_texture(*pair_program,"pos_tex");
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@ -152,7 +152,10 @@ class BaseAmoeba {

  virtual void precompute_induce(const int inum_full, const int bsorder,
                                 double **host_thetai1, double **host_thetai2,
-                                 double **host_thetai3, int** igrid);
+                                 double **host_thetai3, int** igrid,
+                                 double* grid_brick_start, int nzlo_out,
+                                 int nzhi_out, int nylo_out, int nyhi_out,
+                                 int nxlo_out, int nxhi_out);

  /// Compute multipole real-space with device neighboring
  virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
@ -179,8 +182,10 @@ class BaseAmoeba {
  virtual void compute_fphi_uind(const int inum_full, const int bsorder,
                                 double **host_thetai1, double **host_thetai2,
                                 double **host_thetai3, int** igrid,
-                                 double ****host_grid, double **host_fdip_phi1,
-                                 double **host_fdip_phi2, double **host_fdip_sum_phi);
+                                 double *host_grid_brick_start, double **host_fdip_phi1,
+                                 double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                 int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                                 int nxlo_out, int nxhi_out);

  /// Compute polar real-space with device neighboring
  virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
@ -249,9 +254,12 @@ class BaseAmoeba {
  int _nmax, _max_tep_size, _max_fieldp_size;

  int _bsorder;
-  UCL_Vector<numtyp,numtyp> _thetai1, _thetai2, _thetai3;
-  UCL_Vector<int4,int4> _igrid;
+  UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3, _cgrid_brick;
+  UCL_D_Vec<int> _igrid;
+  UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
  int _max_thetai_size;
+  int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
+  int _ngridx, _ngridy, _ngridz, _num_grid_points;

  // ------------------------ FORCE/ENERGY DATA -----------------------

@ -272,7 +280,7 @@ class BaseAmoeba {

  // ------------------------- DEVICE KERNELS -------------------------
  UCL_Program *pair_program;
-  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar;
+  UCL_Kernel k_multipole, k_udirect2b, k_umutual2b, k_polar, k_fphi_uind;
  UCL_Kernel k_special15, k_short_nbor;
  inline int block_size() { return _block_size; }
  inline void set_kernel(const int eflag, const int vflag) {}
@ -305,7 +313,6 @@ class BaseAmoeba {
  virtual int umutual2b(const int eflag, const int vflag) = 0;
  virtual int fphi_uind() = 0;
  virtual int polar_real(const int eflag, const int vflag) = 0;
-
  

  #if !defined(USE_OPENCL) && !defined(USE_HIP)