diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index d67fa4f869..53a9f6aa3e 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1621,9 +1621,9 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
 ------------------------------------------------------------------------- */
 
 __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
-                          const __global numtyp *restrict thetai1,
-                          const __global numtyp *restrict thetai2,
-                          const __global numtyp *restrict thetai3,
+                          const __global numtyp4 *restrict thetai1,
+                          const __global numtyp4 *restrict thetai2,
+                          const __global numtyp4 *restrict thetai3,
                           const __global int *restrict igrid,
                           const __global numtyp *restrict grid,
                           __global numtyp *restrict fdip_phi1,
@@ -1698,11 +1698,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       v2 = thetai3[m][kb][2];
       v3 = thetai3[m][kb][3];
       */
-      int i3 = ii*4*bsorder + 4*kb;
-      numtyp v0 = thetai3[i3];
-      numtyp v1 = thetai3[i3+1];
-      numtyp v2 = thetai3[i3+2];
-      numtyp v3 = thetai3[i3+3];
+      int i3 = ii*bsorder + kb;
+      numtyp4 tha3 = thetai3[i3];
+      numtyp v0 = tha3.x;
+      numtyp v1 = tha3.y;
+      numtyp v2 = tha3.z;
+      numtyp v3 = tha3.w;
       numtyp tu00_1 = (numtyp)0.0;
       numtyp tu01_1 = (numtyp)0.0;
       numtyp tu10_1 = (numtyp)0.0;
@@ -1734,11 +1735,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
         u2 = thetai2[m][jb][2];
         u3 = thetai2[m][jb][3];
         */
-        int i2 = ii*4*bsorder+4*jb;
-        numtyp u0 = thetai2[i2];
-        numtyp u1 = thetai2[i2+1];
-        numtyp u2 = thetai2[i2+2];
-        numtyp u3 = thetai2[i2+3];
+        int i2 = ii*bsorder+jb;
+        numtyp4 tha2 = thetai2[i2];
+        numtyp u0 = tha2.x;
+        numtyp u1 = tha2.y;
+        numtyp u2 = tha2.z;
+        numtyp u3 = tha2.w;
         numtyp t0_1 = (numtyp)0.0;
         numtyp t1_1 = (numtyp)0.0;
         numtyp t2_1 = (numtyp)0.0;
@@ -1760,11 +1762,12 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
           t2_2 += tq_2*thetai1[m][ib][2];
           t3 += (tq_1+tq_2)*thetai1[m][ib][3];
           */
-          int i1 = ii*4*bsorder+4*ib;
-          numtyp w0 = thetai1[i1];
-          numtyp w1 = thetai1[i1+1];
-          numtyp w2 = thetai1[i1+2];
-          numtyp w3 = thetai1[i1+3];
+          int i1 = ii*bsorder+ib;
+          numtyp4 tha1 = thetai1[i1];
+          numtyp w0 = tha1.x;
+          numtyp w1 = tha1.y;
+          numtyp w2 = tha1.z;
+          numtyp w3 = tha1.w;
           int gidx = 2*(k*ngridxy + j*ngridx + i);
           numtyp tq_1 = grid[gidx];
           numtyp tq_2 = grid[gidx+1];
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 5989ba889d..3e14159d5a 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -189,8 +189,6 @@ void BaseAmoebaT::clear_atomic() {
   _fdip_sum_phi.clear();
   _cgrid_brick.clear();
 
-  hview.clear();
-
   dev_nspecial15.clear();
   dev_special15.clear();
   dev_special15_t.clear();
@@ -578,30 +576,25 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
 
   if (_max_thetai_size == 0) {
     _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai1.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai2.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai3.alloc(_max_thetai_size*bsorder,*(this->ucl_device),UCL_READ_ONLY);
     _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
 
     _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
     _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_READ_WRITE);
     _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_READ_WRITE);
-
-    hview.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device));
-
   } else {
     if (inum_full>_max_thetai_size) {
       _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-      _thetai1.resize(_max_thetai_size*bsorder*4);
-      _thetai2.resize(_max_thetai_size*bsorder*4);
-      _thetai3.resize(_max_thetai_size*bsorder*4);
+      _thetai1.resize(_max_thetai_size*bsorder);
+      _thetai2.resize(_max_thetai_size*bsorder);
+      _thetai3.resize(_max_thetai_size*bsorder);
       _igrid.resize(_max_thetai_size*4);
 
       _fdip_phi1.resize(_max_thetai_size*10);
       _fdip_phi2.resize(_max_thetai_size*10);
       _fdip_sum_phi.resize(_max_thetai_size*20);
-
-      hview.resize(_max_thetai_size*bsorder*4);
     }
   }
 
@@ -609,44 +602,47 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
 
   for (int i = 0; i < inum_full; i++)
     for (int j = 0; j < bsorder; j++) {
-      int idx = i*4*bsorder + 4*j;
-      hview[idx+0] = host_thetai1[i][j][0];
-      hview[idx+1] = host_thetai1[i][j][1];
-      hview[idx+2] = host_thetai1[i][j][2];
-      hview[idx+3] = host_thetai1[i][j][3];
+      int idx = i*bsorder + j;
+      numtyp4 v;
+      v.x = host_thetai1[i][j][0];
+      v.y = host_thetai1[i][j][1];
+      v.z = host_thetai1[i][j][2];
+      v.w = host_thetai1[i][j][3];
+      _thetai1[idx] = v;
     }
-  ucl_copy(_thetai1,hview,false);
+  _thetai1.update_device(true);
 
   for (int i = 0; i < inum_full; i++)
     for (int j = 0; j < bsorder; j++) {
-      int idx = i*4*bsorder + 4*j;
-      hview[idx+0] = host_thetai2[i][j][0];
-      hview[idx+1] = host_thetai2[i][j][1];
-      hview[idx+2] = host_thetai2[i][j][2];
-      hview[idx+3] = host_thetai2[i][j][3];
+      int idx = i*bsorder + j;
+      numtyp4 v;
+      v.x = host_thetai2[i][j][0];
+      v.y = host_thetai2[i][j][1];
+      v.z = host_thetai2[i][j][2];
+      v.w = host_thetai2[i][j][3];
+      _thetai2[idx] = v;
     }
-  ucl_copy(_thetai2,hview,false);
+  _thetai2.update_device(true);
 
   for (int i = 0; i < inum_full; i++)
     for (int j = 0; j < bsorder; j++) {
-      int idx = i*4*bsorder + 4*j;
-      hview[idx+0] = host_thetai3[i][j][0];
-      hview[idx+1] = host_thetai3[i][j][1];
-      hview[idx+2] = host_thetai3[i][j][2];
-      hview[idx+3] = host_thetai3[i][j][3];
+      int idx = i*bsorder + j;
+      numtyp4 v;
+      v.x = host_thetai3[i][j][0];
+      v.y = host_thetai3[i][j][1];
+      v.z = host_thetai3[i][j][2];
+      v.w = host_thetai3[i][j][3];
+      _thetai3[idx] = v;
     }
-  ucl_copy(_thetai3,hview,false);
+  _thetai3.update_device(true);
 
-  //UCL_H_Vec<int> dview_int;
-  //dview_int.alloc(_max_thetai_size*4, *(this->ucl_device));
   for (int i = 0; i < inum_full; i++) {
     int idx = i*4;
     _igrid[idx+0] = host_igrid[i][0];
     _igrid[idx+1] = host_igrid[i][1];
     _igrid[idx+2] = host_igrid[i][2];
   }
-  //ucl_copy(_igrid, dview_int, false);
-  _igrid.update_device(false);
+  _igrid.update_device(true);
 
   _nzlo_out = nzlo_out;
   _nzhi_out = nzhi_out;
@@ -694,7 +690,7 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
     first_iteration = false;
   }
 
-  // TODO: find out why this host alloc helps makes the cgrid_brick update_device() work correcly
+  // TODO: find out why this host alloc helps the cgrid_brick update_device() work correcly
   UCL_H_Vec<numtyp> hdummy;
   hdummy.alloc(1, *(this->ucl_device), UCL_READ_ONLY);
 
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 760d0e3005..802b6962b7 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -256,7 +256,7 @@ class BaseAmoeba {
   int _nmax, _max_tep_size, _max_fieldp_size;
 
   int _bsorder;
-  UCL_D_Vec<numtyp> _thetai1, _thetai2, _thetai3;
+  UCL_Vector<numtyp4,numtyp4> _thetai1, _thetai2, _thetai3;
   UCL_Vector<int,int> _igrid;
   UCL_Vector<numtyp,numtyp> _cgrid_brick;
   UCL_Vector<numtyp,numtyp> _fdip_phi1, _fdip_phi2, _fdip_sum_phi;
@@ -264,9 +264,6 @@ class BaseAmoeba {
   int _nzlo_out, _nzhi_out, _nylo_out, _nyhi_out, _nxlo_out, _nxhi_out;
   int _ngridx, _ngridy, _ngridz, _num_grid_points;
 
-  /// buffer
-  UCL_H_Vec<double> hview;
-
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
   Answer<numtyp,acctyp> *ans;