From c5c3c697dfd80c445fc3c310f86b220354bff6a0 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Mon, 29 Aug 2022 00:13:30 -0500
Subject: [PATCH] Adding fphi_uind kernel, working on the arrays allocation

---
 lib/gpu/Nvidia.makefile     |   2 +-
 lib/gpu/lal_amoeba.cu       | 268 ++++++++++++++++++++++++++++++++++++
 lib/gpu/lal_base_amoeba.cpp |  13 +-
 lib/gpu/lal_base_amoeba.h   |   2 +-
 4 files changed, 279 insertions(+), 6 deletions(-)

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index 768daff53a..5f50486e28 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -62,7 +62,7 @@ $(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
 $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
 	$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
 
-$(OBJ_DIR)/%_cubin.h: lal_%.cu  $(PRE1_H)
+$(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
 	$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
 	$(BIN2C) -c -n $* $(OBJ_DIR)/$*.cubin > $@
 
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 1b2900f97f..1239764108 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1615,6 +1615,274 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
      offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
 }
 
+/* ----------------------------------------------------------------------
+   fphi_uind = induced potential from grid
+   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+------------------------------------------------------------------------- */
+
+__kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict thetai1,
+                          const __global numtyp4 *restrict thetai2,
+                          const __global numtyp4 *restrict thetai3,
+                          const __global int4 *restrict igrid,
+                          const __global numtyp4 *restrict grid,
+                          __global numtyp4 *restrict fdip_phi1,
+                          __global numtyp4 *restrict fdip_phi2,
+                          __global numtyp4 *restrict fdip_sum_phi,
+                          const int bsorder, const int inum,
+                          const int t_per_atom)
+{
+  int tid, ii, offset, i, n_stride;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  if (ii<inum) {
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+
+    int j,k,m;
+    numtyp v0,v1,v2,v3;
+    numtyp u0,u1,u2,u3;
+    numtyp t0,t1,t2,t3;
+    numtyp t0_1,t0_2,t1_1,t1_2;
+    numtyp t2_1,t2_2,tq_1,tq_2;
+    numtyp tu00,tu10,tu01,tu20,tu11;
+    numtyp tu02,tu30,tu21,tu12,tu03;
+    numtyp tu00_1,tu01_1,tu10_1;
+    numtyp tu00_2,tu01_2,tu10_2;
+    numtyp tu20_1,tu11_1,tu02_1;
+    numtyp tu20_2,tu11_2,tu02_2;
+    numtyp tuv100_1,tuv010_1,tuv001_1;
+    numtyp tuv100_2,tuv010_2,tuv001_2;
+    numtyp tuv200_1,tuv020_1,tuv002_1;
+    numtyp tuv110_1,tuv101_1,tuv011_1;
+    numtyp tuv200_2,tuv020_2,tuv002_2;
+    numtyp tuv110_2,tuv101_2,tuv011_2;
+    numtyp tuv000,tuv100,tuv010,tuv001;
+    numtyp tuv200,tuv020,tuv002,tuv110;
+    numtyp tuv101,tuv011,tuv300,tuv030;
+    numtyp tuv003,tuv210,tuv201,tuv120;
+    numtyp tuv021,tuv102,tuv012,tuv111;
+
+    int nlpts = (bsorder-1) / 2;
+
+    // extract the permanent multipole field at each site
+
+    tuv100_1 = 0.0;
+    tuv010_1 = 0.0;
+    tuv001_1 = 0.0;
+    tuv200_1 = 0.0;
+    tuv020_1 = 0.0;
+    tuv002_1 = 0.0;
+    tuv110_1 = 0.0;
+    tuv101_1 = 0.0;
+    tuv011_1 = 0.0;
+    tuv100_2 = 0.0;
+    tuv010_2 = 0.0;
+    tuv001_2 = 0.0;
+    tuv200_2 = 0.0;
+    tuv020_2 = 0.0;
+    tuv002_2 = 0.0;
+    tuv110_2 = 0.0;
+    tuv101_2 = 0.0;
+    tuv011_2 = 0.0;
+    tuv000 = 0.0;
+    tuv001 = 0.0;
+    tuv010 = 0.0;
+    tuv100 = 0.0;
+    tuv200 = 0.0;
+    tuv020 = 0.0;
+    tuv002 = 0.0;
+    tuv110 = 0.0;
+    tuv101 = 0.0;
+    tuv011 = 0.0;
+    tuv300 = 0.0;
+    tuv030 = 0.0;
+    tuv003 = 0.0;
+    tuv210 = 0.0;
+    tuv201 = 0.0;
+    tuv120 = 0.0;
+    tuv021 = 0.0;
+    tuv102 = 0.0;
+    tuv012 = 0.0;
+    tuv111 = 0.0;
+
+    k = igrid[i].z - nlpts;
+    for (int kb = 0; kb < bsorder; kb++) {
+      /*
+      v0 = thetai3[m][kb][0];
+      v1 = thetai3[m][kb][1];
+      v2 = thetai3[m][kb][2];
+      v3 = thetai3[m][kb][3];
+      */
+      tu00_1 = 0.0;
+      tu01_1 = 0.0;
+      tu10_1 = 0.0;
+      tu20_1 = 0.0;
+      tu11_1 = 0.0;
+      tu02_1 = 0.0;
+      tu00_2 = 0.0;
+      tu01_2 = 0.0;
+      tu10_2 = 0.0;
+      tu20_2 = 0.0;
+      tu11_2 = 0.0;
+      tu02_2 = 0.0;
+      tu00 = 0.0;
+      tu10 = 0.0;
+      tu01 = 0.0;
+      tu20 = 0.0;
+      tu11 = 0.0;
+      tu02 = 0.0;
+      tu30 = 0.0;
+      tu21 = 0.0;
+      tu12 = 0.0;
+      tu03 = 0.0;
+
+      j = igrid[i].y - nlpts;
+      for (int jb = 0; jb < bsorder; jb++) {
+        /*
+        u0 = thetai2[m][jb][0];
+        u1 = thetai2[m][jb][1];
+        u2 = thetai2[m][jb][2];
+        u3 = thetai2[m][jb][3];
+        */
+        t0_1 = 0.0;
+        t1_1 = 0.0;
+        t2_1 = 0.0;
+        t0_2 = 0.0;
+        t1_2 = 0.0;
+        t2_2 = 0.0;
+        t3 = 0.0;
+
+        i = igrid[m].x - nlpts;
+        for (int ib = 0; ib < bsorder; ib++) {
+          /*
+          tq_1 = grid[k][j][i][0];
+          tq_2 = grid[k][j][i][1];
+          t0_1 += tq_1*thetai1[m][ib][0];
+          t1_1 += tq_1*thetai1[m][ib][1];
+          t2_1 += tq_1*thetai1[m][ib][2];
+          t0_2 += tq_2*thetai1[m][ib][0];
+          t1_2 += tq_2*thetai1[m][ib][1];
+          t2_2 += tq_2*thetai1[m][ib][2];
+          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
+          */
+          i++;
+        }
+
+        tu00_1 += t0_1*u0;
+        tu10_1 += t1_1*u0;
+        tu01_1 += t0_1*u1;
+        tu20_1 += t2_1*u0;
+        tu11_1 += t1_1*u1;
+        tu02_1 += t0_1*u2;
+        tu00_2 += t0_2*u0;
+        tu10_2 += t1_2*u0;
+        tu01_2 += t0_2*u1;
+        tu20_2 += t2_2*u0;
+        tu11_2 += t1_2*u1;
+        tu02_2 += t0_2*u2;
+        t0 = t0_1 + t0_2;
+        t1 = t1_1 + t1_2;
+        t2 = t2_1 + t2_2;
+        tu00 += t0*u0;
+        tu10 += t1*u0;
+        tu01 += t0*u1;
+        tu20 += t2*u0;
+        tu11 += t1*u1;
+        tu02 += t0*u2;
+        tu30 += t3*u0;
+        tu21 += t2*u1;
+        tu12 += t1*u2;
+        tu03 += t0*u3;
+        j++;
+      }
+
+      tuv100_1 += tu10_1*v0;
+      tuv010_1 += tu01_1*v0;
+      tuv001_1 += tu00_1*v1;
+      tuv200_1 += tu20_1*v0;
+      tuv020_1 += tu02_1*v0;
+      tuv002_1 += tu00_1*v2;
+      tuv110_1 += tu11_1*v0;
+      tuv101_1 += tu10_1*v1;
+      tuv011_1 += tu01_1*v1;
+      tuv100_2 += tu10_2*v0;
+      tuv010_2 += tu01_2*v0;
+      tuv001_2 += tu00_2*v1;
+      tuv200_2 += tu20_2*v0;
+      tuv020_2 += tu02_2*v0;
+      tuv002_2 += tu00_2*v2;
+      tuv110_2 += tu11_2*v0;
+      tuv101_2 += tu10_2*v1;
+      tuv011_2 += tu01_2*v1;
+      tuv000 += tu00*v0;
+      tuv100 += tu10*v0;
+      tuv010 += tu01*v0;
+      tuv001 += tu00*v1;
+      tuv200 += tu20*v0;
+      tuv020 += tu02*v0;
+      tuv002 += tu00*v2;
+      tuv110 += tu11*v0;
+      tuv101 += tu10*v1;
+      tuv011 += tu01*v1;
+      tuv300 += tu30*v0;
+      tuv030 += tu03*v0;
+      tuv003 += tu00*v3;
+      tuv210 += tu21*v0;
+      tuv201 += tu20*v1;
+      tuv120 += tu12*v0;
+      tuv021 += tu02*v1;
+      tuv102 += tu10*v2;
+      tuv012 += tu01*v2;
+      tuv111 += tu11*v1;
+      k++;
+    }
+/*
+    fdip_phi1[m][0] = 0.0;
+    fdip_phi1[m][1] = tuv100_1;
+    fdip_phi1[m][2] = tuv010_1;
+    fdip_phi1[m][3] = tuv001_1;
+    fdip_phi1[m][4] = tuv200_1;
+    fdip_phi1[m][5] = tuv020_1;
+    fdip_phi1[m][6] = tuv002_1;
+    fdip_phi1[m][7] = tuv110_1;
+    fdip_phi1[m][8] = tuv101_1;
+    fdip_phi1[m][9] = tuv011_1;
+
+    fdip_phi2[m][0] = 0.0;
+    fdip_phi2[m][1] = tuv100_2;
+    fdip_phi2[m][2] = tuv010_2;
+    fdip_phi2[m][3] = tuv001_2;
+    fdip_phi2[m][4] = tuv200_2;
+    fdip_phi2[m][5] = tuv020_2;
+    fdip_phi2[m][6] = tuv002_2;
+    fdip_phi2[m][7] = tuv110_2;
+    fdip_phi2[m][8] = tuv101_2;
+    fdip_phi2[m][9] = tuv011_2;
+
+    fdip_sum_phi[m][0] = tuv000;
+    fdip_sum_phi[m][1] = tuv100;
+    fdip_sum_phi[m][2] = tuv010;
+    fdip_sum_phi[m][3] = tuv001;
+    fdip_sum_phi[m][4] = tuv200;
+    fdip_sum_phi[m][5] = tuv020;
+    fdip_sum_phi[m][6] = tuv002;
+    fdip_sum_phi[m][7] = tuv110;
+    fdip_sum_phi[m][8] = tuv101;
+    fdip_sum_phi[m][9] = tuv011;
+    fdip_sum_phi[m][10] = tuv300;
+    fdip_sum_phi[m][11] = tuv030;
+    fdip_sum_phi[m][12] = tuv003;
+    fdip_sum_phi[m][13] = tuv210;
+    fdip_sum_phi[m][14] = tuv201;
+    fdip_sum_phi[m][15] = tuv120;
+    fdip_sum_phi[m][16] = tuv021;
+    fdip_sum_phi[m][17] = tuv102;
+    fdip_sum_phi[m][18] = tuv012;
+    fdip_sum_phi[m][19] = tuv111;
+*/
+  }
+}
+
 /* ----------------------------------------------------------------------
    scan standard neighbor list and make it compatible with 1-5 neighbors
    if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 88a2c87166..c18b10675b 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -453,14 +453,14 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
     _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
     _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
     _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
-    _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+    _igrid.alloc(_max_thetai_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
   } else {
     if (inum_full>_max_thetai_size) {
       _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
       _thetai1.resize(_max_thetai_size*bsorder*4);
       _thetai2.resize(_max_thetai_size*bsorder*4);
       _thetai3.resize(_max_thetai_size*bsorder*4);
-      _igrid.resize(_max_thetai_size*4);
+      _igrid.resize(_max_thetai_size);
     }
   }
 
@@ -471,7 +471,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   _thetai1.update_device(inum_full*bsorder*4,true);
   _thetai2.update_device(inum_full*bsorder*4,true);
   _thetai3.update_device(inum_full*bsorder*4,true);
-  _igrid.update_device(inum_full*4,true);
+  _igrid.update_device(inum_full,true);
 }
 
 // ---------------------------------------------------------------------------
@@ -593,12 +593,17 @@ template <class numtyp, class acctyp>
 void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
                                     double **host_thetai1, double **host_thetai2,
                                     double **host_thetai3, int** igrid,
-                                    double ****host_grid, double **host_fdip_phi1,
+                                    double ****host_cgrid_brick, double **host_fdip_phi1,
                                     double **host_fdip_phi2, double **host_fdip_sum_phi)
 {
   // once allocation and transfers
   precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3, igrid);
   
+  // resize grid if needed, then copy from host to device
+  // cgrid_brick.alloc()/resize()
+  // cgrid_brick.begin() = host_cgrid_brick[0][0][0][0];
+  //
+
   const int red_bllocks = fphi_uind();
 }
 
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 68c3470977..f333bdf9a6 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -250,7 +250,7 @@ class BaseAmoeba {
 
   int _bsorder;
   UCL_Vector<numtyp,numtyp> _thetai1, _thetai2, _thetai3;
-  UCL_Vector<int,int> _igrid;
+  UCL_Vector<int4,int4> _igrid;
   int _max_thetai_size;
 
   // ------------------------ FORCE/ENERGY DATA -----------------------