From 62ecf98cda4d1bd970b7bf1b5e8f1a09c388d009 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Fri, 16 Sep 2022 14:47:16 -0500
Subject: [PATCH] Enabled fphi_uind in hippo/gpu, really need to refactor hippo
 and amoeba in the GPU lib to remove kernel duplicates

---
 lib/gpu/Nvidia.makefile     |  26 +--
 lib/gpu/lal_amoeba.cpp      |   3 +-
 lib/gpu/lal_amoeba.cu       |   2 +-
 lib/gpu/lal_base_amoeba.cpp |  24 +--
 lib/gpu/lal_base_amoeba.h   |  10 +-
 lib/gpu/lal_hippo.cpp       |   3 +-
 lib/gpu/lal_hippo.cu        | 301 ++++++++++++++++++++++++++++++++++
 lib/gpu/lal_hippo_ext.cpp   |  14 ++
 src/GPU/pair_amoeba_gpu.cpp |   2 +-
 src/GPU/pair_hippo_gpu.cpp  | 311 ++++++++++++++++++++++++++++++++----
 src/GPU/pair_hippo_gpu.h    |   6 +
 11 files changed, 626 insertions(+), 76 deletions(-)

diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index c52246b06b..5f50486e28 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -68,31 +68,7 @@ $(OBJ_DIR)/%_cubin.h: lal_%.cu $(PRE1_H)
 
 # host code compilation
 
-$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
-	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
-	$(CUDR) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
-	$(CUDR) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
-	$(CUDR) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
-	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
-	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
-	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
-
-$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
-	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
-	
-$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cubin.h $(HOST_H)
+$(OBJ_DIR)/lal_%.o: lal_%.cpp $(CUHS) $(HOST_H)
 	$(CUDR) -o $@ -c $< -I$(OBJ_DIR)
 
 #ifdef CUDPP_OPT
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 48316e9b6e..02870ea861 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -64,7 +64,8 @@ int AmoebaT::init(const int ntypes, const int max_amtype, const int max_amclass,
                             cell_size,gpu_split,_screen,amoeba,
                             "k_amoeba_multipole", "k_amoeba_udirect2b",
                             "k_amoeba_umutual2b", "k_amoeba_polar",
-                            "k_amoeba_short_nbor", "k_amoeba_special15");
+                            "k_amoeba_fphi_uind", "k_amoeba_short_nbor",
+                            "k_amoeba_special15");
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index d391279f5d..66926721cb 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1626,7 +1626,7 @@ __kernel void k_amoeba_polar(const __global numtyp4 *restrict x_,
    fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
 ------------------------------------------------------------------------- */
 
-__kernel void k_fphi_uind(const __global numtyp4 *restrict thetai1,
+__kernel void k_amoeba_fphi_uind(const __global numtyp4 *restrict thetai1,
                           const __global numtyp4 *restrict thetai2,
                           const __global numtyp4 *restrict thetai3,
                           const __global int *restrict igrid,
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 3ee0517dfb..eac704fbfc 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -65,6 +65,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
                              const char *k_name_udirect2b,
                              const char *k_name_umutual2b,
                              const char *k_name_polar,
+                             const char *k_name_fphi_uind,
                              const char *k_name_short_nbor,
                              const char* k_name_special15) {
   screen=_screen;
@@ -100,7 +101,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   _block_bio_size=device->block_bio_pair();
   compile_kernels(*ucl_device,pair_program,k_name_multipole,
                   k_name_udirect2b, k_name_umutual2b,k_name_polar,
-                  k_name_short_nbor, k_name_special15);
+                  k_name_fphi_uind, k_name_short_nbor, k_name_special15);
 
   if (_threads_per_atom>1 && gpu_nbor==0) {
     nbor->packing(true);
@@ -934,6 +935,7 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                   const char *kname_udirect2b,
                                   const char *kname_umutual2b,
                                   const char *kname_polar,
+                                  const char *kname_fphi_uind,
                                   const char *kname_short_nbor,
                                   const char* kname_special15) {
   if (_compiled)
@@ -942,17 +944,17 @@ void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
   if (pair_program) delete pair_program;
   pair_program=new UCL_Program(dev);
   std::string oclstring = device->compile_string()+" -DEVFLAG=1";
-  pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
+  pair_program->load_string(pair_str, oclstring.c_str(),nullptr, screen);
 
-  k_multipole.set_function(*pair_program,kname_multipole);
-  k_udirect2b.set_function(*pair_program,kname_udirect2b);
-  k_umutual2b.set_function(*pair_program,kname_umutual2b);
-  k_polar.set_function(*pair_program,kname_polar);
-  k_fphi_uind.set_function(*pair_program,"k_fphi_uind");
-  k_short_nbor.set_function(*pair_program,kname_short_nbor);
-  k_special15.set_function(*pair_program,kname_special15);
-  pos_tex.get_texture(*pair_program,"pos_tex");
-  q_tex.get_texture(*pair_program,"q_tex");
+  k_multipole.set_function(*pair_program, kname_multipole);
+  k_udirect2b.set_function(*pair_program, kname_udirect2b);
+  k_umutual2b.set_function(*pair_program, kname_umutual2b);
+  k_polar.set_function(*pair_program, kname_polar);
+  k_fphi_uind.set_function(*pair_program, kname_fphi_uind);
+  k_short_nbor.set_function(*pair_program, kname_short_nbor);
+  k_special15.set_function(*pair_program, kname_special15);
+  pos_tex.get_texture(*pair_program, "pos_tex");
+  q_tex.get_texture(*pair_program, "q_tex");
 
   _compiled=true;
 
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 802b6962b7..5aeb729993 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -62,9 +62,10 @@ class BaseAmoeba {
   int init_atomic(const int nlocal, const int nall, const int max_nbors,
                   const int maxspecial, const int maxspecial15, const double cell_size,
                   const double gpu_split, FILE *screen, const void *pair_program,
-                  const char *kname_multipole,
-                  const char *kname_udirect2b, const char *kname_umutual2b,
-                  const char *kname_polar, const char *kname_short_nbor, const char* kname_special15);
+                  const char *kname_multipole, const char *kname_udirect2b,
+                  const char *kname_umutual2b, const char *kname_polar,
+                  const char *kname_fphi_uind, const char *kname_short_nbor,
+                  const char* kname_special15);
 
   /// Estimate the overhead for GPU context changes and CPU driver
   void estimate_gpu_overhead(const int add_kernels=0);
@@ -309,7 +310,8 @@ class BaseAmoeba {
   void compile_kernels(UCL_Device &dev, const void *pair_string,
      const char *kname_multipole, const char *kname_udirect2b,
      const char *kname_umutual2b, const char *kname_polar,
-     const char *kname_short_nbor, const char* kname_special15);
+     const char *kname_fphi_uind, const char *kname_short_nbor,
+     const char* kname_special15);
 
   virtual int multipole_real(const int eflag, const int vflag) = 0;
   virtual int udirect2b(const int eflag, const int vflag) = 0;
diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp
index 79a8772c3e..9917ab91a2 100644
--- a/lib/gpu/lal_hippo.cpp
+++ b/lib/gpu/lal_hippo.cpp
@@ -67,7 +67,8 @@ int HippoT::init(const int ntypes, const int max_amtype, const int max_amclass,
                             cell_size,gpu_split,_screen,hippo,
                             "k_hippo_multipole", "k_hippo_udirect2b",
                             "k_hippo_umutual2b", "k_hippo_polar",
-                            "k_hippo_short_nbor", "k_hippo_special15");
+                            "k_hippo_fphi_uind", "k_hippo_short_nbor",
+                            "k_hippo_special15");
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_hippo.cu b/lib/gpu/lal_hippo.cu
index be8d2c0701..dde8f9bfd5 100644
--- a/lib/gpu/lal_hippo.cu
+++ b/lib/gpu/lal_hippo.cu
@@ -2045,6 +2045,307 @@ __kernel void k_hippo_polar(const __global numtyp4 *restrict x_,
      offset,eflag,vflag,ans,engv,NUM_BLOCKS_X);
 }
 
+/* ----------------------------------------------------------------------
+   fphi_uind = induced potential from grid
+   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+------------------------------------------------------------------------- */
+
+__kernel void k_hippo_fphi_uind(const __global numtyp4 *restrict thetai1,
+                          const __global numtyp4 *restrict thetai2,
+                          const __global numtyp4 *restrict thetai3,
+                          const __global int *restrict igrid,
+                          const __global numtyp *restrict grid,
+                          __global numtyp *restrict fdip_phi1,
+                          __global numtyp *restrict fdip_phi2,
+                          __global numtyp *restrict fdip_sum_phi,
+                          const int bsorder, const int inum,
+                          const int nzlo_out, const int nylo_out,
+                          const int nxlo_out, const int ngridxy,
+                          const int ngridx)
+{
+  //int tid, ii, offset, i, n_stride;
+  //atom_info(t_per_atom,ii,tid,offset);
+  
+
+  int tid=THREAD_ID_X;
+  int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
+
+  if (ii<inum) {
+
+    int nlpts = (bsorder-1) / 2;
+    
+    int istart = fast_mul(ii,4);
+    int igridx = igrid[istart];
+    int igridy = igrid[istart+1];
+    int igridz = igrid[istart+2];
+    
+    // now istart is used to index thetai1, thetai2 and thetai3
+    istart = fast_mul(ii,bsorder);
+
+    // extract the permanent multipole field at each site
+
+    numtyp tuv100_1 = (numtyp)0.0;
+    numtyp tuv010_1 = (numtyp)0.0;
+    numtyp tuv001_1 = (numtyp)0.0;
+    numtyp tuv200_1 = (numtyp)0.0;
+    numtyp tuv020_1 = (numtyp)0.0;
+    numtyp tuv002_1 = (numtyp)0.0;
+    numtyp tuv110_1 = (numtyp)0.0;
+    numtyp tuv101_1 = (numtyp)0.0;
+    numtyp tuv011_1 = (numtyp)0.0;
+    numtyp tuv100_2 = (numtyp)0.0;
+    numtyp tuv010_2 = (numtyp)0.0;
+    numtyp tuv001_2 = (numtyp)0.0;
+    numtyp tuv200_2 = (numtyp)0.0;
+    numtyp tuv020_2 = (numtyp)0.0;
+    numtyp tuv002_2 = (numtyp)0.0;
+    numtyp tuv110_2 = (numtyp)0.0;
+    numtyp tuv101_2 = (numtyp)0.0;
+    numtyp tuv011_2 = (numtyp)0.0;
+    numtyp tuv000 = (numtyp)0.0;
+    numtyp tuv001 = (numtyp)0.0;
+    numtyp tuv010 = (numtyp)0.0;
+    numtyp tuv100 = (numtyp)0.0;
+    numtyp tuv200 = (numtyp)0.0;
+    numtyp tuv020 = (numtyp)0.0;
+    numtyp tuv002 = (numtyp)0.0;
+    numtyp tuv110 = (numtyp)0.0;
+    numtyp tuv101 = (numtyp)0.0;
+    numtyp tuv011 = (numtyp)0.0;
+    numtyp tuv300 = (numtyp)0.0;
+    numtyp tuv030 = (numtyp)0.0;
+    numtyp tuv003 = (numtyp)0.0;
+    numtyp tuv210 = (numtyp)0.0;
+    numtyp tuv201 = (numtyp)0.0;
+    numtyp tuv120 = (numtyp)0.0;
+    numtyp tuv021 = (numtyp)0.0;
+    numtyp tuv102 = (numtyp)0.0;
+    numtyp tuv012 = (numtyp)0.0;
+    numtyp tuv111 = (numtyp)0.0;
+
+    int k = (igridz - nzlo_out) - nlpts;
+    for (int kb = 0; kb < bsorder; kb++) {
+      /*
+      v0 = thetai3[m][kb][0];
+      v1 = thetai3[m][kb][1];
+      v2 = thetai3[m][kb][2];
+      v3 = thetai3[m][kb][3];
+      */
+      int i3 = istart + kb;
+      numtyp4 tha3 = thetai3[i3];
+      numtyp v0 = tha3.x;
+      numtyp v1 = tha3.y;
+      numtyp v2 = tha3.z;
+      numtyp v3 = tha3.w;
+      numtyp tu00_1 = (numtyp)0.0;
+      numtyp tu01_1 = (numtyp)0.0;
+      numtyp tu10_1 = (numtyp)0.0;
+      numtyp tu20_1 = (numtyp)0.0;
+      numtyp tu11_1 = (numtyp)0.0;
+      numtyp tu02_1 = (numtyp)0.0;
+      numtyp tu00_2 = (numtyp)0.0;
+      numtyp tu01_2 = (numtyp)0.0;
+      numtyp tu10_2 = (numtyp)0.0;
+      numtyp tu20_2 = (numtyp)0.0;
+      numtyp tu11_2 = (numtyp)0.0;
+      numtyp tu02_2 = (numtyp)0.0;
+      numtyp tu00 = (numtyp)0.0;
+      numtyp tu10 = (numtyp)0.0;
+      numtyp tu01 = (numtyp)0.0;
+      numtyp tu20 = (numtyp)0.0;
+      numtyp tu11 = (numtyp)0.0;
+      numtyp tu02 = (numtyp)0.0;
+      numtyp tu30 = (numtyp)0.0;
+      numtyp tu21 = (numtyp)0.0;
+      numtyp tu12 = (numtyp)0.0;
+      numtyp tu03 = (numtyp)0.0;
+
+      int j = (igridy - nylo_out) - nlpts;
+      for (int jb = 0; jb < bsorder; jb++) {
+        /*
+        u0 = thetai2[m][jb][0];
+        u1 = thetai2[m][jb][1];
+        u2 = thetai2[m][jb][2];
+        u3 = thetai2[m][jb][3];
+        */
+        int i2 = istart + jb;
+        numtyp4 tha2 = thetai2[i2];
+        numtyp u0 = tha2.x;
+        numtyp u1 = tha2.y;
+        numtyp u2 = tha2.z;
+        numtyp u3 = tha2.w;
+        numtyp t0_1 = (numtyp)0.0;
+        numtyp t1_1 = (numtyp)0.0;
+        numtyp t2_1 = (numtyp)0.0;
+        numtyp t0_2 = (numtyp)0.0;
+        numtyp t1_2 = (numtyp)0.0;
+        numtyp t2_2 = (numtyp)0.0;
+        numtyp t3 = (numtyp)0.0;
+
+        int i = (igridx - nxlo_out) - nlpts;
+        for (int ib = 0; ib < bsorder; ib++) {
+          /*
+          tq_1 = grid[k][j][i][0];
+          tq_2 = grid[k][j][i][1];
+          t0_1 += tq_1*thetai1[m][ib][0];
+          t1_1 += tq_1*thetai1[m][ib][1];
+          t2_1 += tq_1*thetai1[m][ib][2];
+          t0_2 += tq_2*thetai1[m][ib][0];
+          t1_2 += tq_2*thetai1[m][ib][1];
+          t2_2 += tq_2*thetai1[m][ib][2];
+          t3 += (tq_1+tq_2)*thetai1[m][ib][3];
+          */
+          int i1 = istart + ib;
+          numtyp4 tha1 = thetai1[i1];
+          numtyp w0 = tha1.x;
+          numtyp w1 = tha1.y;
+          numtyp w2 = tha1.z;
+          numtyp w3 = tha1.w;
+          int gidx = 2*(k*ngridxy + j*ngridx + i);
+          numtyp tq_1 = grid[gidx];
+          numtyp tq_2 = grid[gidx+1];
+          t0_1 += tq_1*w0;
+          t1_1 += tq_1*w1;
+          t2_1 += tq_1*w2;
+          t0_2 += tq_2*w0;
+          t1_2 += tq_2*w1;
+          t2_2 += tq_2*w2;
+          t3 += (tq_1+tq_2)*w3;
+          i++;
+        }
+
+        tu00_1 += t0_1*u0;
+        tu10_1 += t1_1*u0;
+        tu01_1 += t0_1*u1;
+        tu20_1 += t2_1*u0;
+        tu11_1 += t1_1*u1;
+        tu02_1 += t0_1*u2;
+        tu00_2 += t0_2*u0;
+        tu10_2 += t1_2*u0;
+        tu01_2 += t0_2*u1;
+        tu20_2 += t2_2*u0;
+        tu11_2 += t1_2*u1;
+        tu02_2 += t0_2*u2;
+        numtyp t0 = t0_1 + t0_2;
+        numtyp t1 = t1_1 + t1_2;
+        numtyp t2 = t2_1 + t2_2;
+        tu00 += t0*u0;
+        tu10 += t1*u0;
+        tu01 += t0*u1;
+        tu20 += t2*u0;
+        tu11 += t1*u1;
+        tu02 += t0*u2;
+        tu30 += t3*u0;
+        tu21 += t2*u1;
+        tu12 += t1*u2;
+        tu03 += t0*u3;
+        j++;
+      }
+
+      tuv100_1 += tu10_1*v0;
+      tuv010_1 += tu01_1*v0;
+      tuv001_1 += tu00_1*v1;
+      tuv200_1 += tu20_1*v0;
+      tuv020_1 += tu02_1*v0;
+      tuv002_1 += tu00_1*v2;
+      tuv110_1 += tu11_1*v0;
+      tuv101_1 += tu10_1*v1;
+      tuv011_1 += tu01_1*v1;
+      tuv100_2 += tu10_2*v0;
+      tuv010_2 += tu01_2*v0;
+      tuv001_2 += tu00_2*v1;
+      tuv200_2 += tu20_2*v0;
+      tuv020_2 += tu02_2*v0;
+      tuv002_2 += tu00_2*v2;
+      tuv110_2 += tu11_2*v0;
+      tuv101_2 += tu10_2*v1;
+      tuv011_2 += tu01_2*v1;
+      tuv000 += tu00*v0;
+      tuv100 += tu10*v0;
+      tuv010 += tu01*v0;
+      tuv001 += tu00*v1;
+      tuv200 += tu20*v0;
+      tuv020 += tu02*v0;
+      tuv002 += tu00*v2;
+      tuv110 += tu11*v0;
+      tuv101 += tu10*v1;
+      tuv011 += tu01*v1;
+      tuv300 += tu30*v0;
+      tuv030 += tu03*v0;
+      tuv003 += tu00*v3;
+      tuv210 += tu21*v0;
+      tuv201 += tu20*v1;
+      tuv120 += tu12*v0;
+      tuv021 += tu02*v1;
+      tuv102 += tu10*v2;
+      tuv012 += tu01*v2;
+      tuv111 += tu11*v1;
+      k++;
+    }
+
+    int idx;
+    numtyp fdip_buf[20];
+
+    fdip_buf[0] = (numtyp)0.0;
+    fdip_buf[1] = tuv100_1;
+    fdip_buf[2] = tuv010_1;
+    fdip_buf[3] = tuv001_1;
+    fdip_buf[4] = tuv200_1;
+    fdip_buf[5] = tuv020_1;
+    fdip_buf[6] = tuv002_1;
+    fdip_buf[7] = tuv110_1;
+    fdip_buf[8] = tuv101_1;
+    fdip_buf[9] = tuv011_1;
+    idx = ii;    
+    for (int m = 0; m < 10; m++) {
+      fdip_phi1[idx] = fdip_buf[m];
+      idx += inum;
+    }
+
+    fdip_buf[0] = (numtyp)0.0;
+    fdip_buf[1] = tuv100_2;
+    fdip_buf[2] = tuv010_2;
+    fdip_buf[3] = tuv001_2;
+    fdip_buf[4] = tuv200_2;
+    fdip_buf[5] = tuv020_2;
+    fdip_buf[6] = tuv002_2;
+    fdip_buf[7] = tuv110_2;
+    fdip_buf[8] = tuv101_2;
+    fdip_buf[9] = tuv011_2;
+    idx = ii;    
+    for (int m = 0; m < 10; m++) {
+      fdip_phi2[idx] = fdip_buf[m];
+      idx += inum;
+    }
+
+    fdip_buf[0] = tuv000;
+    fdip_buf[1] = tuv100;
+    fdip_buf[2] = tuv010;
+    fdip_buf[3] = tuv001;
+    fdip_buf[4] = tuv200;
+    fdip_buf[5] = tuv020;
+    fdip_buf[6] = tuv002;
+    fdip_buf[7] = tuv110;
+    fdip_buf[8] = tuv101;
+    fdip_buf[9] = tuv011;
+    fdip_buf[10] = tuv300;
+    fdip_buf[11] = tuv030;
+    fdip_buf[12] = tuv003;
+    fdip_buf[13] = tuv210;
+    fdip_buf[14] = tuv201;
+    fdip_buf[15] = tuv120;
+    fdip_buf[16] = tuv021;
+    fdip_buf[17] = tuv102;
+    fdip_buf[18] = tuv012;
+    fdip_buf[19] = tuv111;
+    idx = ii;    
+    for (int m = 0; m < 20; m++) {
+      fdip_sum_phi[idx] = fdip_buf[m];
+      idx += inum;
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    scan standard neighbor list and make it compatible with 1-5 neighbors
    if IJ entry is a 1-2,1-3,1-4 neighbor then adjust offset to SBBITS15
diff --git a/lib/gpu/lal_hippo_ext.cpp b/lib/gpu/lal_hippo_ext.cpp
index e7deaddbf3..6b189defe9 100644
--- a/lib/gpu/lal_hippo_ext.cpp
+++ b/lib/gpu/lal_hippo_ext.cpp
@@ -193,6 +193,20 @@ void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **
                              eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 
+void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
+                          double ***host_thetai1, double ***host_thetai2,
+                          double ***host_thetai3, int** igrid, double ****host_grid_brick,
+                          void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          const int nzlo_out, const int nzhi_out,
+                          const int nylo_out, const int nyhi_out,
+                          const int nxlo_out, const int nxhi_out,
+                          bool& first_iteration) {
+   HIPPOMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
+                             host_thetai3, igrid, host_grid_brick, host_fdip_phi1,
+                             host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
+                             nylo_out, nyhi_out, nxlo_out, nxhi_out, first_iteration);
+}
+
 double hippo_gpu_bytes() {
   return HIPPOMF.host_memory_usage();
 }
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index e62c8185be..267dc666d6 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -938,7 +938,7 @@ void PairAmoebaGPU::ufield0c(double **field, double **fieldp)
     }
   }
 */
-  // accumulate the field and fieldp values from the real space portion from umutual2b() on the GPU
+  // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
   //   field and fieldp may already have some nonzero values from kspace (umutual1 and self)
 
   amoeba_gpu_update_fieldp(&fieldp_pinned);
diff --git a/src/GPU/pair_hippo_gpu.cpp b/src/GPU/pair_hippo_gpu.cpp
index 41c1355fbb..8c1b380f65 100644
--- a/src/GPU/pair_hippo_gpu.cpp
+++ b/src/GPU/pair_hippo_gpu.cpp
@@ -18,7 +18,7 @@
 
 #include "pair_hippo_gpu.h"
 
-#include "amoeba_convolution.h"
+#include "amoeba_convolution_gpu.h"
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
@@ -46,6 +46,9 @@ enum{GEAR,ASPC,LSQR};
 enum{BUILD,APPLY};
 enum{GORDON1,GORDON2};
 
+// same as in pair_amoeba.cpp
+enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC};
+
 #define DEBYE 4.80321    // conversion factor from q-Angs (real units) to Debye
 
 // External functions from cuda library for atom decomposition
@@ -102,6 +105,16 @@ void hippo_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
 
 void hippo_gpu_update_fieldp(void **fieldp_ptr);
 
+void hippo_gpu_fphi_uind(const int inum_full, const int bsorder,
+                          double ***host_thetai1, double ***host_thetai2,
+                          double ***host_thetai3, int** igrid,
+                          double ****host_grid_brick, void **host_fdip_phi1,
+                          void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          const int nzlo_out, const int nzhi_out,
+                          const int nylo_out, const int nyhi_out,
+                          const int nxlo_out, const int nxhi_out,
+                          bool& first_iteration);
+
 void hippo_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
               double **host_rpole, double **host_uind, double **host_uinp, double *host_pval,
               const bool eflag, const bool vflag, const bool eatom, const bool vatom,
@@ -129,8 +142,10 @@ PairHippoGPU::PairHippoGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   gpu_dispersion_real_ready = true;
   gpu_multipole_real_ready = true;
   gpu_udirect2b_ready = true;
+  gpu_umutual1_ready = true;
+  gpu_fphi_uind_ready = true;
   gpu_umutual2b_ready = true;
-  gpu_polar_real_ready = true;
+  gpu_polar_real_ready = true;         // need to be true for copying data from device back to host
 
   GPU_EXTRA::gpu_ready(lmp->modify, lmp->error);
 }
@@ -198,6 +213,16 @@ void PairHippoGPU::init_style()
     tq_single = false;
   else
     tq_single = true;
+
+  // replace with the gpu counterpart
+
+  if (gpu_umutual1_ready) {
+    if (use_ewald && ic_kspace) {
+      delete ic_kspace;
+      ic_kspace =
+        new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC);
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -392,6 +417,8 @@ void PairHippoGPU::induce()
 
   int debug = 1;
 
+  first_induce_iteration  = true;
+
   // set cutoffs, taper coeffs, and PME params
   // create qfac here, free at end of polar()
 
@@ -403,8 +430,6 @@ void PairHippoGPU::induce()
 
   // owned atoms
 
-  double **x = atom->x;
-  double **f = atom->f;
   int nlocal = atom->nlocal;
 
   // zero out the induced dipoles at each site
@@ -996,37 +1021,60 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
   int i,j;
   double term;
 
+  double time0,time1,time2;
+
   // zero field,fieldp for owned and ghost atoms
 
   int nlocal = atom->nlocal;
   int nall = nlocal + atom->nghost;
 
-  for (i = 0; i < nall; i++) {
-    for (j = 0; j < 3; j++) {
+  memset(&field[0][0], 0, 3*nall *sizeof(double));
+  memset(&fieldp[0][0], 0, 3*nall *sizeof(double));
+
+/*  
+  for (int i = 0; i < nall; i++) {
+    for (int j = 0; j < 3; j++) {
       field[i][j] = 0.0;
       fieldp[i][j] = 0.0;
     }
   }
-
+*/
+  
   // get the real space portion of the mutual field first
 
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   if (polar_rspace_flag) umutual2b(field,fieldp);
+  time1 = MPI_Wtime();
 
   // get the reciprocal space part of the mutual field
 
   if (polar_kspace_flag) umutual1(field,fieldp);
+  time2 = MPI_Wtime();
 
   // add the self-energy portion of the mutual field
 
   term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS;
+  for (int i = 0; i < nlocal; i++) {
+    field[i][0] += term*uind[i][0];
+    field[i][1] += term*uind[i][1];
+    field[i][2] += term*uind[i][2];
+  }
+  for (int i = 0; i < nlocal; i++) {
+    fieldp[i][0] += term*uinp[i][0];
+    fieldp[i][1] += term*uinp[i][1];
+    fieldp[i][2] += term*uinp[i][2];
+  }
+/*  
   for (i = 0; i < nlocal; i++) {
     for (j = 0; j < 3; j++) {
       field[i][j] += term*uind[i][j];
       fieldp[i][j] += term*uinp[i][j];
     }
   }
-
-  // accumulate the field and fieldp values from real-space portion from umutual2b() on the GPU
+*/
+  // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU
   //   field and fieldp may already have some nonzero values from kspace (umutual1 and self)
 
   hippo_gpu_update_fieldp(&fieldp_pinned);
@@ -1049,6 +1097,228 @@ void PairHippoGPU::ufield0c(double **field, double **fieldp)
     fieldp[i][1] += fieldp_ptr[idx+1];
     fieldp[i][2] += fieldp_ptr[idx+2];
   }
+
+  // accumulate timing information
+
+  time_mutual_rspace += time1 - time0;
+  time_mutual_kspace += time2 - time1;
+}
+
+/* ----------------------------------------------------------------------
+   umutual1 = Ewald recip mutual induced field
+   umutual1 computes the reciprocal space contribution of the
+   induced atomic dipole moments to the field
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::umutual1(double **field, double **fieldp)
+{
+  int m,n;
+  int nxlo,nxhi,nylo,nyhi,nzlo,nzhi;
+  double term;
+  double a[3][3];  // indices not flipped vs Fortran
+
+  // return if the Ewald coefficient is zero
+
+  if (aewald < 1.0e-6) return;
+
+  // convert Cartesian dipoles to fractional coordinates
+
+  for (int j = 0; j < 3; j++) {
+    a[0][j] = nfft1 * recip[0][j];
+    a[1][j] = nfft2 * recip[1][j];
+    a[2][j] = nfft3 * recip[2][j];
+  }
+
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2];
+    fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2];
+    fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2];
+  }
+    
+  for (int i = 0; i < nlocal; i++) {      
+    fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2];
+    fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2];
+    fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2];
+  }
+/*
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < 3; j++) {
+      fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2];
+      fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2];
+    }
+  }
+*/
+  // gridpre = my portion of 4d grid in brick decomp w/ ghost values
+
+  double ****gridpre = (double ****) ic_kspace->zero();
+
+  // map 2 values to grid
+
+  grid_uind(fuind,fuinp,gridpre);
+
+  // pre-convolution operations including forward FFT
+  // gridfft = my portion of complex 3d grid in FFT decomposition
+
+  double *gridfft = ic_kspace->pre_convolution();
+
+  // ---------------------
+  // convolution operation
+  // ---------------------
+
+  nxlo = ic_kspace->nxlo_fft;
+  nxhi = ic_kspace->nxhi_fft;
+  nylo = ic_kspace->nylo_fft;
+  nyhi = ic_kspace->nyhi_fft;
+  nzlo = ic_kspace->nzlo_fft;
+  nzhi = ic_kspace->nzhi_fft;
+
+  // use qfac values stored in udirect1()
+
+  m = n = 0;
+  for (int k = nzlo; k <= nzhi; k++) {
+    for (int j = nylo; j <= nyhi; j++) {
+      for (int i = nxlo; i <= nxhi; i++) {
+        term = qfac[m++];
+        gridfft[n] *= term;
+        gridfft[n+1] *= term;
+        n += 2;
+      }
+    }
+  }
+
+  // post-convolution operations including backward FFT
+  // gridppost = my portion of 4d grid in brick decomp w/ ghost values
+
+  double ****gridpost = (double ****) ic_kspace->post_convolution();
+
+  // get potential
+  double time0, time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
+  fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi);
+
+  time1 = MPI_Wtime();
+  time_fphi_uind += (time1 - time0);
+
+  // store fractional reciprocal potentials for OPT method
+
+  if (poltyp == OPT) {
+    for (int i = 0; i < nlocal; i++) {
+      for (int j = 0; j < 10; j++) {
+        fopt[i][optlevel][j] = fdip_phi1[i][j];
+        foptp[i][optlevel][j] = fdip_phi2[i][j];
+      }
+    }
+  }
+
+  // convert the dipole fields from fractional to Cartesian
+
+  for (int i = 0; i < 3; i++) {
+    a[0][i] = nfft1 * recip[0][i];
+    a[1][i] = nfft2 * recip[1][i];
+    a[2][i] = nfft3 * recip[2][i];
+  }
+
+  for (int i = 0; i < nlocal; i++) {
+    double dfx = a[0][0]*fdip_phi1[i][1] +
+      a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3];
+    double dfy = a[1][0]*fdip_phi1[i][1] +
+      a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3];
+    double dfz = a[2][0]*fdip_phi1[i][1] +
+      a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3];
+    field[i][0] -= dfx;
+    field[i][1] -= dfy;
+    field[i][2] -= dfz;
+  }
+
+  for (int i = 0; i < nlocal; i++) {
+    double dfx = a[0][0]*fdip_phi2[i][1] +
+      a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3];
+    double dfy = a[1][0]*fdip_phi2[i][1] +
+      a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3];
+    double dfz = a[2][0]*fdip_phi2[i][1] +
+      a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3];
+    fieldp[i][0] -= dfx;
+    fieldp[i][1] -= dfy;
+    fieldp[i][2] -= dfz;
+  }
+/*
+  for (int i = 0; i < nlocal; i++) {
+    for (j = 0; j < 3; j++) {
+      dipfield1[i][j] = a[j][0]*fdip_phi1[i][1] +
+        a[j][1]*fdip_phi1[i][2] + a[j][2]*fdip_phi1[i][3];
+      dipfield2[i][j] = a[j][0]*fdip_phi2[i][1] +
+        a[j][1]*fdip_phi2[i][2] + a[j][2]*fdip_phi2[i][3];
+    }
+  }
+
+  // increment the field at each multipole site
+
+  for (i = 0; i < nlocal; i++) {
+    for (j = 0; j < 3; j++) {
+      field[i][j] -= dipfield1[i][j];
+      fieldp[i][j] -= dipfield2[i][j];
+    }
+  }
+*/
+}
+
+/* ----------------------------------------------------------------------
+   fphi_uind = induced potential from grid
+   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+------------------------------------------------------------------------- */
+
+void PairHippoGPU::fphi_uind(double ****grid, double **fdip_phi1,
+                              double **fdip_phi2, double **fdip_sum_phi)
+{
+  if (!gpu_fphi_uind_ready) {
+    PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
+    return;
+  }
+
+  void* fdip_phi1_pinned = nullptr;
+  void* fdip_phi2_pinned = nullptr;
+  void* fdip_sum_phi_pinned = nullptr;
+  hippo_gpu_fphi_uind(atom->nlocal, bsorder, thetai1,
+                       thetai2, thetai3, igrid, grid,
+                       &fdip_phi1_pinned, &fdip_phi2_pinned,
+                       &fdip_sum_phi_pinned,
+                       ic_kspace->nzlo_out, ic_kspace->nzhi_out,
+                       ic_kspace->nylo_out, ic_kspace->nyhi_out,
+                       ic_kspace->nxlo_out, ic_kspace->nxhi_out,
+                       first_induce_iteration);
+  
+  int nlocal = atom->nlocal;
+  double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int n = i;
+    for (int m = 0; m < 10; m++) {
+      fdip_phi1[i][m] = _fdip_phi1_ptr[n];
+      n += nlocal;
+    }
+  }
+
+  double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int n = i;
+    for (int m = 0; m < 10; m++) {
+      fdip_phi2[i][m] = _fdip_phi2_ptr[n];
+      n += nlocal;
+    }
+  }
+
+  double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int n = i;
+    for (int m = 0; m < 20; m++) {
+      fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n];
+      n += nlocal;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1089,29 +1359,6 @@ void PairHippoGPU::umutual2b(double **field, double **fieldp)
   double *pval = atom->dvector[index_pval];
   hippo_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, pval,
                               aewald, off2, &fieldp_pinned);
-/*
-  // accumulate the field and fieldp values from the GPU lib
-  //   field and fieldp may already have some nonzero values from kspace (umutual1)
-
-  int nlocal = atom->nlocal;
-  double *field_ptr = (double *)fieldp_pinned;
-
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    field[i][0] += field_ptr[idx];
-    field[i][1] += field_ptr[idx+1];
-    field[i][2] += field_ptr[idx+2];
-  }
-
-  double* fieldp_ptr = (double *)fieldp_pinned;
-  fieldp_ptr += 4*inum;
-  for (int i = 0; i < nlocal; i++) {
-    int idx = 4*i;
-    fieldp[i][0] += fieldp_ptr[idx];
-    fieldp[i][1] += fieldp_ptr[idx+1];
-    fieldp[i][2] += fieldp_ptr[idx+2];
-  }
-*/
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/GPU/pair_hippo_gpu.h b/src/GPU/pair_hippo_gpu.h
index 1ed1c3299d..742fbfb119 100644
--- a/src/GPU/pair_hippo_gpu.h
+++ b/src/GPU/pair_hippo_gpu.h
@@ -39,6 +39,8 @@ class PairHippoGPU : public PairAmoeba {
   virtual void dispersion_real();
   virtual void multipole_real();
   virtual void udirect2b(double **, double **);
+  virtual void umutual1(double **, double **);
+  virtual void fphi_uind(double ****, double **, double **, double **);
   virtual void umutual2b(double **, double **);
   virtual void ufield0c(double **, double **);
   virtual void polar_real();
@@ -55,9 +57,13 @@ class PairHippoGPU : public PairAmoeba {
   bool gpu_dispersion_real_ready;
   bool gpu_multipole_real_ready;
   bool gpu_udirect2b_ready;
+  bool gpu_umutual1_ready;
+  bool gpu_fphi_uind_ready;
   bool gpu_umutual2b_ready;
   bool gpu_polar_real_ready;
 
+  bool first_induce_iteration;
+
   void udirect2b_cpu();
 
   template<class numtyp>