From 21b7fb2fcfb842b1f332eb737ae83fa5f89d48d2 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndtrung@uchicago.edu>
Date: Fri, 2 Sep 2022 14:55:20 -0500
Subject: [PATCH] Exposing fphi_uind to the gpu pair style, still keeping the
 part not ready though

---
 lib/gpu/lal_amoeba_ext.cpp  |  12 ++-
 lib/gpu/lal_base_amoeba.cpp | 200 ++++++++++++++++++++----------------
 lib/gpu/lal_base_amoeba.h   |  14 +--
 src/AMOEBA/pair_amoeba.h    |   2 +-
 src/GPU/pair_amoeba_gpu.cpp |  60 ++++++++++-
 src/GPU/pair_amoeba_gpu.h   |   4 +
 6 files changed, 193 insertions(+), 99 deletions(-)
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 6989a5e6f6..151c38c9c4 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -162,9 +162,17 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                               eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 
-void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp,
+void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
                           double ***host_thetai1, double ***host_thetai2,
-                          double ***host_thetai3, double ***grid) {
+                          double ***host_thetai3, int** igrid,
+                          double *host_grid_brick_start, void **host_fdip_phi1,
+                          void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                          int nxlo_out, int nxhi_out, bool& first_iteration) {
+   AMOEBAMF.compute_fphi_uind(inum_full, bsorder, host_thetai1, host_thetai2,
+                          host_thetai3, igrid, host_grid_brick_start, host_fdip_phi1,
+                          host_fdip_phi2, host_fdip_sum_phi, nzlo_out, nzhi_out,
+                          nylo_out, nyhi_out,  nxlo_out, nxhi_out, first_iteration);
 }
 
 void amoeba_setup_fft(const int numel, const int element_type) {
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 1269a798db..bdd43aa59e 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   _max_fieldp_size = _max_tep_size;
   _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
-  _max_thetai_size = _max_tep_size;
+  _max_thetai_size = 0;
 
   _nmax = nall;
   dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
@@ -441,81 +441,6 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
   return nbor->host_jlist.begin()-host_start;
 }
 
-// ---------------------------------------------------------------------------
-// Prepare for umutual1: bspline_fill
-//   - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed
-//   - transfer extra data from host to device
-// ---------------------------------------------------------------------------
-
-template <class numtyp, class acctyp>
-void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
-                                    double **host_thetai1, double **host_thetai2,
-                                    double **host_thetai3, int** host_igrid,
-                                    double* grid_brick_start, int nzlo_out,
-                                    int nzhi_out, int nylo_out, int nyhi_out,
-                                    int nxlo_out, int nxhi_out) {
-  
-  _bsorder = bsorder;
-
-  // allocate or resize per-atom arrays
-  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
-  //   will be consolidated once all terms are ready
-
-  if (_max_thetai_size == 0) {
-    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
-    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
-
-    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
-    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
-    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
-
-  } else {
-    if (inum_full>_max_thetai_size) {
-      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
-      _thetai1.resize(_max_thetai_size*bsorder*4);
-      _thetai2.resize(_max_thetai_size*bsorder*4);
-      _thetai3.resize(_max_thetai_size*bsorder*4);
-      _igrid.resize(_max_thetai_size*4);
-
-      _fdip_phi1.resize(_max_thetai_size*10);
-      _fdip_phi2.resize(_max_thetai_size*10);
-      _fdip_sum_phi.resize(_max_thetai_size*20);
-    }
-  }
-
-  UCL_H_Vec<double> dview;
-
-  // copy from host to device
-  
-  dview.view(&host_thetai1[0][0],inum_full*bsorder*4,*(this->ucl_device));
-  ucl_copy(_thetai1,dview,false);
-  dview.view(&host_thetai2[0][0],inum_full*bsorder*4,*(this->ucl_device));
-  ucl_copy(_thetai2,dview,false);
-  dview.view(&host_thetai3[0][0],inum_full*bsorder*4,*(this->ucl_device));
-  ucl_copy(_thetai3,dview,false);
-
-  UCL_H_Vec<int> dview_int;
-  dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device));
-  ucl_copy(_igrid,dview_int,false);
-
-  _nzlo_out = nzlo_out;
-  _nzhi_out = nzhi_out;
-  _nylo_out = nylo_out;
-  _nyhi_out = nyhi_out;
-  _nxlo_out = nxlo_out;
-  _nxhi_out = nxhi_out;
-  _ngridz = nzhi_out - nzlo_out + 1;
-  _ngridy = nyhi_out - nylo_out + 1;
-  _ngridx = nxhi_out - nxlo_out + 1;
-  _num_grid_points = _ngridx*_ngridy*_ngridz*2;
-  dview.view(grid_brick_start,_num_grid_points,*(this->ucl_device));
-  ucl_copy(_cgrid_brick,dview,false);
-
-}
-
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
@@ -626,6 +551,98 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
   // _fieldp.update_host(_max_fieldp_size*8,false);
 }
 
+// ---------------------------------------------------------------------------
+// Prepare for umutual1() after bspline_fill() is done on host
+//   - reallocate per-atom arrays, thetai1, thetai2, thetai3, and igrid if needed
+//     host_thetai1, host_thetai2, host_thetai3 are allocated with nmax by bsordermax by 4
+//     host_igrid is allocated with nmax by by 4
+//   - transfer extra data from host to device
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
+                                    double ***host_thetai1, double ***host_thetai2,
+                                    double ***host_thetai3, int** host_igrid,
+                                    double* grid_brick_start, int nzlo_out,
+                                    int nzhi_out, int nylo_out, int nyhi_out,
+                                    int nxlo_out, int nxhi_out) {
+  
+  _bsorder = bsorder;
+
+  // allocate or resize per-atom arrays
+  // _max_thetai_size, _max_tep_size and _max_fieldp_size are essentially _nmax
+  //   will be consolidated once all terms are ready
+
+  if (_max_thetai_size == 0) {
+    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
+    _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
+    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
+
+    _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
+    _fdip_sum_phi.alloc(_max_thetai_size*20,*(this->ucl_device),UCL_WRITE_ONLY);
+
+  } else {
+    if (inum_full>_max_thetai_size) {
+      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
+      _thetai1.resize(_max_thetai_size*bsorder*4);
+      _thetai2.resize(_max_thetai_size*bsorder*4);
+      _thetai3.resize(_max_thetai_size*bsorder*4);
+      _igrid.resize(_max_thetai_size*4);
+
+      _fdip_phi1.resize(_max_thetai_size*10);
+      _fdip_phi2.resize(_max_thetai_size*10);
+      _fdip_sum_phi.resize(_max_thetai_size*20);
+    }
+  }
+
+  UCL_H_Vec<double> dview;
+  dview.alloc(inum_full*bsorder*4,*(this->ucl_device));
+
+  // pack host data to device
+
+  for (int i = 0; i < inum_full; i++)
+    for (int j = 0; j < bsorder; j++) {
+      int idx = i*4*bsorder + 4*j;
+      dview[idx+0] = host_thetai1[i][j][0];
+      dview[idx+1] = host_thetai1[i][j][1];
+      dview[idx+2] = host_thetai1[i][j][2];
+      dview[idx+3] = host_thetai1[i][j][3];
+    }
+  ucl_copy(_thetai1,dview,false);
+
+  for (int i = 0; i < inum_full; i++)
+    for (int j = 0; j < bsorder; j++) {
+      int idx = i*4*bsorder + 4*j;
+      dview[idx+0] = host_thetai2[i][j][0];
+      dview[idx+1] = host_thetai2[i][j][1];
+      dview[idx+2] = host_thetai2[i][j][2];
+      dview[idx+3] = host_thetai2[i][j][3];
+    }
+  ucl_copy(_thetai2,dview,false);
+
+  for (int i = 0; i < inum_full; i++)
+    for (int j = 0; j < bsorder; j++) {
+      int idx = i*4*bsorder + 4*j;
+      dview[idx+0] = host_thetai3[i][j][0];
+      dview[idx+1] = host_thetai3[i][j][1];
+      dview[idx+2] = host_thetai3[i][j][2];
+      dview[idx+3] = host_thetai3[i][j][3];
+    }
+  ucl_copy(_thetai3,dview,false);
+
+  UCL_H_Vec<int> dview_int;
+  for (int i = 0; i < inum_full; i++) {
+    int idx = i*4;
+    dview_int[idx+0] = host_igrid[i][0];
+    dview_int[idx+1] = host_igrid[i][1];
+    dview_int[idx+2] = host_igrid[i][2];
+  }
+  ucl_copy(_igrid,dview_int,false);
+}
+
 // ---------------------------------------------------------------------------
 // fphi_uind = induced potential from grid
 // fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
@@ -633,19 +650,22 @@ void BaseAmoebaT::compute_umutual2b(int *host_amtype, int *host_amgroup, double
 
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
-                                    double **host_thetai1, double **host_thetai2,
-                                    double **host_thetai3, int** igrid,
-                                    double *host_grid_brick_start, double **host_fdip_phi1,
-                                    double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                    double ***host_thetai1, double ***host_thetai2,
+                                    double ***host_thetai3, int** igrid,
+                                    double *host_grid_brick_start, void** host_fdip_phi1,
+                                    void **host_fdip_phi2, void **host_fdip_sum_phi,
                                     int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
-                                    int nxlo_out, int nxhi_out)
+                                    int nxlo_out, int nxhi_out, bool& first_iteration)
 {
-  // allocation/resize and transfers (do this right after udirect?)
+  // allocation/resize and transfers before the first iteration
   
-  precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
-                    igrid, host_grid_brick_start, nzlo_out, nzhi_out, nylo_out, nyhi_out,
-                    nxlo_out, nxhi_out);
-
+  if (first_iteration) {
+    precompute_induce(inum_full, bsorder, host_thetai1, host_thetai2, host_thetai3,
+                      igrid, host_grid_brick_start, nzlo_out, nzhi_out,
+                      nylo_out, nyhi_out, nxlo_out, nxhi_out);
+    if (first_iteration) first_iteration = false;
+  }
+    
   // update the cgrid_brick with data host
   
   _nzlo_out = nzlo_out;
@@ -664,6 +684,14 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
   ucl_copy(_cgrid_brick,dview,false);
 
   const int red_blocks = fphi_uind();
+
+  _fdip_phi1.update_host(_max_thetai_size*10);
+  _fdip_phi2.update_host(_max_thetai_size*10);
+  _fdip_sum_phi.update_host(_max_thetai_size*20);
+
+  *host_fdip_phi1 = _fdip_phi1.host.begin();
+  *host_fdip_phi2 = _fdip_phi2.host.begin();
+  *host_fdip_sum_phi = _fdip_sum_phi.host.begin();
 }
 
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index d3ae3a750b..a001423812 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -151,8 +151,8 @@ class BaseAmoeba {
                 double *charge, double *boxlo, double *prd);
 
   virtual void precompute_induce(const int inum_full, const int bsorder,
-                                 double **host_thetai1, double **host_thetai2,
-                                 double **host_thetai3, int** igrid,
+                                 double ***host_thetai1, double ***host_thetai2,
+                                 double ***host_thetai3, int** igrid,
                                  double* grid_brick_start, int nzlo_out,
                                  int nzhi_out, int nylo_out, int nyhi_out,
                                  int nxlo_out, int nxhi_out);
@@ -180,12 +180,12 @@ class BaseAmoeba {
                 const double aewald, const double off2_polar, void **fieldp_ptr);
 
   virtual void compute_fphi_uind(const int inum_full, const int bsorder,
-                                 double **host_thetai1, double **host_thetai2,
-                                 double **host_thetai3, int** igrid,
-                                 double *host_grid_brick_start, double **host_fdip_phi1,
-                                 double **host_fdip_phi2, double **host_fdip_sum_phi,
+                                 double ***host_thetai1, double ***host_thetai2,
+                                 double ***host_thetai3, int** igrid,
+                                 double *host_grid_brick_start, void **host_fdip_phi1,
+                                 void **host_fdip_phi2, void **host_fdip_sum_phi,
                                  int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
-                                 int nxlo_out, int nxhi_out);
+                                 int nxlo_out, int nxhi_out, bool& first_iteration);
 
   /// Compute polar real-space with device neighboring
   virtual void compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole,
diff --git a/src/AMOEBA/pair_amoeba.h b/src/AMOEBA/pair_amoeba.h
index 93978ab1f2..17b2d4a1e8 100644
--- a/src/AMOEBA/pair_amoeba.h
+++ b/src/AMOEBA/pair_amoeba.h
@@ -407,7 +407,7 @@ class PairAmoeba : public Pair {
   void grid_mpole(double **, double ***);
   void fphi_mpole(double ***, double **);
   void grid_uind(double **, double **, double ****);
-  void fphi_uind(double ****, double **, double **, double **);
+  virtual void fphi_uind(double ****, double **, double **, double **);
   void grid_disp(double ***);
 
   void kewald();
diff --git a/src/GPU/pair_amoeba_gpu.cpp b/src/GPU/pair_amoeba_gpu.cpp
index cd3c01cde3..bf6db3472d 100644
--- a/src/GPU/pair_amoeba_gpu.cpp
+++ b/src/GPU/pair_amoeba_gpu.cpp
@@ -88,9 +88,13 @@ void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup,
 
 void amoeba_gpu_update_fieldp(void **fieldp_ptr);
 
-void amoeba_gpu_grid_uind(double **host_fuind, double **host_fuinp,
-                          double** host_thetai1, double** host_thetai2,
-                          double** host_thetai3, double ***grid);
+void amoeba_gpu_fphi_uind(const int inum_full, const int bsorder,
+                          double ***host_thetai1, double ***host_thetai2,
+                          double ***host_thetai3, int** igrid,
+                          double *host_grid_brick_start, void **host_fdip_phi1,
+                          void **host_fdip_phi2, void **host_fdip_sum_phi,
+                          int nzlo_out, int nzhi_out, int nylo_out, int nyhi_out,
+                          int nxlo_out, int nxhi_out, bool& first_iteration);
 
 void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup,
               double **host_rpole, double **host_uind, double **host_uinp,
@@ -117,6 +121,7 @@ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE)
   gpu_multipole_real_ready = true;     // need to be true for precompute()
   gpu_udirect2b_ready = true;
   gpu_umutual1_ready = true;
+  gpu_fphi_uind_ready = false;
   gpu_umutual2b_ready = true;
   gpu_polar_real_ready = true;         // need to be true for copying data from device back to host
 
@@ -481,6 +486,8 @@ void PairAmoebaGPU::induce()
 
     // conjugate gradient iteration of the mutual induced dipoles
 
+    first_induce_iteration  = true;
+
     while (!done) {
       iter++;
 
@@ -1115,6 +1122,53 @@ void PairAmoebaGPU::umutual1(double **field, double **fieldp)
 */
 }
 
+/* ----------------------------------------------------------------------
+   fphi_uind = induced potential from grid
+   fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid
+------------------------------------------------------------------------- */
+
+void PairAmoebaGPU::fphi_uind(double ****grid, double **fdip_phi1,
+                              double **fdip_phi2, double **fdip_sum_phi)
+{
+  if (!gpu_fphi_uind_ready) {
+    PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi);
+    return;
+  }
+
+  void* fdip_phi1_pinned = nullptr;
+  void* fdip_phi2_pinned = nullptr;
+  void* fdip_sum_phi_pinned = nullptr;
+  amoeba_gpu_fphi_uind(atom->nlocal, bsorder, thetai1, thetai2, thetai3,
+                        igrid, ic_kspace->grid_brick_start,
+                        &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned,
+                        ic_kspace->nzlo_out, ic_kspace->nzhi_out,
+                        ic_kspace->nylo_out, ic_kspace->nyhi_out,
+                        ic_kspace->nxlo_out, ic_kspace->nxhi_out,
+                        first_induce_iteration);
+  
+  int nlocal = atom->nlocal;
+  double *_fdip_phi1_ptr = (double *)fdip_phi1_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 10 * i;
+    for (int m = 0; m < 10; m++)
+      fdip_phi1[i][m] = _fdip_phi1_ptr[idx+m];
+  }
+
+  double *_fdip_phi2_ptr = (double *)fdip_phi2_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 10 * i;
+    for (int m = 0; m < 10; m++)
+      fdip_phi2[i][m] = _fdip_phi2_ptr[idx+m];
+  }
+
+  double *_fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned;
+  for (int i = 0; i < nlocal; i++) {
+    int idx = 20 * i;
+    for (int m = 0; m < 20; m++)
+      fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[idx+m];
+  }
+}
+
 /* ----------------------------------------------------------------------
    umutual2b = Ewald real mutual field via list
    umutual2b computes the real space contribution of the induced
diff --git a/src/GPU/pair_amoeba_gpu.h b/src/GPU/pair_amoeba_gpu.h
index e0563cd8b5..fe6ed3368f 100644
--- a/src/GPU/pair_amoeba_gpu.h
+++ b/src/GPU/pair_amoeba_gpu.h
@@ -39,6 +39,7 @@ class PairAmoebaGPU : public PairAmoeba {
   virtual void multipole_real();
   virtual void udirect2b(double **, double **);
   virtual void umutual1(double **, double **);
+  virtual void fphi_uind(double ****, double **, double **, double **);
   virtual void umutual2b(double **, double **);
   virtual void ufield0c(double **, double **);
   virtual void polar_real();
@@ -56,9 +57,12 @@ class PairAmoebaGPU : public PairAmoeba {
   bool gpu_multipole_real_ready;
   bool gpu_udirect2b_ready;
   bool gpu_umutual1_ready;
+  bool gpu_fphi_uind_ready;
   bool gpu_umutual2b_ready;
   bool gpu_polar_real_ready;
 
+  bool first_induce_iteration;
+
   void udirect2b_cpu();
 
   template<class numtyp>