From cad7e1b364c6b6e2a376b26b31af6386038580e3 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Fri, 2 Sep 2022 10:18:59 -0500
Subject: [PATCH] Moved fphi_uind up to BaseAmoeba

---
 lib/gpu/lal_amoeba.cpp      | 32 --------------------------------
 lib/gpu/lal_amoeba.cu       |  6 +++---
 lib/gpu/lal_amoeba.h        |  1 -
 lib/gpu/lal_base_amoeba.cpp | 36 ++++++++++++++++++++++++++++++++----
 lib/gpu/lal_base_amoeba.h   |  2 +-
 5 files changed, 36 insertions(+), 41 deletions(-)
diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp
index 38058bab55..924a175cfe 100644
--- a/lib/gpu/lal_amoeba.cpp
+++ b/lib/gpu/lal_amoeba.cpp
@@ -259,38 +259,6 @@ int AmoebaT::umutual2b(const int eflag, const int vflag) {
   return GX;
 }
 
-// ---------------------------------------------------------------------------
-// Interpolate the potential from the PME grid
-// ---------------------------------------------------------------------------
-template <class numtyp, class acctyp>
-int AmoebaT::fphi_uind() {
-  int ainum=this->ans->inum();
-  if (ainum == 0)
-    return 0;
-
-  int _nall=this->atom->nall();
-  int nbor_pitch=this->nbor->nbor_pitch();
-
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=this->block_size();
-  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
-                               (BX/this->_threads_per_atom)));
-
-  this->time_pair.start();
-  int ngridyz = this->_ngridy * this->_ngridz;
-  this->k_fphi_uind.set_size(GX,BX);
-  this->k_fphi_uind.run(&this->atom->x, &this->_thetai1,
-                        &this->_thetai2, &this->_thetai3,
-                        &this->_igrid, &this->_cgrid_brick,
-                        &this->_fdip_phi1, &this->_fdip_phi2,
-                        &this->_fdip_sum_phi, &this->_bsorder,
-                        &ainum, &ngridyz, &this->_ngridy,
-                        &this->_threads_per_atom);
-  this->time_pair.stop();
-
-  return GX;
-}
-
 // ---------------------------------------------------------------------------
 // Calculate the polar real-space term, returning tep
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_amoeba.cu b/lib/gpu/lal_amoeba.cu
index 984154f16e..200191cea2 100644
--- a/lib/gpu/lal_amoeba.cu
+++ b/lib/gpu/lal_amoeba.cu
@@ -1706,7 +1706,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
     tuv012 = (numtyp)0.0;
     tuv111 = (numtyp)0.0;
 
-    k = igrid[3*i+2] - nlpts;
+    k = igrid[4*i+2] - nlpts;
     for (int kb = 0; kb < bsorder; kb++) {
       /*
       v0 = thetai3[m][kb][0];
@@ -1742,7 +1742,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
       tu12 = (numtyp)0.0;
       tu03 = (numtyp)0.0;
 
-      j = igrid[3*i+1] - nlpts;
+      j = igrid[4*i+1] - nlpts;
       for (int jb = 0; jb < bsorder; jb++) {
         /*
         u0 = thetai2[m][jb][0];
@@ -1763,7 +1763,7 @@ __kernel void k_fphi_uind(const __global numtyp4 *restrict x_,
         t2_2 = (numtyp)0.0;
         t3 = (numtyp)0.0;
 
-        int ii = igrid[3*i] - nlpts;
+        int ii = igrid[4*i] - nlpts;
         for (int ib = 0; ib < bsorder; ib++) {
           /*
           tq_1 = grid[k][j][ii][0];
diff --git a/lib/gpu/lal_amoeba.h b/lib/gpu/lal_amoeba.h
index 005ea14fb9..d12b79719f 100644
--- a/lib/gpu/lal_amoeba.h
+++ b/lib/gpu/lal_amoeba.h
@@ -91,7 +91,6 @@ class Amoeba : public BaseAmoeba<numtyp, acctyp> {
   int multipole_real(const int eflag, const int vflag);
   int udirect2b(const int eflag, const int vflag);
   int umutual2b(const int eflag, const int vflag);
-  int fphi_uind();
   int polar_real(const int eflag, const int vflag);
 
 };
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index cd5a9abf81..1269a798db 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -144,7 +144,7 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   _max_fieldp_size = _max_tep_size;
   _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
-  _max_thetai_size = 0;
+  _max_thetai_size = _max_tep_size;
 
   _nmax = nall;
   dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
@@ -466,7 +466,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
     _thetai1.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
     _thetai2.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
     _thetai3.alloc(_max_thetai_size*bsorder*4,*(this->ucl_device),UCL_READ_ONLY);
-    _igrid.alloc(_max_thetai_size*3,*(this->ucl_device),UCL_READ_ONLY);
+    _igrid.alloc(_max_thetai_size*4,*(this->ucl_device),UCL_READ_ONLY);
 
     _fdip_phi1.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
     _fdip_phi2.alloc(_max_thetai_size*10,*(this->ucl_device),UCL_WRITE_ONLY);
@@ -478,7 +478,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
       _thetai1.resize(_max_thetai_size*bsorder*4);
       _thetai2.resize(_max_thetai_size*bsorder*4);
       _thetai3.resize(_max_thetai_size*bsorder*4);
-      _igrid.resize(_max_thetai_size*3);
+      _igrid.resize(_max_thetai_size*4);
 
       _fdip_phi1.resize(_max_thetai_size*10);
       _fdip_phi2.resize(_max_thetai_size*10);
@@ -498,7 +498,7 @@ void BaseAmoebaT::precompute_induce(const int inum_full, const int bsorder,
   ucl_copy(_thetai3,dview,false);
 
   UCL_H_Vec<int> dview_int;
-  dview_int.view(&host_igrid[0][0],inum_full*3,*(this->ucl_device));
+  dview_int.view(&host_igrid[0][0],inum_full*4,*(this->ucl_device));
   ucl_copy(_igrid,dview_int,false);
 
   _nzlo_out = nzlo_out;
@@ -666,6 +666,34 @@ void BaseAmoebaT::compute_fphi_uind(const int inum_full, const int bsorder,
   const int red_blocks = fphi_uind();
 }
 
+// ---------------------------------------------------------------------------
+// Interpolate the potential from the PME grid
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int BaseAmoebaT::fphi_uind() {
+  int ainum=ans->inum();
+  if (ainum == 0)
+    return 0;
+
+  int _nall=atom->nall();
+  int nbor_pitch=nbor->nbor_pitch();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/
+                               (BX/_threads_per_atom)));
+
+  time_pair.start();
+  int ngridyz = _ngridy * _ngridz;
+  k_fphi_uind.set_size(GX,BX);
+  k_fphi_uind.run(&atom->x, &_thetai1, &_thetai2, &_thetai3,
+                  &_igrid, &_cgrid_brick, &_fdip_phi1, &_fdip_phi2,
+                  &_fdip_sum_phi, &_bsorder, &ainum, &ngridyz, &_ngridy,
+                  &_threads_per_atom);
+  time_pair.stop();
+
+  return GX;
+}
 
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute polar real-space
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 8503e6fba4..d3ae3a750b 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -311,7 +311,7 @@ class BaseAmoeba {
   virtual int multipole_real(const int eflag, const int vflag) = 0;
   virtual int udirect2b(const int eflag, const int vflag) = 0;
   virtual int umutual2b(const int eflag, const int vflag) = 0;
-  virtual int fphi_uind() = 0;
+  virtual int fphi_uind();
   virtual int polar_real(const int eflag, const int vflag) = 0;