From f4a90c62c070deb7b6b82c9f3f3b0c639e7a84a6 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Tue, 23 Aug 2022 15:42:05 -0500
Subject: [PATCH] First attempt to port the forward FFT in the k-space induce
 term to the GPU, not working yet

---
 lib/gpu/Makefile.lammps.standard   |  2 +-
 lib/gpu/lal_amoeba_ext.cpp         |  8 +--
 lib/gpu/lal_base_amoeba.cpp        | 84 +++++++++++++++++++++++++++++-
 lib/gpu/lal_base_amoeba.h          | 21 +++++++-
 src/AMOEBA/amoeba_convolution.cpp  | 38 +++++++++++++-
 src/AMOEBA/amoeba_convolution.h    |  2 +
 src/AMOEBA/pair_amoeba.cpp         | 10 ++++
 src/GPU/amoeba_convolution_gpu.cpp | 30 +++++++++--
 8 files changed, 181 insertions(+), 14 deletions(-)
diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard
index 9526e8e373..0bb3394b3e 100644
--- a/lib/gpu/Makefile.lammps.standard
+++ b/lib/gpu/Makefile.lammps.standard
@@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
 endif
 
 gpu_SYSINC =
-gpu_SYSLIB =  -lcudart -lcuda
+gpu_SYSLIB =  -lcudart -lcuda -lcufft
 gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs
 
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 304159e571..7d9d836b29 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                               eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 
-void amoeba_setup_fft(const int size, const int element_type) {
-  AMOEBAMF.setup_fft(size, element_type);
+void amoeba_setup_fft(const int numel, const int element_type) {
+  AMOEBAMF.setup_fft(numel, element_type);
 }
 
-void amoeba_compute_fft1d(void** in, void** out, const int mode) {
-  AMOEBAMF.compute_fft1d(in, out, mode);
+void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
+  AMOEBAMF.compute_fft1d(in, out, numel, mode);
 }
 
 double amoeba_gpu_bytes() {
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 05a48f9588..2f3c04c7f1 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -15,6 +15,7 @@
  ***************************************************************************/
 
 #include "lal_base_amoeba.h"
+
 namespace LAMMPS_AL {
 #define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
 
@@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() {
   k_polar.clear();
   k_special15.clear();
   k_short_nbor.clear();
+
+  //if (cufft_plan_created) cufftDestroy(plan);
+
   if (pair_program) delete pair_program;
 }
 
@@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
   _max_fieldp_size = _max_tep_size;
   _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
 
+  _max_thetai_size = 0;
+
   _nmax = nall;
   dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
   dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
   dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
 
+  cufft_plan_created = false;
+
   return success;
 }
 
@@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() {
 
   _tep.clear();
   _fieldp.clear();
+  _thetai1.clear();
+  _thetai2.clear();
+  _thetai3.clear();
   dev_nspecial15.clear();
   dev_special15.clear();
   dev_special15_t.clear();
@@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
   return nbor->host_jlist.begin()-host_start;
 }
 
+// ---------------------------------------------------------------------------
+// Prepare for umutual1: bspline_fill
+//   - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed
+//   - transfer extra data from host to device
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall,
+                                       const int bsordermax, double **host_x,
+                                       double **host_thetai1, double **host_thetai2,
+                                       double **host_thetai3, void* grid) {
+  
+  _bsordermax = bsordermax;
+
+  if (_max_thetai_size == 0) {
+    _max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
+    _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+    _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+    _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
+  } else {
+    if (inum_full>_max_thetai_size) {
+      _max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
+      _thetai1.resize(_max_thetai_size*_bsordermax*4);
+      _thetai2.resize(_max_thetai_size*_bsordermax*4);
+      _thetai3.resize(_max_thetai_size*_bsordermax*4);
+    }
+  }
+
+}
+
 // ---------------------------------------------------------------------------
 // Reneighbor on GPU if necessary, and then compute multipole real-space
 // ---------------------------------------------------------------------------
@@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
 // ---------------------------------------------------------------------------
 
 template <class numtyp, class acctyp>
-void BaseAmoebaT::setup_fft(const int size, const int element_type)
+void BaseAmoebaT::setup_fft(const int numel, const int element_type)
 {
 
 }
@@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type)
 // ---------------------------------------------------------------------------
 
 template <class numtyp, class acctyp>
-void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode)
+void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode)
 {
+  if (cufft_plan_created == false) {
+    int m = numel/2;
+    cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
+    cufft_plan_created = true;
+  }
 
+  // n = number of double complex
+  int n = numel/2;
+  
+  // copy the host array to the device (data)
+  UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
+  data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
+  int m = 0;
+  double* d_in = (double*)in;
+  for (int i = 0; i < n; i++) {
+    data[i].x = d_in[m];
+    data[i].y = d_in[m+1];
+    m += 2;
+  }
+  data.update_device(false);
+
+  // perform the in-place forward FFT
+  
+  cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
+    (cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
+  if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
+  ucl_device->sync();
+  data.update_host(false);
+
+  // copy back the data to the host array
+
+  m = 0;
+  double* d_out = (double*)out;
+  for (int i = 0; i < n; i++) {
+    d_out[m] = data[i].x;
+    d_out[m+1] = data[i].y;
+    m += 2;
+  }
+
+  data.clear();
 }
 
 // ---------------------------------------------------------------------------
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index 2bff362f29..3d0b3ab1a4 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -31,6 +31,14 @@
 #include "geryon/nvd_texture.h"
 #endif
 
+#if !defined(USE_OPENCL) && !defined(USE_HIP)
+// temporary workaround for int2 also defined in cufft
+#ifdef int2
+#undef int2
+#endif
+#include "cufft.h"
+#endif
+
 namespace LAMMPS_AL {
 
 template <class numtyp, class acctyp>
@@ -142,6 +150,11 @@ class BaseAmoeba {
                 int **&ilist, int **&numj, const double cpu_time, bool &success,
                 double *charge, double *boxlo, double *prd);
 
+  virtual void precompute_umutual1(const int ago, const int inum_full, const int nall,
+                                    const int bsordermax, double **host_x,
+                                    double **host_thetai1, double **host_thetai2,
+                                    double **host_thetai3, void* grid);
+
   /// Compute multipole real-space with device neighboring
   virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
                 double **host_x, int *host_type, int *host_amtype,
@@ -196,7 +209,7 @@ class BaseAmoeba {
 
   /// compute forward/backward FFT on the device
 
-  void compute_fft1d(void** in, void** out, const int mode);
+  void compute_fft1d(void* in, void* out, const int numel, const int mode);
 
   // -------------------------- DEVICE DATA -------------------------
 
@@ -230,6 +243,10 @@ class BaseAmoeba {
   UCL_Vector<acctyp,acctyp> _tep, _fieldp;
   int _nmax, _max_tep_size, _max_fieldp_size;
 
+  int _bsordermax;
+  UCL_Vector<acctyp,acctyp> _thetai1, _thetai2, _thetai3;
+  int _max_thetai_size;
+
   // ------------------------ FORCE/ENERGY DATA -----------------------
 
   Answer<numtyp,acctyp> *ans;
@@ -282,6 +299,8 @@ class BaseAmoeba {
   virtual int umutual2b(const int eflag, const int vflag) = 0;
   virtual int polar_real(const int eflag, const int vflag) = 0;
 
+  cufftHandle plan;
+  bool cufft_plan_created;
 };
 
 }
diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp
index 9c8f728f99..4dde750c61 100644
--- a/src/AMOEBA/amoeba_convolution.cpp
+++ b/src/AMOEBA/amoeba_convolution.cpp
@@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair,
   fft1 = new FFT3d(lmp,world,nx,ny,nz,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-       1,0,&tmp,0);
+                   1,0,&tmp,0);
   //       0,0,&tmp,0);
 
   fft2 = new FFT3d(lmp,world,nx,ny,nz,
@@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
     cfft[n++] = ZEROF;
   }
 
+  double time0,time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
+  time1 = MPI_Wtime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
     for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
   }
 
+  time_fft += time1 - time0;
+
 #if DEBUG_AMOEBA
   debug_scalar(CFFT1,"PRE Convo / POST FFT");
   debug_file(CFFT1,"pre.convo.post.fft");
@@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
   debug_scalar(FFT,"PRE Convo / POST Remap");
   debug_file(FFT,"pre.convo.post.remap");
 #endif
+
+  double time0,time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
+  time1 = MPI_Wtime();
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
     for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
   }
 
+  time_fft += time1  - time0;
+
 #if DEBUG_AMOEBA
   debug_scalar(CFFT1,"PRE Convo / POST FFT");
   debug_file(CFFT1,"pre.convo.post.fft");
@@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d()
   debug_scalar(CFFT1,"POST Convo / PRE FFT");
   debug_file(CFFT1,"post.convo.pre.fft");
 #endif
+
+  double time0,time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
+  time1 = MPI_Wtime();
+
+  time_fft += time1 - time0;
 
 #if DEBUG_AMOEBA
   debug_scalar(CFFT2,"POST Convo / POST FFT");
@@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d()
   debug_scalar(CFFT1,"POST Convo / PRE FFT");
   debug_file(CFFT1,"post.convo.pre.fft");
 #endif
+
+  double time0,time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
 
+  time1 = MPI_Wtime();
+
+  time_fft += time1 - time0;
+
 #if DEBUG_AMOEBA
   debug_scalar(CFFT2,"POST Convo / POST FFT");
   debug_file(CFFT2,"post.convo.post.fft");
diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h
index 00f2b8ed91..8e7f09218a 100644
--- a/src/AMOEBA/amoeba_convolution.h
+++ b/src/AMOEBA/amoeba_convolution.h
@@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers {
   FFT_SCALAR *pre_convolution();
   void *post_convolution();
 
+  double time_fft;
+
  protected:
   int which;                   // caller name for convolution being performed
   int flag3d;                  // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp
index d5270af450..3b66ebc221 100644
--- a/src/AMOEBA/pair_amoeba.cpp
+++ b/src/AMOEBA/pair_amoeba.cpp
@@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag)
     time_direct_rspace = time_direct_kspace = 0.0;
     time_mutual_rspace = time_mutual_kspace = 0.0;
     time_polar_rspace = time_polar_kspace = 0.0;
+
+    if (ic_kspace) {
+      ic_kspace->time_fft = 0.0;
+    }
   }
 
   double time0,time1,time2,time3,time4,time5,time6,time7,time8;
@@ -542,6 +546,10 @@ void PairAmoeba::finish()
   MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
   time_polar_kspace = ave/comm->nprocs;
 
+  double time_mutual_fft = ic_kspace->time_fft;
+  MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
+  time_mutual_fft = ave/comm->nprocs;
+  
   double time_total = (time_init + time_hal + time_repulse + time_disp +
                        time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
 
@@ -570,7 +578,9 @@ void PairAmoeba::finish()
     utils::logmesg(lmp,"      Mpole  time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
     utils::logmesg(lmp,"      Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
     utils::logmesg(lmp,"      Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
+    utils::logmesg(lmp,"       - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
     utils::logmesg(lmp,"      Polar  time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
+    
   }
 }
 
diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp
index f514a50620..f9daa06e65 100644
--- a/src/GPU/amoeba_convolution_gpu.cpp
+++ b/src/GPU/amoeba_convolution_gpu.cpp
@@ -21,12 +21,13 @@ using namespace LAMMPS_NS;
 
 #define SCALE 0
 
-enum {FORWARD,BACKWARD};
+//#define USE_AMOEBA_FFT
 
+#ifdef USE_AMOEBA_FFT
 // External functions from GPU library
-
-int amoeba_setup_fft(const int size, const int element_type);
-int amoeba_compute_fft1d(void* in, void* out, const int mode);
+int amoeba_setup_fft(const int size, const int numel, const int element_type);
+int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
+#endif
 
 /* ----------------------------------------------------------------------
    partition an FFT grid across processors
@@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
 FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
 {
   int ix,iy,iz,n;
+  double time0,time1;
 
   // reverse comm for 4d brick grid + ghosts
 
@@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
   debug_file(FFT,"pre.convo.post.remap");
 #endif
 
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   // perform forward FFT
 
+  #ifdef USE_AMOEBA_FFT
+  amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
+  #else
   fft1->compute(cfft,cfft,FFT3d::FORWARD);
+  #endif
 
-  //amoeba_compute_fft1d(cfft,cfft,FORWARD);
+  time1 = MPI_Wtime();
+
+  time_fft += time1 - time0;
 
   if (SCALE) {
     double scale = 1.0/nfft_global;
@@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d()
   debug_scalar(CFFT1,"POST Convo / PRE FFT");
   debug_file(CFFT1,"post.convo.pre.fft");
 #endif
+
+  double time0,time1;
+
+  MPI_Barrier(world);
+  time0 = MPI_Wtime();
+
   fft2->compute(cfft,cfft,FFT3d::BACKWARD);
+  time1 = MPI_Wtime();
+
+  time_fft += time1 - time0;
 
 #if DEBUG_AMOEBA
   debug_scalar(CFFT2,"POST Convo / POST FFT");