From 46b8b00a4faf716c1bad0139a37461138c572094 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndtrung@uchicago.edu>
Date: Mon, 15 Aug 2022 15:51:43 -0500
Subject: [PATCH] Working on fft on the device

---
 lib/gpu/lal_amoeba_ext.cpp         |  4 ++++
 lib/gpu/lal_base_amoeba.cpp        | 22 ++++++++++++++++++++++
 lib/gpu/lal_base_amoeba.h          |  5 ++++-
 src/GPU/amoeba_convolution_gpu.cpp |  5 ++++-
 4 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp
index 63ed683833..be183b284d 100644
--- a/lib/gpu/lal_amoeba_ext.cpp
+++ b/lib/gpu/lal_amoeba_ext.cpp
@@ -162,6 +162,10 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
                               eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
 }
 
+void amoeba_compute_fft1d(void** in, void** out, const int mode) {
+  AMOEBAMF.compute_fft1d(in, out, mode);  // forwards in/out/mode unchanged to AMOEBAMF
+}
+
 double amoeba_gpu_bytes() {
   return AMOEBAMF.host_memory_usage();
 }
diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp
index 3b67ee31a1..b0d6ecee68 100644
--- a/lib/gpu/lal_base_amoeba.cpp
+++ b/lib/gpu/lal_base_amoeba.cpp
@@ -568,12 +568,30 @@ void BaseAmoebaT::compute_polar_real(int *host_amtype, int *host_amgroup,
   _tep.update_host(_max_tep_size*4,false);
 }
 
+// ---------------------------------------------------------------------------
+// Return the memory bytes allocated on the host
+// ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 double BaseAmoebaT::host_memory_usage_atomic() const {
   return device->atom.host_memory_usage()+nbor->host_memory_usage()+
          4*sizeof(numtyp)+sizeof(BaseAmoeba<numtyp,acctyp>);
 }
 
+// ---------------------------------------------------------------------------
+// Compute FFT
+// ---------------------------------------------------------------------------
+
+template <class numtyp, class acctyp>
+void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode)
+{
+  // Placeholder: no work is done yet; in, out and mode are currently unused.
+}
+
+// ---------------------------------------------------------------------------
+// Copy the extra data from host to device
+// ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
                                   double** uind, double** uinp, double* pval) {
@@ -645,6 +663,10 @@ void BaseAmoebaT::cast_extra_data(int* amtype, int* amgroup, double** rpole,
   }
 }
 
+// ---------------------------------------------------------------------------
+// Compile (load) the kernel strings and set the kernels
+// ---------------------------------------------------------------------------
+
 template <class numtyp, class acctyp>
 void BaseAmoebaT::compile_kernels(UCL_Device &dev, const void *pair_str,
                                   const char *kname_multipole,
diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h
index f439e2945f..cf767be96e 100644
--- a/lib/gpu/lal_base_amoeba.h
+++ b/lib/gpu/lal_base_amoeba.h
@@ -189,7 +189,10 @@ class BaseAmoeba {
      // _fieldp store both arrays, one after another
     _fieldp.update_host(_max_fieldp_size*8,false);
   }
-  
+
+  /// Compute forward/backward FFT on the device (the definition is currently an empty stub)
+  void compute_fft1d(void** in, void** out, const int mode);
+
   // -------------------------- DEVICE DATA -------------------------
 
   /// Device Properties and Atom and Neighbor storage
diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp
index 976a115fe1..ad52df3d4b 100644
--- a/src/GPU/amoeba_convolution_gpu.cpp
+++ b/src/GPU/amoeba_convolution_gpu.cpp
@@ -23,7 +23,8 @@ using namespace LAMMPS_NS;
 
 // External functions from GPU library
 
-//int amoeba_compute_fft1d(FFT_SCALAR* in, FFT_SCALAR* out, const int size, const int flag);
+int amoeba_setup_fft(const int size);
+void amoeba_compute_fft1d(void** in, void** out, const int mode);  // matches the definition in lib/gpu/lal_amoeba_ext.cpp
 
 /* ----------------------------------------------------------------------
    partition an FFT grid across processors
@@ -39,6 +40,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
   AmoebaConvolution(lmp, pair, nx_caller, ny_caller,  nz_caller, order_caller,
                     which_caller)
 {
+
 }
 
 /* ----------------------------------------------------------------------
@@ -81,6 +83,7 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
   debug_scalar(FFT,"PRE Convo / POST Remap");
   debug_file(FFT,"pre.convo.post.remap");
 #endif
+
   // perform forward FFT
 
   fft1->compute(cfft,cfft,FFT3d::FORWARD);