From f4a90c62c070deb7b6b82c9f3f3b0c639e7a84a6 Mon Sep 17 00:00:00 2001 From: Trung Nguyen Date: Tue, 23 Aug 2022 15:42:05 -0500 Subject: [PATCH] First attempt to port the forward FFT in the k-space induce term to the GPU, not working yet --- lib/gpu/Makefile.lammps.standard | 2 +- lib/gpu/lal_amoeba_ext.cpp | 8 +-- lib/gpu/lal_base_amoeba.cpp | 84 +++++++++++++++++++++++++++++- lib/gpu/lal_base_amoeba.h | 21 +++++++- src/AMOEBA/amoeba_convolution.cpp | 38 +++++++++++++- src/AMOEBA/amoeba_convolution.h | 2 + src/AMOEBA/pair_amoeba.cpp | 10 ++++ src/GPU/amoeba_convolution_gpu.cpp | 30 +++++++++-- 8 files changed, 181 insertions(+), 14 deletions(-) diff --git a/lib/gpu/Makefile.lammps.standard b/lib/gpu/Makefile.lammps.standard index 9526e8e373..0bb3394b3e 100644 --- a/lib/gpu/Makefile.lammps.standard +++ b/lib/gpu/Makefile.lammps.standard @@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda endif gpu_SYSINC = -gpu_SYSLIB = -lcudart -lcuda +gpu_SYSLIB = -lcudart -lcuda -lcufft gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs diff --git a/lib/gpu/lal_amoeba_ext.cpp b/lib/gpu/lal_amoeba_ext.cpp index 304159e571..7d9d836b29 100644 --- a/lib/gpu/lal_amoeba_ext.cpp +++ b/lib/gpu/lal_amoeba_ext.cpp @@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double * eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr); } -void amoeba_setup_fft(const int size, const int element_type) { - AMOEBAMF.setup_fft(size, element_type); +void amoeba_setup_fft(const int numel, const int element_type) { + AMOEBAMF.setup_fft(numel, element_type); } -void amoeba_compute_fft1d(void** in, void** out, const int mode) { - AMOEBAMF.compute_fft1d(in, out, mode); +void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) { + AMOEBAMF.compute_fft1d(in, out, numel, mode); } double amoeba_gpu_bytes() { diff --git a/lib/gpu/lal_base_amoeba.cpp b/lib/gpu/lal_base_amoeba.cpp index 05a48f9588..2f3c04c7f1 100644 --- a/lib/gpu/lal_base_amoeba.cpp +++ b/lib/gpu/lal_base_amoeba.cpp @@ -15,6 +15,7 @@ ***************************************************************************/ #include "lal_base_amoeba.h" + namespace LAMMPS_AL { #define BaseAmoebaT BaseAmoeba @@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() { k_polar.clear(); k_special15.clear(); k_short_nbor.clear(); + + //if (cufft_plan_created) cufftDestroy(plan); + if (pair_program) delete pair_program; } @@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall, _max_fieldp_size = _max_tep_size; _fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _max_thetai_size = 0; + _nmax = nall; dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY); dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY); + cufft_plan_created = false; + return success; } @@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() { _tep.clear(); _fieldp.clear(); + _thetai1.clear(); + _thetai2.clear(); + _thetai3.clear(); dev_nspecial15.clear(); dev_special15.clear(); dev_special15_t.clear(); @@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall return nbor->host_jlist.begin()-host_start; } +// --------------------------------------------------------------------------- +// Prepare for umutual1: bspline_fill +// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed +// - transfer extra data from host to device +// --------------------------------------------------------------------------- + +template +void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid) { + + _bsordermax = bsordermax; + + if (_max_thetai_size == 0) { + _max_thetai_size = static_cast(static_cast(inum_full)*1.10); + _thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + _thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + } else { + if (inum_full>_max_thetai_size) { + _max_thetai_size=static_cast(static_cast(inum_full)*1.10); + _thetai1.resize(_max_thetai_size*_bsordermax*4); + _thetai2.resize(_max_thetai_size*_bsordermax*4); + _thetai3.resize(_max_thetai_size*_bsordermax*4); + } + } + +} + // --------------------------------------------------------------------------- // Reneighbor on GPU if necessary, and then compute multipole real-space // --------------------------------------------------------------------------- @@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const { // --------------------------------------------------------------------------- template -void BaseAmoebaT::setup_fft(const int size, const int element_type) +void BaseAmoebaT::setup_fft(const int numel, const int element_type) { } @@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type) // --------------------------------------------------------------------------- template -void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode) +void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode) { + if (cufft_plan_created == false) { + int m = numel/2; + cufftPlan1d(&plan, m, CUFFT_Z2Z, 1); + cufft_plan_created = true; + } + // n = number of double complex + int n = numel/2; + + // copy the host array to the device (data) + UCL_Vector data; + data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE); + int m = 0; + double* d_in = (double*)in; + for (int i = 0; i < n; i++) { + data[i].x = d_in[m]; + data[i].y = d_in[m+1]; + m += 2; + } + data.update_device(false); + + // perform the in-place forward FFT + + cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device, + (cufftDoubleComplex*)&data.device, CUFFT_FORWARD); + if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result); + ucl_device->sync(); + data.update_host(false); + + // copy back the data to the host array + + m = 0; + double* d_out = (double*)out; + for (int i = 0; i < n; i++) { + d_out[m] = data[i].x; + d_out[m+1] = data[i].y; + m += 2; + } + + data.clear(); } // --------------------------------------------------------------------------- diff --git a/lib/gpu/lal_base_amoeba.h b/lib/gpu/lal_base_amoeba.h index 2bff362f29..3d0b3ab1a4 100644 --- a/lib/gpu/lal_base_amoeba.h +++ b/lib/gpu/lal_base_amoeba.h @@ -31,6 +31,14 @@ #include "geryon/nvd_texture.h" #endif +#if !defined(USE_OPENCL) && !defined(USE_HIP) +// temporary workaround for int2 also defined in cufft +#ifdef int2 +#undef int2 +#endif +#include "cufft.h" +#endif + namespace LAMMPS_AL { template @@ -142,6 +150,11 @@ class BaseAmoeba { int **&ilist, int **&numj, const double cpu_time, bool &success, double *charge, double *boxlo, double *prd); + virtual void precompute_umutual1(const int ago, const int inum_full, const int nall, + const int bsordermax, double **host_x, + double **host_thetai1, double **host_thetai2, + double **host_thetai3, void* grid); + /// Compute multipole real-space with device neighboring virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, @@ -196,7 +209,7 @@ class BaseAmoeba { /// compute forward/backward FFT on the device - void compute_fft1d(void** in, void** out, const int mode); + void compute_fft1d(void* in, void* out, const int numel, const int mode); // -------------------------- DEVICE DATA ------------------------- @@ -230,6 +243,10 @@ class BaseAmoeba { UCL_Vector _tep, _fieldp; int _nmax, _max_tep_size, _max_fieldp_size; + int _bsordermax; + UCL_Vector _thetai1, _thetai2, _thetai3; + int _max_thetai_size; + // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -282,6 +299,8 @@ class BaseAmoeba { virtual int umutual2b(const int eflag, const int vflag) = 0; virtual int polar_real(const int eflag, const int vflag) = 0; + cufftHandle plan; + bool cufft_plan_created; }; } diff --git a/src/AMOEBA/amoeba_convolution.cpp b/src/AMOEBA/amoeba_convolution.cpp index 9c8f728f99..4dde750c61 100644 --- a/src/AMOEBA/amoeba_convolution.cpp +++ b/src/AMOEBA/amoeba_convolution.cpp @@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair, fft1 = new FFT3d(lmp,world,nx,ny,nz, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,&tmp,0); + 1,0,&tmp,0); // 0,0,&tmp,0); fft2 = new FFT3d(lmp,world,nx,ny,nz, @@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d() cfft[n++] = ZEROF; } + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d() debug_scalar(FFT,"PRE Convo / POST Remap"); debug_file(FFT,"pre.convo.post.remap"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT fft1->compute(cfft,cfft,FFT3d::FORWARD); + time1 = MPI_Wtime(); if (SCALE) { double scale = 1.0/nfft_global; for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale; } + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT1,"PRE Convo / POST FFT"); debug_file(CFFT1,"pre.convo.post.fft"); @@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); @@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; + #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT"); debug_file(CFFT2,"post.convo.post.fft"); diff --git a/src/AMOEBA/amoeba_convolution.h b/src/AMOEBA/amoeba_convolution.h index 00f2b8ed91..8e7f09218a 100644 --- a/src/AMOEBA/amoeba_convolution.h +++ b/src/AMOEBA/amoeba_convolution.h @@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers { FFT_SCALAR *pre_convolution(); void *post_convolution(); + double time_fft; + protected: int which; // caller name for convolution being performed int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick diff --git a/src/AMOEBA/pair_amoeba.cpp b/src/AMOEBA/pair_amoeba.cpp index d5270af450..3b66ebc221 100644 --- a/src/AMOEBA/pair_amoeba.cpp +++ b/src/AMOEBA/pair_amoeba.cpp @@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag) time_direct_rspace = time_direct_kspace = 0.0; time_mutual_rspace = time_mutual_kspace = 0.0; time_polar_rspace = time_polar_kspace = 0.0; + + if (ic_kspace) { + ic_kspace->time_fft = 0.0; + } } double time0,time1,time2,time3,time4,time5,time6,time7,time8; @@ -542,6 +546,10 @@ void PairAmoeba::finish() MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world); time_polar_kspace = ave/comm->nprocs; + double time_mutual_fft = ic_kspace->time_fft; + MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world); + time_mutual_fft = ave/comm->nprocs; + double time_total = (time_init + time_hal + time_repulse + time_disp + time_mpole + time_induce + time_polar + time_qxfer) / 100.0; @@ -570,7 +578,9 @@ void PairAmoeba::finish() utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total); utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total); utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total); + utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total); utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total); + } } diff --git a/src/GPU/amoeba_convolution_gpu.cpp b/src/GPU/amoeba_convolution_gpu.cpp index f514a50620..f9daa06e65 100644 --- a/src/GPU/amoeba_convolution_gpu.cpp +++ b/src/GPU/amoeba_convolution_gpu.cpp @@ -21,12 +21,13 @@ using namespace LAMMPS_NS; #define SCALE 0 -enum {FORWARD,BACKWARD}; +//#define USE_AMOEBA_FFT +#ifdef USE_AMOEBA_FFT // External functions from GPU library - -int amoeba_setup_fft(const int size, const int element_type); -int amoeba_compute_fft1d(void* in, void* out, const int mode); +int amoeba_setup_fft(const int size, const int numel, const int element_type); +int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode); +#endif /* ---------------------------------------------------------------------- partition an FFT grid across processors @@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() { int ix,iy,iz,n; + double time0,time1; // reverse comm for 4d brick grid + ghosts @@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d() debug_file(FFT,"pre.convo.post.remap"); #endif + MPI_Barrier(world); + time0 = MPI_Wtime(); + // perform forward FFT + #ifdef USE_AMOEBA_FFT + amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD); + #else fft1->compute(cfft,cfft,FFT3d::FORWARD); + #endif - //amoeba_compute_fft1d(cfft,cfft,FORWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; if (SCALE) { double scale = 1.0/nfft_global; @@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d() debug_scalar(CFFT1,"POST Convo / PRE FFT"); debug_file(CFFT1,"post.convo.pre.fft"); #endif + + double time0,time1; + + MPI_Barrier(world); + time0 = MPI_Wtime(); + fft2->compute(cfft,cfft,FFT3d::BACKWARD); + time1 = MPI_Wtime(); + + time_fft += time1 - time0; #if DEBUG_AMOEBA debug_scalar(CFFT2,"POST Convo / POST FFT");