First attempt to port the forward FFT in the k-space induce term to the GPU, not working yet

This commit is contained in:
Trung Nguyen
2022-08-23 15:42:05 -05:00
parent 921796a15f
commit f4a90c62c0
8 changed files with 181 additions and 14 deletions

View File

@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
endif
gpu_SYSINC =
gpu_SYSLIB = -lcudart -lcuda
gpu_SYSLIB = -lcudart -lcuda -lcufft
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs

View File

@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
}
void amoeba_setup_fft(const int size, const int element_type) {
AMOEBAMF.setup_fft(size, element_type);
void amoeba_setup_fft(const int numel, const int element_type) {
AMOEBAMF.setup_fft(numel, element_type);
}
void amoeba_compute_fft1d(void** in, void** out, const int mode) {
AMOEBAMF.compute_fft1d(in, out, mode);
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
AMOEBAMF.compute_fft1d(in, out, numel, mode);
}
double amoeba_gpu_bytes() {

View File

@ -15,6 +15,7 @@
***************************************************************************/
#include "lal_base_amoeba.h"
namespace LAMMPS_AL {
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() {
k_polar.clear();
k_special15.clear();
k_short_nbor.clear();
//if (cufft_plan_created) cufftDestroy(plan);
if (pair_program) delete pair_program;
}
@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
_max_fieldp_size = _max_tep_size;
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_max_thetai_size = 0;
_nmax = nall;
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
cufft_plan_created = false;
return success;
}
@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() {
_tep.clear();
_fieldp.clear();
_thetai1.clear();
_thetai2.clear();
_thetai3.clear();
dev_nspecial15.clear();
dev_special15.clear();
dev_special15_t.clear();
@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
return nbor->host_jlist.begin()-host_start;
}
// ---------------------------------------------------------------------------
// Prepare for umutual1: bspline_fill
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed
// - transfer extra data from host to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall,
const int bsordermax, double **host_x,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, void* grid) {
_bsordermax = bsordermax;
if (_max_thetai_size == 0) {
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
_thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
} else {
if (inum_full>_max_thetai_size) {
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
_thetai1.resize(_max_thetai_size*_bsordermax*4);
_thetai2.resize(_max_thetai_size*_bsordermax*4);
_thetai3.resize(_max_thetai_size*_bsordermax*4);
}
}
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary, and then compute multipole real-space
// ---------------------------------------------------------------------------
@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::setup_fft(const int size, const int element_type)
void BaseAmoebaT::setup_fft(const int numel, const int element_type)
{
}
@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type)
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode)
void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode)
{
if (cufft_plan_created == false) {
int m = numel/2;
cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
cufft_plan_created = true;
}
// n = number of double complex
int n = numel/2;
// copy the host array to the device (data)
UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
int m = 0;
double* d_in = (double*)in;
for (int i = 0; i < n; i++) {
data[i].x = d_in[m];
data[i].y = d_in[m+1];
m += 2;
}
data.update_device(false);
// perform the in-place forward FFT
cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
(cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
ucl_device->sync();
data.update_host(false);
// copy back the data to the host array
m = 0;
double* d_out = (double*)out;
for (int i = 0; i < n; i++) {
d_out[m] = data[i].x;
d_out[m+1] = data[i].y;
m += 2;
}
data.clear();
}
// ---------------------------------------------------------------------------

View File

@ -31,6 +31,14 @@
#include "geryon/nvd_texture.h"
#endif
#if !defined(USE_OPENCL) && !defined(USE_HIP)
// temporary workaround for int2 also defined in cufft
#ifdef int2
#undef int2
#endif
#include "cufft.h"
#endif
namespace LAMMPS_AL {
template <class numtyp, class acctyp>
@ -142,6 +150,11 @@ class BaseAmoeba {
int **&ilist, int **&numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
virtual void precompute_umutual1(const int ago, const int inum_full, const int nall,
const int bsordermax, double **host_x,
double **host_thetai1, double **host_thetai2,
double **host_thetai3, void* grid);
/// Compute multipole real-space with device neighboring
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *host_amtype,
@ -196,7 +209,7 @@ class BaseAmoeba {
/// compute forward/backward FFT on the device
void compute_fft1d(void** in, void** out, const int mode);
void compute_fft1d(void* in, void* out, const int numel, const int mode);
// -------------------------- DEVICE DATA -------------------------
@ -230,6 +243,10 @@ class BaseAmoeba {
UCL_Vector<acctyp,acctyp> _tep, _fieldp;
int _nmax, _max_tep_size, _max_fieldp_size;
int _bsordermax;
UCL_Vector<acctyp,acctyp> _thetai1, _thetai2, _thetai3;
int _max_thetai_size;
// ------------------------ FORCE/ENERGY DATA -----------------------
Answer<numtyp,acctyp> *ans;
@ -282,6 +299,8 @@ class BaseAmoeba {
virtual int umutual2b(const int eflag, const int vflag) = 0;
virtual int polar_real(const int eflag, const int vflag) = 0;
cufftHandle plan;
bool cufft_plan_created;
};
}

View File

@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair,
fft1 = new FFT3d(lmp,world,nx,ny,nz,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,&tmp,0);
1,0,&tmp,0);
// 0,0,&tmp,0);
fft2 = new FFT3d(lmp,world,nx,ny,nz,
@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
cfft[n++] = ZEROF;
}
double time0,time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = MPI_Wtime();
if (SCALE) {
double scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
debug_scalar(FFT,"PRE Convo / POST Remap");
debug_file(FFT,"pre.convo.post.remap");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
// perform forward FFT
fft1->compute(cfft,cfft,FFT3d::FORWARD);
time1 = MPI_Wtime();
if (SCALE) {
double scale = 1.0/nfft_global;
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
}
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT1,"PRE Convo / POST FFT");
debug_file(CFFT1,"pre.convo.post.fft");
@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = MPI_Wtime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = MPI_Wtime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");
debug_file(CFFT2,"post.convo.post.fft");

View File

@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers {
FFT_SCALAR *pre_convolution();
void *post_convolution();
double time_fft;
protected:
int which; // caller name for convolution being performed
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick

View File

@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag)
time_direct_rspace = time_direct_kspace = 0.0;
time_mutual_rspace = time_mutual_kspace = 0.0;
time_polar_rspace = time_polar_kspace = 0.0;
if (ic_kspace) {
ic_kspace->time_fft = 0.0;
}
}
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
@ -542,6 +546,10 @@ void PairAmoeba::finish()
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_polar_kspace = ave/comm->nprocs;
double time_mutual_fft = ic_kspace->time_fft;
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
time_mutual_fft = ave/comm->nprocs;
double time_total = (time_init + time_hal + time_repulse + time_disp +
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
@ -570,7 +578,9 @@ void PairAmoeba::finish()
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
}
}

View File

@ -21,12 +21,13 @@ using namespace LAMMPS_NS;
#define SCALE 0
enum {FORWARD,BACKWARD};
//#define USE_AMOEBA_FFT
#ifdef USE_AMOEBA_FFT
// External functions from GPU library
int amoeba_setup_fft(const int size, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int mode);
int amoeba_setup_fft(const int size, const int numel, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
#endif
/* ----------------------------------------------------------------------
partition an FFT grid across processors
@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
{
int ix,iy,iz,n;
double time0,time1;
// reverse comm for 4d brick grid + ghosts
@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
debug_file(FFT,"pre.convo.post.remap");
#endif
MPI_Barrier(world);
time0 = MPI_Wtime();
// perform forward FFT
#ifdef USE_AMOEBA_FFT
amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
#else
fft1->compute(cfft,cfft,FFT3d::FORWARD);
#endif
//amoeba_compute_fft1d(cfft,cfft,FORWARD);
time1 = MPI_Wtime();
time_fft += time1 - time0;
if (SCALE) {
double scale = 1.0/nfft_global;
@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d()
debug_scalar(CFFT1,"POST Convo / PRE FFT");
debug_file(CFFT1,"post.convo.pre.fft");
#endif
double time0,time1;
MPI_Barrier(world);
time0 = MPI_Wtime();
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
time1 = MPI_Wtime();
time_fft += time1 - time0;
#if DEBUG_AMOEBA
debug_scalar(CFFT2,"POST Convo / POST FFT");