First attempt to port the forward FFT in the k-space induce term to the GPU, not working yet
This commit is contained in:
@ -6,6 +6,6 @@ CUDA_HOME=/usr/local/cuda
|
||||
endif
|
||||
|
||||
gpu_SYSINC =
|
||||
gpu_SYSLIB = -lcudart -lcuda
|
||||
gpu_SYSLIB = -lcudart -lcuda -lcufft
|
||||
gpu_SYSPATH = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs
|
||||
|
||||
|
||||
@ -162,12 +162,12 @@ void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double *
|
||||
eflag_in, vflag_in, eatom, vatom, aewald, felec, off2, tep_ptr);
|
||||
}
|
||||
|
||||
void amoeba_setup_fft(const int size, const int element_type) {
|
||||
AMOEBAMF.setup_fft(size, element_type);
|
||||
void amoeba_setup_fft(const int numel, const int element_type) {
|
||||
AMOEBAMF.setup_fft(numel, element_type);
|
||||
}
|
||||
|
||||
void amoeba_compute_fft1d(void** in, void** out, const int mode) {
|
||||
AMOEBAMF.compute_fft1d(in, out, mode);
|
||||
void amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode) {
|
||||
AMOEBAMF.compute_fft1d(in, out, numel, mode);
|
||||
}
|
||||
|
||||
double amoeba_gpu_bytes() {
|
||||
|
||||
@ -15,6 +15,7 @@
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_base_amoeba.h"
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define BaseAmoebaT BaseAmoeba<numtyp, acctyp>
|
||||
|
||||
@ -39,6 +40,9 @@ BaseAmoebaT::~BaseAmoeba() {
|
||||
k_polar.clear();
|
||||
k_special15.clear();
|
||||
k_short_nbor.clear();
|
||||
|
||||
//if (cufft_plan_created) cufftDestroy(plan);
|
||||
|
||||
if (pair_program) delete pair_program;
|
||||
}
|
||||
|
||||
@ -137,11 +141,15 @@ int BaseAmoebaT::init_atomic(const int nlocal, const int nall,
|
||||
_max_fieldp_size = _max_tep_size;
|
||||
_fieldp.alloc(_max_fieldp_size*8,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
|
||||
_max_thetai_size = 0;
|
||||
|
||||
_nmax = nall;
|
||||
dev_nspecial15.alloc(nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15.alloc(_maxspecial15*nall,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dev_special15_t.alloc(nall*_maxspecial15,*(this->ucl_device),UCL_READ_ONLY);
|
||||
|
||||
cufft_plan_created = false;
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
@ -169,6 +177,9 @@ void BaseAmoebaT::clear_atomic() {
|
||||
|
||||
_tep.clear();
|
||||
_fieldp.clear();
|
||||
_thetai1.clear();
|
||||
_thetai2.clear();
|
||||
_thetai3.clear();
|
||||
dev_nspecial15.clear();
|
||||
dev_special15.clear();
|
||||
dev_special15_t.clear();
|
||||
@ -422,6 +433,36 @@ int** BaseAmoebaT::precompute(const int ago, const int inum_full, const int nall
|
||||
return nbor->host_jlist.begin()-host_start;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Prepare for umutual1: bspline_fill
|
||||
// - reallocate per-atom arrays, thetai1, thetai2, thetai3, if needed
|
||||
// - transfer extra data from host to device
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::precompute_umutual1(const int ago, const int inum_full, const int nall,
|
||||
const int bsordermax, double **host_x,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, void* grid) {
|
||||
|
||||
_bsordermax = bsordermax;
|
||||
|
||||
if (_max_thetai_size == 0) {
|
||||
_max_thetai_size = static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.alloc(_max_thetai_size*_bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_thetai2.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
_thetai3.alloc(_max_thetai_size*bsordermax*4,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE);
|
||||
} else {
|
||||
if (inum_full>_max_thetai_size) {
|
||||
_max_thetai_size=static_cast<int>(static_cast<double>(inum_full)*1.10);
|
||||
_thetai1.resize(_max_thetai_size*_bsordermax*4);
|
||||
_thetai2.resize(_max_thetai_size*_bsordermax*4);
|
||||
_thetai3.resize(_max_thetai_size*_bsordermax*4);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Reneighbor on GPU if necessary, and then compute multipole real-space
|
||||
// ---------------------------------------------------------------------------
|
||||
@ -583,7 +624,7 @@ double BaseAmoebaT::host_memory_usage_atomic() const {
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::setup_fft(const int size, const int element_type)
|
||||
void BaseAmoebaT::setup_fft(const int numel, const int element_type)
|
||||
{
|
||||
|
||||
}
|
||||
@ -593,9 +634,48 @@ void BaseAmoebaT::setup_fft(const int size, const int element_type)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAmoebaT::compute_fft1d(void** in, void** out, const int mode)
|
||||
void BaseAmoebaT::compute_fft1d(void* in, void* out, const int numel, const int mode)
|
||||
{
|
||||
if (cufft_plan_created == false) {
|
||||
int m = numel/2;
|
||||
cufftPlan1d(&plan, m, CUFFT_Z2Z, 1);
|
||||
cufft_plan_created = true;
|
||||
}
|
||||
|
||||
// n = number of double complex
|
||||
int n = numel/2;
|
||||
|
||||
// copy the host array to the device (data)
|
||||
UCL_Vector<cufftDoubleComplex,cufftDoubleComplex> data;
|
||||
data.alloc(n, *(this->ucl_device), UCL_READ_WRITE, UCL_READ_WRITE);
|
||||
int m = 0;
|
||||
double* d_in = (double*)in;
|
||||
for (int i = 0; i < n; i++) {
|
||||
data[i].x = d_in[m];
|
||||
data[i].y = d_in[m+1];
|
||||
m += 2;
|
||||
}
|
||||
data.update_device(false);
|
||||
|
||||
// perform the in-place forward FFT
|
||||
|
||||
cufftResult result = cufftExecZ2Z(plan, (cufftDoubleComplex*)&data.device,
|
||||
(cufftDoubleComplex*)&data.device, CUFFT_FORWARD);
|
||||
if (result != CUFFT_SUCCESS) printf("failed cufft %d\n", result);
|
||||
ucl_device->sync();
|
||||
data.update_host(false);
|
||||
|
||||
// copy back the data to the host array
|
||||
|
||||
m = 0;
|
||||
double* d_out = (double*)out;
|
||||
for (int i = 0; i < n; i++) {
|
||||
d_out[m] = data[i].x;
|
||||
d_out[m+1] = data[i].y;
|
||||
m += 2;
|
||||
}
|
||||
|
||||
data.clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@ -31,6 +31,14 @@
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
#if !defined(USE_OPENCL) && !defined(USE_HIP)
|
||||
// temporary workaround for int2 also defined in cufft
|
||||
#ifdef int2
|
||||
#undef int2
|
||||
#endif
|
||||
#include "cufft.h"
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -142,6 +150,11 @@ class BaseAmoeba {
|
||||
int **&ilist, int **&numj, const double cpu_time, bool &success,
|
||||
double *charge, double *boxlo, double *prd);
|
||||
|
||||
virtual void precompute_umutual1(const int ago, const int inum_full, const int nall,
|
||||
const int bsordermax, double **host_x,
|
||||
double **host_thetai1, double **host_thetai2,
|
||||
double **host_thetai3, void* grid);
|
||||
|
||||
/// Compute multipole real-space with device neighboring
|
||||
virtual int** compute_multipole_real(const int ago, const int inum_full, const int nall,
|
||||
double **host_x, int *host_type, int *host_amtype,
|
||||
@ -196,7 +209,7 @@ class BaseAmoeba {
|
||||
|
||||
/// compute forward/backward FFT on the device
|
||||
|
||||
void compute_fft1d(void** in, void** out, const int mode);
|
||||
void compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
@ -230,6 +243,10 @@ class BaseAmoeba {
|
||||
UCL_Vector<acctyp,acctyp> _tep, _fieldp;
|
||||
int _nmax, _max_tep_size, _max_fieldp_size;
|
||||
|
||||
int _bsordermax;
|
||||
UCL_Vector<acctyp,acctyp> _thetai1, _thetai2, _thetai3;
|
||||
int _max_thetai_size;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
Answer<numtyp,acctyp> *ans;
|
||||
@ -282,6 +299,8 @@ class BaseAmoeba {
|
||||
virtual int umutual2b(const int eflag, const int vflag) = 0;
|
||||
virtual int polar_real(const int eflag, const int vflag) = 0;
|
||||
|
||||
cufftHandle plan;
|
||||
bool cufft_plan_created;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -203,7 +203,7 @@ AmoebaConvolution::AmoebaConvolution(LAMMPS *lmp, Pair *pair,
|
||||
fft1 = new FFT3d(lmp,world,nx,ny,nz,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
1,0,&tmp,0);
|
||||
1,0,&tmp,0);
|
||||
// 0,0,&tmp,0);
|
||||
|
||||
fft2 = new FFT3d(lmp,world,nx,ny,nz,
|
||||
@ -358,15 +358,23 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_3d()
|
||||
cfft[n++] = ZEROF;
|
||||
}
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -414,15 +422,24 @@ FFT_SCALAR *AmoebaConvolution::pre_convolution_4d()
|
||||
debug_scalar(FFT,"PRE Convo / POST Remap");
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
for (int i = 0; i < 2*nfft_owned; i++) cfft[i] *= scale;
|
||||
}
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT1,"PRE Convo / POST FFT");
|
||||
debug_file(CFFT1,"pre.convo.post.fft");
|
||||
@ -455,7 +472,16 @@ void *AmoebaConvolution::post_convolution_3d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
@ -497,8 +523,18 @@ void *AmoebaConvolution::post_convolution_4d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
debug_file(CFFT2,"post.convo.post.fft");
|
||||
|
||||
@ -47,6 +47,8 @@ class AmoebaConvolution : protected Pointers {
|
||||
FFT_SCALAR *pre_convolution();
|
||||
void *post_convolution();
|
||||
|
||||
double time_fft;
|
||||
|
||||
protected:
|
||||
int which; // caller name for convolution being performed
|
||||
int flag3d; // 1 if using 3d grid_brick, 0 for 4d cgrid_brick
|
||||
|
||||
@ -347,6 +347,10 @@ void PairAmoeba::compute(int eflag, int vflag)
|
||||
time_direct_rspace = time_direct_kspace = 0.0;
|
||||
time_mutual_rspace = time_mutual_kspace = 0.0;
|
||||
time_polar_rspace = time_polar_kspace = 0.0;
|
||||
|
||||
if (ic_kspace) {
|
||||
ic_kspace->time_fft = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
double time0,time1,time2,time3,time4,time5,time6,time7,time8;
|
||||
@ -542,6 +546,10 @@ void PairAmoeba::finish()
|
||||
MPI_Allreduce(&time_polar_kspace,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_polar_kspace = ave/comm->nprocs;
|
||||
|
||||
double time_mutual_fft = ic_kspace->time_fft;
|
||||
MPI_Allreduce(&time_mutual_fft,&ave,1,MPI_DOUBLE,MPI_SUM,world);
|
||||
time_mutual_fft = ave/comm->nprocs;
|
||||
|
||||
double time_total = (time_init + time_hal + time_repulse + time_disp +
|
||||
time_mpole + time_induce + time_polar + time_qxfer) / 100.0;
|
||||
|
||||
@ -570,7 +578,9 @@ void PairAmoeba::finish()
|
||||
utils::logmesg(lmp," Mpole time: {:.6g} {:.3g}%\n", time_mpole_kspace, time_mpole_kspace/time_total);
|
||||
utils::logmesg(lmp," Direct time: {:.6g} {:.3g}%\n", time_direct_kspace, time_direct_kspace/time_total);
|
||||
utils::logmesg(lmp," Mutual time: {:.6g} {:.3g}%\n", time_mutual_kspace, time_mutual_kspace/time_total);
|
||||
utils::logmesg(lmp," - FFT time: {:.6g} {:.3g}%\n", time_mutual_fft, time_mutual_fft/time_total);
|
||||
utils::logmesg(lmp," Polar time: {:.6g} {:.3g}%\n", time_polar_kspace, time_polar_kspace/time_total);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -21,12 +21,13 @@ using namespace LAMMPS_NS;
|
||||
|
||||
#define SCALE 0
|
||||
|
||||
enum {FORWARD,BACKWARD};
|
||||
//#define USE_AMOEBA_FFT
|
||||
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
// External functions from GPU library
|
||||
|
||||
int amoeba_setup_fft(const int size, const int element_type);
|
||||
int amoeba_compute_fft1d(void* in, void* out, const int mode);
|
||||
int amoeba_setup_fft(const int size, const int numel, const int element_type);
|
||||
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
|
||||
#endif
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
partition an FFT grid across processors
|
||||
@ -52,6 +53,7 @@ AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair,
|
||||
FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
{
|
||||
int ix,iy,iz,n;
|
||||
double time0,time1;
|
||||
|
||||
// reverse comm for 4d brick grid + ghosts
|
||||
|
||||
@ -87,11 +89,20 @@ FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
|
||||
debug_file(FFT,"pre.convo.post.remap");
|
||||
#endif
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
// perform forward FFT
|
||||
|
||||
#ifdef USE_AMOEBA_FFT
|
||||
amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
|
||||
#else
|
||||
fft1->compute(cfft,cfft,FFT3d::FORWARD);
|
||||
#endif
|
||||
|
||||
//amoeba_compute_fft1d(cfft,cfft,FORWARD);
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
if (SCALE) {
|
||||
double scale = 1.0/nfft_global;
|
||||
@ -119,7 +130,16 @@ void *AmoebaConvolutionGPU::post_convolution_4d()
|
||||
debug_scalar(CFFT1,"POST Convo / PRE FFT");
|
||||
debug_file(CFFT1,"post.convo.pre.fft");
|
||||
#endif
|
||||
|
||||
double time0,time1;
|
||||
|
||||
MPI_Barrier(world);
|
||||
time0 = MPI_Wtime();
|
||||
|
||||
fft2->compute(cfft,cfft,FFT3d::BACKWARD);
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
time_fft += time1 - time0;
|
||||
|
||||
#if DEBUG_AMOEBA
|
||||
debug_scalar(CFFT2,"POST Convo / POST FFT");
|
||||
|
||||
Reference in New Issue
Block a user