Files
lammps/src/GPU/amoeba_convolution_gpu.cpp

182 lines
5.1 KiB
C++

/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/ Sandia National Laboratories
LAMMPS Development team: developers@lammps.org
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "amoeba_convolution_gpu.h"
#include "comm.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "grid3d.h"
using namespace LAMMPS_NS;
// DEBUG
// compile-time switch for the debug_scalar()/debug_file() calls in this file;
// with the value 0 those calls and the debug-only declarations below are compiled out
#define DEBUG_AMOEBA 0
#if DEBUG_AMOEBA
// debug-only table of grid label strings
// NOTE(review): the array is declared with 7 entries but only 6 initializers
// are listed, so the last entry is a null pointer - confirm this is intended
char *labels[7] =
{(char *) "MPOLE_GRID", (char *) "POLAR_GRID",
(char *) "POLAR_GRIDC", (char *) "DISP_GRID",
(char *) "INDUCE_GRID", (char *) "INDUCE_GRIDC"};
// debug-only tags passed as first argument to debug_scalar()/debug_file()
enum{GRIDBRICK_OUT,GRIDBRICK_IN,FFT,CFFT1,CFFT2};
#endif
// END DEBUG
// when nonzero, pre_convolution_4d() multiplies the forward FFT output
// by 1.0/nfft_global; with the value 0 that branch is never taken
#define SCALE 0
// uncomment the next line to make pre_convolution_4d() perform its forward
// FFT via amoeba_compute_fft1d() instead of fft1->compute()
//#define USE_AMOEBA_FFT
#ifdef USE_AMOEBA_FFT
// External functions from GPU library
int amoeba_setup_fft(const int size, const int numel, const int element_type);
int amoeba_compute_fft1d(void* in, void* out, const int numel, const int mode);
#endif
/* ----------------------------------------------------------------------
partition an FFT grid across processors
both for a brick and FFT x pencil decomposition
nx,ny,nz = global FFT grid size
order = size of stencil in each dimension that maps atoms to grid
adapted from PPPM::set_grid_local()
------------------------------------------------------------------------- */
AmoebaConvolutionGPU::AmoebaConvolutionGPU(LAMMPS *lmp, Pair *pair, int nx_caller, int ny_caller,
                                           int nz_caller, int order_caller, int which_caller) :
    AmoebaConvolution(lmp, pair, nx_caller, ny_caller, nz_caller, order_caller, which_caller)
{
  // nothing to do here: every argument is forwarded unchanged
  // to the AmoebaConvolution base class constructor
}
/* ----------------------------------------------------------------------
   pre-convolution stage for the 4d cgrid_brick array:
   reverse comm on the brick grid, pack owned values into cfft,
   remap cfft, then run the forward FFT in place
   time spent in the FFT call is accumulated in time_fft
   returns cfft
------------------------------------------------------------------------- */

FFT_SCALAR *AmoebaConvolutionGPU::pre_convolution_4d()
{
  // reverse comm for 4d brick grid + ghosts

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_OUT,"PRE Convo / PRE Grid3d");
#endif

  gc->reverse_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"PRE Convo / POST Grid3d");
  debug_file(GRIDBRICK_IN,"pre.convo.post.grid3d");
#endif

  // pack owned 4d brick grid values into the FFT grid
  // two consecutive cfft entries are written per grid point

  int idx = 0;
  for (int kz = nzlo_in; kz <= nzhi_in; ++kz) {
    for (int ky = nylo_in; ky <= nyhi_in; ++ky) {
      for (int kx = nxlo_in; kx <= nxhi_in; ++kx) {
        cfft[idx] = cgrid_brick[kz][ky][kx][0];
        cfft[idx+1] = cgrid_brick[kz][ky][kx][1];
        idx += 2;
      }
    }
  }

  // remap FFT grid from brick to x pencil partitioning
  // NOTE: could just setup FFT to start from brick decomp and skip remap

  remap->perform(cfft,cfft,remap_buf);

#if DEBUG_AMOEBA
  debug_scalar(FFT,"PRE Convo / POST Remap");
  debug_file(FFT,"pre.convo.post.remap");
#endif

  // forward FFT, timed after a barrier across all ranks in world

  MPI_Barrier(world);
  const double fft_start = platform::walltime();

#ifdef USE_AMOEBA_FFT
  amoeba_compute_fft1d(cfft,cfft,2*nfft_owned,FFT3d::FORWARD);
#else
  fft1->compute(cfft,cfft,FFT3d::FORWARD);
#endif

  time_fft += platform::walltime() - fft_start;

  // optional normalization, only active when SCALE is nonzero

  if (SCALE) {
    const double inv_nglobal = 1.0/nfft_global;
    const auto nvalues = 2*nfft_owned;
    for (int i = 0; i < nvalues; i++) cfft[i] *= inv_nglobal;
  }

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"PRE Convo / POST FFT");
  debug_file(CFFT1,"pre.convo.post.fft");
#endif

  return cfft;
}
/* ----------------------------------------------------------------------
   post-convolution stage for the 4d cgrid_brick array:
   run the backward FFT in place on cfft, unpack cfft into the owned
   cells of cgrid_brick, then forward comm to populate ghost grid values
   time spent in the FFT call is accumulated in time_fft
   returns cgrid_brick as a void pointer
------------------------------------------------------------------------- */

void *AmoebaConvolutionGPU::post_convolution_4d()
{
  // backward FFT, timed after a barrier across all ranks in world

#if DEBUG_AMOEBA
  debug_scalar(CFFT1,"POST Convo / PRE FFT");
  debug_file(CFFT1,"post.convo.pre.fft");
#endif

  MPI_Barrier(world);
  const double fft_start = platform::walltime();
  fft2->compute(cfft,cfft,FFT3d::BACKWARD);
  time_fft += platform::walltime() - fft_start;

#if DEBUG_AMOEBA
  debug_scalar(CFFT2,"POST Convo / POST FFT");
  debug_file(CFFT2,"post.convo.post.fft");
#endif

  // unpack 1d complex values into the 4d complex grid
  // two consecutive cfft entries are read per grid point

  int idx = 0;
  for (int kz = nzlo_in; kz <= nzhi_in; ++kz) {
    for (int ky = nylo_in; ky <= nyhi_in; ++ky) {
      for (int kx = nxlo_in; kx <= nxhi_in; ++kx) {
        cgrid_brick[kz][ky][kx][0] = cfft[idx];
        cgrid_brick[kz][ky][kx][1] = cfft[idx+1];
        idx += 2;
      }
    }
  }

  // forward comm to populate ghost grid values

#if DEBUG_AMOEBA
  debug_scalar(GRIDBRICK_IN,"POST Convo / PRE grid3d");
  debug_file(GRIDBRICK_IN,"post.convo.pre.grid3d");
#endif

  gc->forward_comm(Grid3d::PAIR,amoeba,which,2,sizeof(FFT_SCALAR),
                   gc_buf1,gc_buf2,MPI_FFT_SCALAR);

  return static_cast<void *>(cgrid_brick);
}