@@ -24,6 +24,7 @@
 #include "pppm_gpu.h"
 #include "atom.h"
 #include "comm.h"
+#include "commgrid.h"
 #include "neighbor.h"
 #include "force.h"
 #include "pair.h"
@@ -48,6 +49,9 @@ using namespace MathConst;
 #define LARGE 10000.0
 #define EPS_HOC 1.0e-7

+enum{REVERSE_RHO};
+enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
+
 #ifdef FFT_SINGLE
 #define ZEROF 0.0f
 #define ONEF 1.0f
@@ -56,7 +60,7 @@ using namespace MathConst;
 #define ONEF 1.0
 #endif

-// External functions from cuda library for atom decomposition
+// external functions from cuda library for atom decomposition

 #ifdef FFT_SINGLE
 #define PPPM_GPU_API(api) pppm_gpu_ ## api ## _f
@@ -111,10 +115,35 @@ PPPMGPU::~PPPMGPU()

 void PPPMGPU::init()
 {
+  // PPPM init manages all arrays except density_brick_gpu and vd_brick
+  //      thru its deallocate(), allocate()
+  // NOTE: could free density_brick and vdxyz_brick after PPPM allocates them,
+  //       before allocating db_gpu and vd_brick down below, if don't need,
+  //       if do this, make sure to set them to NULL
+  // NOTE: delete/realloc of cg necessary b/c packing 4 values per grid pt,
+  //       not 3 as PPPM does - probably a better way to account for this
+  //       in PPPM::init()
+
+  destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
+  destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
+  density_brick_gpu = vd_brick = NULL;
+
   PPPM::init();

+  if (differentiation_flag == 0) {
+    delete cg;
+    int (*procneigh)[2] = comm->procneigh;
+    cg = new CommGrid(lmp,world,4,1,
+                      nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                      nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                      procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                      procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+  }
+
+  // unsupported option
+
   if (differentiation_flag == 1)
-    error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu.");
+    error->all(FLERR,"Cannot (yet) do analytic differentiation with pppm/gpu");

   if (strcmp(update->integrate_style,"verlet/split") == 0) {
     kspace_split=true;
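A note on the `4,1` arguments passed to CommGrid above: they reflect that vd_brick keeps four consecutive FFT_SCALAR slots per grid point (the patch's own comment, "packing 4 values per grid pt, not 3 as PPPM does"), which is why x indices into it are scaled by 4 elsewhere in this diff. The sketch below only illustrates that addressing; the extent, the component naming, and the use of plain float instead of FFT_SCALAR are assumptions made for the example, not taken from CommGrid or the GPU library.

```cpp
// Layout sketch for a 4-wide interleaved brick such as vd_brick.
// Sizes, component names, and float in place of FFT_SCALAR are assumptions.
#include <cstdio>

int main() {
  const int nx = 4;                  // hypothetical number of grid points in x
  float row[4 * nx];                 // 4 slots per grid point

  for (int ix = 0; ix < nx; ix++) {
    float *cell = &row[4 * ix];      // same x-scaling as the vd_brick indexing
    cell[0] = 1.0f;                  // presumably the x field component
    cell[1] = 2.0f;                  // presumably the y field component
    cell[2] = 3.0f;                  // presumably the z field component
    cell[3] = 0.0f;                  // fourth slot; the comm code never sends it
  }

  printf("second component at ix=2: %g\n", row[4 * 2 + 1]);   // prints 2
  return 0;
}
```

The same `offset = 4*list[i]` arithmetic reappears in the new pack_forward()/unpack_forward() for the FORWARD_IK case later in this patch.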
@@ -142,6 +171,8 @@ void PPPMGPU::init()

   GPU_EXTRA::check_flag(success,error,world);

+  // allocate density_brick_gpu and vd_brick
+
   density_brick_gpu =
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                      nxlo_out,nxhi_out,"pppm:density_brick_gpu",h_brick,1);
@@ -149,7 +180,7 @@ void PPPMGPU::init()
     create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out,
                      nxlo_out,nxhi_out,"pppm:vd_brick",data,4);

-  poisson_time=0;
+  poisson_time = 0.0;
 }

 /* ----------------------------------------------------------------------
@@ -164,10 +195,8 @@ void PPPMGPU::compute(int eflag, int vflag)
     if (atom->nlocal > old_nlocal) {
       nago=0;
       old_nlocal = atom->nlocal;
-    } else
-      nago=1;
-  } else
-    nago=neighbor->ago;
+    } else nago = 1;
+  } else nago = neighbor->ago;

   // set energy/virial flags
   // invoke allocate_peratom() if needed for first time
@@ -217,6 +246,7 @@ void PPPMGPU::compute(int eflag, int vflag)
   // to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition

+  cg->reverse_comm(this,REVERSE_RHO);
   brick2fft();

   // compute potential gradient on my FFT grid and
@@ -225,10 +255,20 @@ void PPPMGPU::compute(int eflag, int vflag)

     poisson();

-    // all procs communicate E-field values to fill ghost cells
-    // surrounding their 3d bricks
+    // all procs communicate E-field values
+    // to fill ghost cells surrounding their 3d bricks

-    fillbrick();
+    if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
+    else cg->forward_comm(this,FORWARD_IK);
+
+    // extra per-atom energy/virial communication
+
+    if (evflag_atom) {
+      if (differentiation_flag == 1 && vflag_atom)
+        cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
+      else if (differentiation_flag == 0)
+        cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
+    }

     poisson_time += MPI_Wtime()-t3;
@@ -237,11 +277,10 @@ void PPPMGPU::compute(int eflag, int vflag)
   FFT_SCALAR qscale = force->qqrd2e * scale;
   PPPM_GPU_API(interp)(qscale);

-  // Compute per-atom energy/virial on host if requested
+  // per-atom energy/virial
+  // energy includes self-energy correction

   if (evflag_atom) {
-    fillbrick_peratom();
     fieldforce_peratom();
-
     double *q = atom->q;
     int nlocal = atom->nlocal;
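With these changes compute() no longer hand-rolls the MPI ghost swaps (the brick2fft()/fillbrick() bodies removed in the hunks below); it asks cg, the CommGrid built in init(), to run the exchange and call back into the pack_*/unpack_* methods added at the end of this patch. The snippet below mimics that callback round trip on a single process so the data flow is visible; the Packer struct, the index lists, and float in place of FFT_SCALAR are stand-ins for this sketch, not the real CommGrid interface.

```cpp
// Schematic of the owner<->ghost callback round trip that cg->reverse_comm()
// drives in this patch; only the pack/unpack bodies mirror the new
// pack_reverse()/unpack_reverse(), everything else is invented for the demo.
#include <cstdio>
#include <vector>

struct Packer {
  std::vector<float> density;                 // stand-in for density_brick_gpu

  void pack_reverse(float *buf, int nlist, const int *list) {
    for (int i = 0; i < nlist; i++) buf[i] = density[list[i]];   // ghost values out
  }
  void unpack_reverse(const float *buf, int nlist, const int *list) {
    for (int i = 0; i < nlist; i++) density[list[i]] += buf[i];  // summed into owned cells
  }
};

int main() {
  Packer p;
  p.density = {1.0f, 2.0f, 3.0f, 4.0f};
  int ghosts[2] = {2, 3};   // indices a neighbor contributed to
  int owned[2]  = {0, 1};   // where those contributions belong locally
  float buf[2];

  p.pack_reverse(buf, 2, ghosts);     // what a REVERSE_RHO exchange would send
  p.unpack_reverse(buf, 2, owned);    // ...and how the receiver accumulates it
  printf("density[0] = %g\n", p.density[0]);  // 1 + 3 = 4
  return 0;
}
```

The forward direction (FORWARD_IK and friends) follows the same pattern but assigns into ghost cells instead of accumulating into owned ones.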
@@ -293,246 +332,13 @@ void PPPMGPU::compute(int eflag, int vflag)
 }

 /* ----------------------------------------------------------------------
-   allocate memory that depends on # of K-vectors and order
-------------------------------------------------------------------------- */
-
-void PPPMGPU::allocate()
-{
-  memory->create(density_fft,nfft_both,"pppm:density_fft");
-  memory->create(greensfn,nfft_both,"pppm:greensfn");
-  memory->create(work1,2*nfft_both,"pppm:work1");
-  memory->create(work2,2*nfft_both,"pppm:work2");
-  memory->create(vg,nfft_both,6,"pppm:vg");
-
-  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
-  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
-  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
-
-  memory->create(buf1,nbuf,"pppm:buf1");
-  memory->create(buf2,nbuf,"pppm:buf2");
-
-  // summation coeffs
-
-  memory->create(gf_b,order,"pppm:gf_b");
-  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
-  memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d");
-  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
-  memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
-                          "pppm:drho_coeff");
-
-  // create 2 FFTs and a Remap
-  // 1st FFT keeps data in FFT decompostion
-  // 2nd FFT returns data in 3d brick decomposition
-  // remap takes data from 3d brick to FFT decomposition
-
-  int tmp;
-
-  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp);
-
-  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp);
-
-  remap = new Remap(lmp,world,
-                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION);
-}
-
-/* ----------------------------------------------------------------------
-   deallocate memory that depends on # of K-vectors and order
-------------------------------------------------------------------------- */
-
-void PPPMGPU::deallocate()
-{
-  destroy_3d_offset(density_brick_gpu,nzlo_out,nylo_out);
-  destroy_3d_offset(vd_brick,nzlo_out,nylo_out);
-
-  memory->destroy(density_fft);
-  memory->destroy(greensfn);
-  memory->destroy(work1);
-  memory->destroy(work2);
-  memory->destroy(vg);
-
-  memory->destroy1d_offset(fkx,nxlo_fft);
-  memory->destroy1d_offset(fky,nylo_fft);
-  memory->destroy1d_offset(fkz,nzlo_fft);
-
-  memory->destroy(buf1);
-  memory->destroy(buf2);
-
-  memory->destroy(gf_b);
-  memory->destroy2d_offset(rho1d,-order/2);
-  memory->destroy2d_offset(drho1d,-order/2);
-  memory->destroy2d_offset(rho_coeff,(1-order)/2);
-  memory->destroy2d_offset(drho_coeff,(1-order)/2);
-
-  delete fft1;
-  delete fft2;
-  delete remap;
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to accumulate full density in brick decomposition
    remap density from 3d brick decomposition to FFT decomposition
 ------------------------------------------------------------------------- */

 void PPPMGPU::brick2fft()
 {
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-  MPI_Status status;
-
-  // pack my ghosts for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_out; ix < nxlo_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick_gpu[iz][iy][ix];
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick_gpu[iz][iy][ix] += buf2[n++];
-
-  // remap from 3d brick decomposition to FFT decomposition
+  int n,ix,iy,iz;
+
   // copy grabs inner portion of density from 3d brick
   // remap could be done as pre-stage of FFT,
   // but this works optimally on only double values, not complex values
@@ -546,216 +352,6 @@ void PPPMGPU::brick2fft()
   remap->perform(density_fft,density_fft,work1);
 }

-/* ----------------------------------------------------------------------
-   Same as base class - needed to call GPU version of fillbrick_.
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fillbrick()
-{
-  if (differentiation_flag == 1) fillbrick_ad();
-  else fillbrick_ik();
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to fill ghost cells of my brick with field values
-------------------------------------------------------------------------- */
-
-void PPPMGPU::fillbrick_ik()
-{
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-  MPI_Status status;
-
-  // pack my real cells for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  int x_lo = nxlo_in * 4;
-  int x_hi = nxhi_in * 4 + 1;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  x_lo = (nxhi_in-nxhi_ghost+1) * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  x_lo = nxlo_out * 4;
-  x_hi = nxlo_in * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-
-  // pack my real cells for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  x_lo = x_hi;
-  x_hi = (nxlo_in+nxlo_ghost) * 4;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        buf1[n++] = vd_brick[iz][iy][ix];
-        buf1[n++] = vd_brick[iz][iy][ix+1];
-        buf1[n++] = vd_brick[iz][iy][ix+2];
-      }
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,&status);
-  }
-
-  n = 0;
-  x_lo = (nxhi_in + 1) * 4;
-  x_hi = nxhi_out * 4 + 1;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = x_lo; ix < x_hi; ix+=4) {
-        vd_brick[iz][iy][ix] = buf2[n++];
-        vd_brick[iz][iy][ix+1] = buf2[n++];
-        vd_brick[iz][iy][ix+2] = buf2[n++];
-      }
-}
-
 /* ----------------------------------------------------------------------
    Same code as base class - necessary to call GPU version of poisson_ik
 ------------------------------------------------------------------------- */
@@ -892,7 +488,149 @@ void PPPMGPU::poisson_ik()
 }

 /* ----------------------------------------------------------------------
-   Create array using offsets from pinned memory allocation
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMGPU::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_IK) {
+    int offset;
+    FFT_SCALAR *src = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      offset = 4*list[i];
+      buf[n++] = src[offset++];
+      buf[n++] = src[offset++];
+      buf[n++] = src[offset];
+    }
+  } else if (flag == FORWARD_AD) {
+    FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) buf[n++] = esrc[list[i]];
+      if (vflag_atom) {
+        buf[n++] = v0src[list[i]];
+        buf[n++] = v1src[list[i]];
+        buf[n++] = v2src[list[i]];
+        buf[n++] = v3src[list[i]];
+        buf[n++] = v4src[list[i]];
+        buf[n++] = v5src[list[i]];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = v0src[list[i]];
+      buf[n++] = v1src[list[i]];
+      buf[n++] = v2src[list[i]];
+      buf[n++] = v3src[list[i]];
+      buf[n++] = v4src[list[i]];
+      buf[n++] = v5src[list[i]];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_IK) {
+    int offset;
+    FFT_SCALAR *dest = &vd_brick[nzlo_out][nylo_out][4*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      offset = 4*list[i];
+      dest[offset++] = buf[n++];
+      dest[offset++] = buf[n++];
+      dest[offset] = buf[n++];
+    }
+  } else if (flag == FORWARD_AD) {
+    FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] = buf[n++];
+  } else if (flag == FORWARD_IK_PERATOM) {
+    FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      if (eflag_atom) esrc[list[i]] = buf[n++];
+      if (vflag_atom) {
+        v0src[list[i]] = buf[n++];
+        v1src[list[i]] = buf[n++];
+        v2src[list[i]] = buf[n++];
+        v3src[list[i]] = buf[n++];
+        v4src[list[i]] = buf[n++];
+        v5src[list[i]] = buf[n++];
+      }
+    }
+  } else if (flag == FORWARD_AD_PERATOM) {
+    FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      v0src[list[i]] = buf[n++];
+      v1src[list[i]] = buf[n++];
+      v2src[list[i]] = buf[n++];
+      v3src[list[i]] = buf[n++];
+      v4src[list[i]] = buf[n++];
+      v5src[list[i]] = buf[n++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMGPU::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *src = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      buf[i] = src[list[i]];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+void PPPMGPU::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  if (flag == REVERSE_RHO) {
+    FFT_SCALAR *dest = &density_brick_gpu[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++)
+      dest[list[i]] += buf[i];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create array using offsets from pinned memory allocation
 ------------------------------------------------------------------------- */

 FFT_SCALAR ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi,
@@ -942,19 +680,11 @@ void PPPMGPU::destroy_3d_offset(FFT_SCALAR ***array, int n1_offset,

 double PPPMGPU::memory_usage()
 {
-  double bytes = nmax*3 * sizeof(double);
-  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
-    (nzhi_out-nzlo_out+1);
-  bytes += 4 * nbrick * sizeof(FFT_SCALAR);
-  bytes += 6 * nfft_both * sizeof(double);
-  bytes += nfft_both * sizeof(double);
-  bytes += nfft_both*5 * sizeof(FFT_SCALAR);
-  bytes += 2 * nbuf * sizeof(double);
-
-  if (peratom_allocate_flag) {
-    bytes += 7 * nbrick * sizeof(FFT_SCALAR);
-    bytes += 2 * nbuf_peratom * sizeof(FFT_SCALAR);
-  }
+  double bytes = PPPM::memory_usage();
+
+  // NOTE: add tallying here for density_brick_gpu and vd_brick
+  //       could subtract out density_brick and vdxyz_brick if freed them above
+  //       if the net effect is zero, do nothing

   return bytes + PPPM_GPU_API(bytes)();
 }
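For the forward (owned-to-ghost) direction, the pack_forward() and unpack_forward() added above are exact mirrors in the FORWARD_IK case: three of the four slots per grid point travel, and the receiver overwrites its ghost slots rather than accumulating into them, in contrast to the reverse path where unpack_reverse() adds into owned cells. A self-contained round trip of that logic, with invented sizes and index lists and double standing in for FFT_SCALAR:

```cpp
// Round-trip sketch of the FORWARD_IK pack/unpack added in this patch.
// Array sizes, the send/receive index lists, and the double type are
// assumptions for the demo; only the loop shapes mirror the real code.
#include <cstdio>

int main() {
  const int npoints = 6;
  double brick[4 * npoints] = {0.0};

  // pretend grid points 0-2 are owned and already hold field values
  for (int p = 0; p < 3; p++)
    for (int c = 0; c < 3; c++) brick[4 * p + c] = 10.0 * p + c;

  int send_list[3] = {0, 1, 2};   // owned points to send
  int recv_list[3] = {3, 4, 5};   // ghost points that receive them
  double buf[9];

  int n = 0;                      // pack: same loop shape as pack_forward()
  for (int i = 0; i < 3; i++) {
    int offset = 4 * send_list[i];
    buf[n++] = brick[offset++];
    buf[n++] = brick[offset++];
    buf[n++] = brick[offset];
  }

  n = 0;                          // unpack: same loop shape as unpack_forward()
  for (int i = 0; i < 3; i++) {
    int offset = 4 * recv_list[i];
    brick[offset++] = buf[n++];
    brick[offset++] = buf[n++];
    brick[offset] = buf[n++];
  }

  printf("ghost point 4, component 2: %g\n", brick[4 * 4 + 2]);  // prints 12
  return 0;
}
```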