Merge pull request #4143 from hagertnl/hagerty_issue4140_kokkos_collective

Implement kspace_modify collective yes in KOKKOS package
This commit is contained in:
Axel Kohlmeyer
2024-09-11 00:07:10 -04:00
committed by GitHub
4 changed files with 617 additions and 256 deletions

View File

@ -794,7 +794,7 @@ void PPPMKokkos<DeviceType>::allocate()
// 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition
int collective_flag = 0; // not yet supported in Kokkos version
int collective_flag = force->kspace->collective_flag;
int gpu_aware_flag = lmp->kokkos->gpu_aware_flag;
int tmp;

View File

@ -103,12 +103,10 @@ template<class DeviceType>
void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, typename FFT_AT::t_FFT_SCALAR_1d d_buf,
struct remap_plan_3d_kokkos<DeviceType> *plan)
{
// collective flag not yet supported
// use point-to-point communication
int i,isend,irecv;
typename FFT_AT::t_FFT_SCALAR_1d d_scratch;
int me;
MPI_Comm_rank(plan->comm,&me);
if (plan->memory == 0)
d_scratch = d_buf;
@ -116,70 +114,132 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
d_scratch = plan->d_scratch;
// post all recvs into scratch space
// If not using GPU-aware MPI, mirror data to host
FFT_SCALAR* v_scratch = d_scratch.data();
if (!plan->usegpu_aware) {
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
v_scratch = plan->h_scratch.data();
}
for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]);
}
FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
if (!plan->usegpu_aware) {
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
v_scratch = plan->h_scratch.data();
v_sendbuf = plan->h_sendbuf.data();
}
// send all messages to other procs
// use point-to-point communication
for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]);
if (!plan->usecollective) {
int i,isend,irecv;
if (!plan->usegpu_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm);
}
for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]);
}
// copy in -> scratch -> out for self data
// send all messages to other procs
if (plan->self) {
isend = plan->nsend;
irecv = plan->nrecv;
for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]);
int in_offset = plan->send_offset[isend];
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
if (!plan->usegpu_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
plan->pack(d_in,in_offset,
d_scratch,scratch_offset,
&plan->packplan[isend]);
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm);
}
// unpack all messages from scratch -> out
// copy in -> scratch -> out for self data
for (i = 0; i < plan->nrecv; i++) {
MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
if (plan->self) {
isend = plan->nsend;
irecv = plan->nrecv;
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
int in_offset = plan->send_offset[isend];
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
if (!plan->usegpu_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);
plan->pack(d_in,in_offset,
d_scratch,scratch_offset,
&plan->packplan[isend]);
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
// unpack all messages from scratch -> out
for (i = 0; i < plan->nrecv; i++) {
MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
if (!plan->usegpu_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
} else {
if (plan->commringlen > 0) {
int isend,irecv;
// populate send data
// buffers are allocated and count/displacement buffers
// are populated in remap_3d_create_plan_kokkos
int numpacked = 0;
for (isend = 0; isend < plan->commringlen; isend++) {
if (plan->sendcnts[isend]) {
plan->pack(d_in,plan->send_offset[numpacked],
plan->d_sendbuf,plan->sdispls[isend],
&plan->packplan[numpacked]);
numpacked++;
}
else if (plan->commringlist[isend] == me && plan->self) {
numpacked++;
}
}
if (!plan->usegpu_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
MPI_Alltoallv(v_sendbuf, plan->sendcnts, plan->sdispls,
MPI_FFT_SCALAR, v_scratch, plan->rcvcnts,
plan->rdispls, MPI_FFT_SCALAR, plan->comm);
// unpack the data from the recv buffer into out
if (!plan->usegpu_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);
// copy in -> scratch -> out for self data
if (plan->self) {
plan->pack(d_in,plan->send_offset[plan->selfnsendloc],
plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
&plan->packplan[plan->selfnsendloc]);
plan->unpack(plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
d_out,plan->recv_offset[plan->selfnrecvloc],
&plan->unpackplan[plan->selfnrecvloc]);
}
numpacked = 0;
for (irecv = 0; irecv < plan->commringlen; irecv++) {
if (plan->rcvcnts[irecv]) {
plan->unpack(d_scratch,plan->rdispls[irecv],
d_out,plan->recv_offset[numpacked],
&plan->unpackplan[numpacked]);
numpacked++;
}
else if (plan->commringlist[irecv] == me && plan->self) {
numpacked++;
}
}
}
}
}
@ -263,224 +323,488 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
out.khi = out_khi;
out.ksize = out.khi - out.klo + 1;
// combine output extents across all procs
inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
if (inarray == nullptr) return nullptr;
outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
if (outarray == nullptr) return nullptr;
MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
// count send collides, including self
nsend = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
}
// malloc space for send info
if (nsend) {
plan->pack = PackKokkos<DeviceType>::pack_3d;
plan->send_offset = (int *) malloc(nsend*sizeof(int));
plan->send_size = (int *) malloc(nsend*sizeof(int));
plan->send_proc = (int *) malloc(nsend*sizeof(int));
plan->packplan = (struct pack_plan_3d *)
malloc(nsend*sizeof(struct pack_plan_3d));
if (plan->send_offset == nullptr || plan->send_size == nullptr ||
plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
}
// store send info, with self as last entry
nsend = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
plan->send_proc[nsend] = iproc;
plan->send_offset[nsend] = nqty *
((overlap.klo-in.klo)*in.jsize*in.isize +
((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
plan->packplan[nsend].nfast = nqty*overlap.isize;
plan->packplan[nsend].nmid = overlap.jsize;
plan->packplan[nsend].nslow = overlap.ksize;
plan->packplan[nsend].nstride_line = nqty*in.isize;
plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
plan->packplan[nsend].nqty = nqty;
plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
nsend++;
}
}
// plan->nsend = # of sends not including self
if (nsend && plan->send_proc[nsend-1] == me) {
if (plan->usecollective) // for collectives include self in nsend list
plan->nsend = nsend;
else
plan->nsend = nsend - 1;
} else
plan->nsend = nsend;
// combine input extents across all procs
// combine input & output extents across all procs
MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
// count recv collides, including self
// for efficiency, handle collective & non-collective setup separately
nrecv = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
}
if (!plan->usecollective) {
// count send & recv collides, including self
// malloc space for recv info
if (nrecv) {
if (permute == 0)
plan->unpack = PackKokkos<DeviceType>::unpack_3d;
else if (permute == 1) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
}
else if (permute == 2) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
nsend = 0;
nrecv = 0;
for (i = 0; i < nprocs; i++) {
nsend += remap_3d_collide(&in,&outarray[i],&overlap);
nrecv += remap_3d_collide(&out,&inarray[i],&overlap);
}
plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
plan->recv_size = (int *) malloc(nrecv*sizeof(int));
plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
plan->unpackplan = (struct pack_plan_3d *)
malloc(nrecv*sizeof(struct pack_plan_3d));
// malloc space for send & recv info
if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
}
if (nsend) {
plan->pack = PackKokkos<DeviceType>::pack_3d;
// store recv info, with self as last entry
plan->send_offset = (int *) malloc(nsend*sizeof(int));
plan->send_size = (int *) malloc(nsend*sizeof(int));
plan->send_proc = (int *) malloc(nsend*sizeof(int));
plan->packplan = (struct pack_plan_3d *)
malloc(nsend*sizeof(struct pack_plan_3d));
ibuf = 0;
nrecv = 0;
iproc = me;
if (plan->send_offset == nullptr || plan->send_size == nullptr ||
plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
}
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
plan->recv_proc[nrecv] = iproc;
plan->recv_bufloc[nrecv] = ibuf;
if (permute == 0) {
plan->recv_offset[nrecv] = nqty *
((overlap.klo-out.klo)*out.jsize*out.isize +
(overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
plan->unpackplan[nrecv].nqty = nqty;
}
if (nrecv) {
if (permute == 0)
plan->unpack = PackKokkos<DeviceType>::unpack_3d;
else if (permute == 1) {
plan->recv_offset[nrecv] = nqty *
((overlap.ilo-out.ilo)*out.ksize*out.jsize +
(overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
plan->unpackplan[nrecv].nqty = nqty;
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
}
else {
plan->recv_offset[nrecv] = nqty *
((overlap.jlo-out.jlo)*out.isize*out.ksize +
(overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
plan->unpackplan[nrecv].nqty = nqty;
else if (permute == 2) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
}
plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
ibuf += plan->recv_size[nrecv];
nrecv++;
plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
plan->recv_size = (int *) malloc(nrecv*sizeof(int));
plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
plan->unpackplan = (struct pack_plan_3d *)
malloc(nrecv*sizeof(struct pack_plan_3d));
if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
}
// store send info, with self as last entry
nsend = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
plan->send_proc[nsend] = iproc;
plan->send_offset[nsend] = nqty *
((overlap.klo-in.klo)*in.jsize*in.isize +
((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
plan->packplan[nsend].nfast = nqty*overlap.isize;
plan->packplan[nsend].nmid = overlap.jsize;
plan->packplan[nsend].nslow = overlap.ksize;
plan->packplan[nsend].nstride_line = nqty*in.isize;
plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
plan->packplan[nsend].nqty = nqty;
plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
nsend++;
}
}
// plan->nsend = # of sends not including self
if (nsend && plan->send_proc[nsend-1] == me) plan->nsend = nsend - 1;
else plan->nsend = nsend;
// store recv info, with self as last entry
ibuf = 0;
nrecv = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
plan->recv_proc[nrecv] = iproc;
plan->recv_bufloc[nrecv] = ibuf;
if (permute == 0) {
plan->recv_offset[nrecv] = nqty *
((overlap.klo-out.klo)*out.jsize*out.isize +
(overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
plan->unpackplan[nrecv].nqty = nqty;
}
else if (permute == 1) {
plan->recv_offset[nrecv] = nqty *
((overlap.ilo-out.ilo)*out.ksize*out.jsize +
(overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
plan->unpackplan[nrecv].nqty = nqty;
}
else {
plan->recv_offset[nrecv] = nqty *
((overlap.jlo-out.jlo)*out.isize*out.ksize +
(overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
plan->unpackplan[nrecv].nqty = nqty;
}
plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
ibuf += plan->recv_size[nrecv];
nrecv++;
}
}
// plan->nrecv = # of recvs not including self
if (nrecv && plan->recv_proc[nrecv-1] == me) plan->nrecv = nrecv - 1;
else plan->nrecv = nrecv;
// init remaining fields in remap plan
plan->memory = memory;
if (nrecv == plan->nrecv) plan->self = 0;
else plan->self = 1;
// the plan->d_sendbuf and plan->d_recvbuf are used by both the
// collective & non-collective implementations.
// For non-collective, the buffer size is MAX(send_size) for any one send
// find biggest send message (not including self) and malloc space for it
size = 0;
for (nsend = 0; nsend < plan->nsend; nsend++)
size = MAX(size,plan->send_size[nsend]);
if (size) {
plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
if (!plan->d_sendbuf.data()) return nullptr;
}
// if requested, allocate internal scratch space for recvs,
// only need it if I will receive any data (including self)
if (memory == 1) {
if (nrecv > 0) {
plan->d_scratch =
typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
if (!plan->d_scratch.data()) return nullptr;
}
}
// Non-collectives do not use MPI Communicator Groups
MPI_Comm_dup(comm,&plan->comm);
} else {
// Improved approach - use an AllReduce to aggregate which ranks need to be included
// To do this, we build the local proc's send/receive list, then do an AllReduce
// to create the send/recv count for the Alltoallv
// local arrays to be used in the allreduce
// start with max length -- nprocs. Unused entries will be removed later
int *local_cnts = (int*) malloc(2*nprocs*sizeof(int));
if (local_cnts == nullptr) return nullptr;
int *local_sendcnts = local_cnts;
int *local_recvcnts = (local_cnts + nprocs);
// local arrays used to store the results of the allreduce
int *global_cnts = (int*) malloc(2*nprocs*sizeof(int));
if (global_cnts == nullptr) return nullptr;
int *global_sendcnts = global_cnts;
int *global_recvcnts = (global_cnts + nprocs);
// count send & recv collides, including self
nsend = 0;
nrecv = 0;
for (i = 0; i < nprocs; i++) {
local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap);
local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap);
nsend += local_sendcnts[i];
nrecv += local_recvcnts[i];
}
// perform an AllReduce to get the counts from all other processors and build sendcnts list
MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm);
// now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv
// also builds commringlist -- which is already sorted
int *commringlist = (int*) malloc(nprocs * sizeof(int));
int commringlen = 0;
for (i = 0; i < nprocs; i++) {
if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) {
commringlist[commringlen] = i;
commringlen++;
}
}
// resize commringlist to final size
commringlist = (int *) realloc(commringlist, commringlen*sizeof(int));
// set the plan->commringlist
plan->commringlen = commringlen;
plan->commringlist = commringlist;
// clean up local buffers that are finished
local_sendcnts = nullptr;
local_recvcnts = nullptr;
global_recvcnts = nullptr;
global_sendcnts = nullptr;
free(local_cnts);
free(global_cnts);
// malloc space for send & recv info
// if the current proc is involved in any way in the communication, allocate space
// because of the Alltoallv, both send and recv have to be initialized even if
// only one of those is performed
if (nsend || nrecv) {
// send space
plan->selfcommringloc = -1;
plan->selfnsendloc = -1;
plan->selfnrecvloc = -1;
plan->nsend = nsend;
plan->pack = PackKokkos<DeviceType>::pack_3d;
plan->send_offset = (int *) malloc(nsend*sizeof(int));
plan->send_size = (int *) malloc(plan->commringlen*sizeof(int));
plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int));
plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int));
// only used when sendcnt > 0
plan->packplan = (struct pack_plan_3d *)
malloc(nsend*sizeof(struct pack_plan_3d));
if (plan->send_offset == nullptr || plan->send_size == nullptr ||
plan->sendcnts == nullptr || plan->sdispls == nullptr ||
plan->packplan == nullptr) return nullptr;
// recv space
plan->nrecv = nrecv;
if (permute == 0)
plan->unpack = PackKokkos<DeviceType>::unpack_3d;
else if (permute == 1) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
}
else if (permute == 2) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
}
plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
plan->recv_size = (int *) malloc(plan->commringlen*sizeof(int));
plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int));
plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int));
// only used when recvcnt > 0
plan->unpackplan = (struct pack_plan_3d *)
malloc(nrecv*sizeof(struct pack_plan_3d));
if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
plan->rcvcnts == nullptr || plan->rdispls == nullptr ||
plan->unpackplan == nullptr) return nullptr;
}
// store send info, with self as last entry
nsend = 0;
ibuf = 0;
int total_send_size = 0;
for (i = 0; i < plan->commringlen; i++) {
iproc = plan->commringlist[i];
if (iproc == me) {
plan->selfcommringloc = i;
plan->selfnsendloc = nsend;
}
if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
//plan->send_proc[nsend] = i;
// number of entries required for this pack's 3-d coords
plan->send_offset[nsend] = nqty *
((overlap.klo-in.klo)*in.jsize*in.isize +
((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
plan->packplan[nsend].nfast = nqty*overlap.isize;
plan->packplan[nsend].nmid = overlap.jsize;
plan->packplan[nsend].nslow = overlap.ksize;
plan->packplan[nsend].nstride_line = nqty*in.isize;
plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
plan->packplan[nsend].nqty = nqty;
// total amount of overlap
plan->send_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
plan->sendcnts[i] = plan->send_size[i];
plan->sdispls[i] = ibuf;
ibuf += plan->send_size[i];
nsend++;
} else {
plan->send_size[i] = 0;
plan->sdispls[i] = ibuf;
plan->sendcnts[i] = 0;
}
total_send_size += plan->send_size[i];
}
if (total_send_size) {
plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",total_send_size);
if (!plan->d_sendbuf.data()) return nullptr;
}
// store recv info, with self as last entry
ibuf = 0;
nrecv = 0;
for (i = 0; i < plan->commringlen; i++) {
iproc = plan->commringlist[i];
if (iproc == me) {
plan->selfnrecvloc = nrecv;
}
if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
if (permute == 0) {
plan->recv_offset[nrecv] = nqty *
((overlap.klo-out.klo)*out.jsize*out.isize +
(overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
plan->unpackplan[nrecv].nqty = nqty;
}
else if (permute == 1) {
plan->recv_offset[nrecv] = nqty *
((overlap.ilo-out.ilo)*out.ksize*out.jsize +
(overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
plan->unpackplan[nrecv].nqty = nqty;
}
else {
plan->recv_offset[nrecv] = nqty *
((overlap.jlo-out.jlo)*out.isize*out.ksize +
(overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
plan->unpackplan[nrecv].nqty = nqty;
}
plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
plan->rcvcnts[i] = plan->recv_size[i];
plan->rdispls[i] = ibuf;
ibuf += plan->recv_size[i];
nrecv++;
} else {
plan->recv_size[i] = 0;
plan->rcvcnts[i] = 0;
plan->rdispls[i] = ibuf;
}
}
// init remaining fields in remap plan
plan->memory = memory;
if (plan->sendcnts[plan->selfcommringloc]) {
plan->self = 1;
plan->sendcnts[plan->selfcommringloc] = 0;
plan->rcvcnts[plan->selfcommringloc] = 0;
}
else {
plan->self = 0;
}
// if requested, allocate internal scratch space for recvs,
// only need it if I will receive any data (including self)
if (memory == 1) {
if (nrecv > 0) {
plan->d_scratch =
typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
if (!plan->d_scratch.data()) return nullptr;
}
}
// if using collective and the commringlist is NOT empty create a
// communicator for the plan based off an MPI_Group created with
// ranks from the commringlist
if (plan->commringlen > 0) {
MPI_Group orig_group, new_group;
MPI_Comm_group(comm, &orig_group);
MPI_Group_incl(orig_group, plan->commringlen,
plan->commringlist, &new_group);
MPI_Comm_create(comm, new_group, &plan->comm);
}
// if using collective and the comm ring list is empty create
// a communicator for the plan with an empty group
else
MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm);
}
// plan->nrecv = # of recvs not including self
// for collectives include self in the nsend list
if (nrecv && plan->recv_proc[nrecv-1] == me) {
if (plan->usecollective) plan->nrecv = nrecv;
else plan->nrecv = nrecv - 1;
} else plan->nrecv = nrecv;
// init remaining fields in remap plan
plan->memory = memory;
if (nrecv == plan->nrecv) plan->self = 0;
else plan->self = 1;
// free locally malloced space
free(inarray);
free(outarray);
// find biggest send message (not including self) and malloc space for it
size = 0;
for (nsend = 0; nsend < plan->nsend; nsend++)
size = MAX(size,plan->send_size[nsend]);
if (size) {
plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
if (!plan->d_sendbuf.data()) return nullptr;
}
// if requested, allocate internal scratch space for recvs,
// only need it if I will receive any data (including self)
if (memory == 1) {
if (nrecv > 0) {
plan->d_scratch =
typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
if (!plan->d_scratch.data()) return nullptr;
}
}
// not using collective - dup comm
MPI_Comm_dup(comm,&plan->comm);
// return pointer to plan
return plan;
@ -500,22 +824,45 @@ void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_
if (!((plan->usecollective) && (plan->commringlen == 0)))
MPI_Comm_free(&plan->comm);
// free internal arrays
if (plan->usecollective) {
if (plan->commringlist != nullptr) {
free(plan->commringlist);
free(plan->sendcnts);
free(plan->rcvcnts);
free(plan->sdispls);
free(plan->rdispls);
}
if (plan->nsend || plan->self) {
free(plan->send_offset);
free(plan->send_size);
free(plan->send_proc);
free(plan->packplan);
}
if (plan->nsend) {
free(plan->send_offset);
free(plan->send_size);
free(plan->packplan);
}
if (plan->nrecv || plan->self) {
free(plan->recv_offset);
free(plan->recv_size);
free(plan->recv_proc);
free(plan->recv_bufloc);
free(plan->request);
free(plan->unpackplan);
if (plan->nrecv) {
free(plan->recv_offset);
free(plan->recv_size);
free(plan->unpackplan);
}
} else {
// free arrays used in pt2pt communication
if (plan->nsend || plan->self) {
free(plan->send_offset);
free(plan->send_size);
free(plan->send_proc);
free(plan->packplan);
}
if (plan->nrecv || plan->self) {
free(plan->recv_offset);
free(plan->recv_size);
free(plan->recv_proc);
free(plan->recv_bufloc);
free(plan->request);
free(plan->unpackplan);
}
}
// free plan itself

View File

@ -44,6 +44,7 @@ struct remap_plan_3d_kokkos {
int *recv_size; // size of each recv message
int *recv_proc; // proc to recv each message from
int *recv_bufloc; // offset in scratch buf for each recv
int *nrecvmap; // maps receive index to rank index
MPI_Request *request; // MPI request for each posted recv
struct pack_plan_3d *unpackplan; // unpack plan for each recv message
int nrecv; // # of recvs from other procs
@ -52,9 +53,17 @@ struct remap_plan_3d_kokkos {
int memory; // user provides scratch space or not
MPI_Comm comm; // group of procs performing remap
int usecollective; // use collective or point-to-point MPI
int usegpu_aware; // use GPU-Aware MPI or not
// variables for collective MPI only
int commringlen; // length of commringlist
int *commringlist; // ranks on communication ring of this plan
int usegpu_aware; // use GPU-Aware MPI or not
int *sendcnts; // # of elements in send buffer for each rank
int *rcvcnts; // # of elements in recv buffer for each rank
int *sdispls; // extraction location in send buffer for each rank
int *rdispls; // extraction location in recv buffer for each rank
int selfcommringloc; // current proc's location in commringlist
int selfnsendloc; // current proc's location in send lists
int selfnrecvloc; // current proc's location in recv lists
};
template<class DeviceType>

View File

@ -1384,15 +1384,20 @@ void PPPM::set_grid_local()
// npey_fft,npez_fft = # of procs in y,z dims
// if nprocs is small enough, proc can own 1 or more entire xy planes,
// else proc owns 2d sub-blocks of yz plane
// NOTE: commented out lines support this
// need to ensure fft3d.cpp and remap.cpp support 2D planes
// me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
// nlo_fft,nhi_fft = lower/upper limit of the section
// of the global FFT mesh that I own in x-pencil decomposition
int npey_fft,npez_fft;
if (nz_pppm >= nprocs) {
npey_fft = 1;
npez_fft = nprocs;
} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
//if (nz_pppm >= nprocs) {
// npey_fft = 1;
// npez_fft = nprocs;
//} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
int me_y = me % npey_fft;
int me_z = me / npey_fft;