From 3116250da98be7e4de86d89e50a49f29bccbe2dc Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Fri, 19 Apr 2024 16:00:58 -0400 Subject: [PATCH 01/14] Initial port of kspace_modify collective yes to KOKKOS package --- src/KOKKOS/remap_kokkos.cpp | 359 ++++++++++++++++++++++++++++++------ src/KOKKOS/remap_kokkos.h | 8 +- 2 files changed, 309 insertions(+), 58 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 0d539ada83..7fbfdd1130 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -103,11 +103,7 @@ template void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, typename FFT_AT::t_FFT_SCALAR_1d d_buf, struct remap_plan_3d_kokkos *plan) { - // collective flag not yet supported - // use point-to-point communication - - int i,isend,irecv; typename FFT_AT::t_FFT_SCALAR_1d d_scratch; if (plan->memory == 0) @@ -116,70 +112,119 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d d_scratch = plan->d_scratch; // post all recvs into scratch space + // If not using GPU-aware MPI, mirror data to host FFT_SCALAR* v_scratch = d_scratch.data(); - if (!plan->usegpu_aware) { - plan->h_scratch = Kokkos::create_mirror_view(d_scratch); - v_scratch = plan->h_scratch.data(); - } - - for (irecv = 0; irecv < plan->nrecv; irecv++) { - FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv]; - MPI_Irecv(scratch,plan->recv_size[irecv], - MPI_FFT_SCALAR,plan->recv_proc[irecv],0, - plan->comm,&plan->request[irecv]); - } - FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data(); if (!plan->usegpu_aware) { + plan->h_scratch = Kokkos::create_mirror_view(d_scratch); plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf); + v_scratch = plan->h_scratch.data(); v_sendbuf = plan->h_sendbuf.data(); } - // send all messages to other procs + // use point-to-point communication - for (isend = 0; isend < plan->nsend; isend++) { - int in_offset = plan->send_offset[isend]; - plan->pack(d_in,in_offset, - plan->d_sendbuf,0,&plan->packplan[isend]); + if (!plan->usecollective) { + int i,isend,irecv; - if (!plan->usegpu_aware) - Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); - MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, - plan->send_proc[isend],0,plan->comm); - } + for (irecv = 0; irecv < plan->nrecv; irecv++) { + FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv]; + MPI_Irecv(scratch,plan->recv_size[irecv], + MPI_FFT_SCALAR,plan->recv_proc[irecv],0, + plan->comm,&plan->request[irecv]); + } - // copy in -> scratch -> out for self data + // send all messages to other procs - if (plan->self) { - isend = plan->nsend; - irecv = plan->nrecv; + for (isend = 0; isend < plan->nsend; isend++) { + int in_offset = plan->send_offset[isend]; + plan->pack(d_in,in_offset, + plan->d_sendbuf,0,&plan->packplan[isend]); - int in_offset = plan->send_offset[isend]; - int scratch_offset = plan->recv_bufloc[irecv]; - int out_offset = plan->recv_offset[irecv]; + if (!plan->usegpu_aware) + Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); - plan->pack(d_in,in_offset, - d_scratch,scratch_offset, - &plan->packplan[isend]); - plan->unpack(d_scratch,scratch_offset, - d_out,out_offset,&plan->unpackplan[irecv]); - } + MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } - // unpack all messages from scratch -> out + // copy in -> scratch -> out for self data - for (i = 0; i < plan->nrecv; i++) { - MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); + if (plan->self) { + isend = plan->nsend; + irecv = plan->nrecv; - int scratch_offset = plan->recv_bufloc[irecv]; - int out_offset = plan->recv_offset[irecv]; + int in_offset = plan->send_offset[isend]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; - if (!plan->usegpu_aware) - Kokkos::deep_copy(d_scratch,plan->h_scratch); + plan->pack(d_in,in_offset, + d_scratch,scratch_offset, + &plan->packplan[isend]); + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } - plan->unpack(d_scratch,scratch_offset, - d_out,out_offset,&plan->unpackplan[irecv]); + // unpack all messages from scratch -> out + + for (i = 0; i < plan->nrecv; i++) { + MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); + + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + if (!plan->usegpu_aware) + Kokkos::deep_copy(d_scratch,plan->h_scratch); + + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } + } else { + if (plan->commringlen > 0) { + int isend,irecv; + + + // populate send data + // buffers are allocated and count/displacement buffers + // are populated in remap_3d_create_plan_kokkos + + int currentSendBufferOffset = 0; + for (isend = 0; isend < plan->commringlen; isend++) { + int foundentry = 0; + for (int i=0;(insend && !foundentry); i++) { + if (plan->send_proc[i] == plan->commringlist[isend]) { + foundentry = 1; + plan->pack(d_in,plan->send_offset[i], + plan->d_sendbuf,currentSendBufferOffset, + &plan->packplan[i]); + currentSendBufferOffset += plan->send_size[i]; + } + } + } + if (!plan->usegpu_aware) + Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); + + MPI_Alltoallv(v_sendbuf, plan->sendcnts, plan->sdispls, + MPI_FFT_SCALAR, v_scratch, plan->rcvcnts, + plan->rdispls, MPI_FFT_SCALAR, plan->comm); + + // unpack the data from the recv buffer into out + + if (!plan->usegpu_aware) + Kokkos::deep_copy(d_scratch,plan->h_scratch); + + int currentRecvBufferOffset = 0; + for (irecv = 0; irecv < plan->commringlen; irecv++) { + if (plan->nrecvmap[irecv] > -1) { + plan->unpack(d_scratch,currentRecvBufferOffset, + d_out,plan->recv_offset[plan->nrecvmap[irecv]], + &plan->unpackplan[plan->nrecvmap[irecv]]); + currentRecvBufferOffset += plan->recv_size[plan->nrecvmap[irecv]]; + } + } + } } } @@ -223,7 +268,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat struct remap_plan_3d_kokkos *plan; struct extent_3d *inarray, *outarray; struct extent_3d in,out,overlap; - int i,iproc,nsend,nrecv,ibuf,size,me,nprocs; + int i,j,iproc,nsend,nrecv,ibuf,size,me,nprocs,isend,irecv; // query MPI info @@ -435,6 +480,108 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } + // create sub-comm rank list + if (plan->usecollective) { + plan->commringlist = nullptr; + + // merge recv and send rank lists + // ask Steve Plimpton about method to more accurately determine + // maximum number of procs contributing to pencil + + int maxcommsize = nprocs; + int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); + int commringlen = 0; + + for (i = 0; i < nrecv; i++) { + commringlist[i] = plan->recv_proc[i]; + commringlen++; + } + + for (i = 0; i < nsend; i++) { + int foundentry = 0; + for (j = 0; j < commringlen;j++) + if (commringlist[j] == plan->send_proc[i]) foundentry = 1; + if (!foundentry) { + commringlist[commringlen] = plan->send_proc[i]; + commringlen++; + } + } + + // sort initial commringlist + + int swap = 0; + for (i = 0 ; i < (commringlen - 1); i++) { + for (j = 0 ; j < commringlen - i - 1; j++) { + if (commringlist[j] > commringlist[j+1]) { + swap = commringlist[j]; + commringlist[j] = commringlist[j+1]; + commringlist[j+1] = swap; + } + } + } + + // collide all inarray extents for the comm ring with all output + // extents and all outarray extents for the comm ring with all input + // extents - if there is a collison add the rank to the comm ring, + // keep iterating until nothing is added to commring + + int commringappend = 1; + while (commringappend) { + int newcommringlen = commringlen; + commringappend = 0; + for (i = 0; i < commringlen; i++) { + for (j = 0; j < nprocs; j++) { + if (remap_3d_collide(&inarray[commringlist[i]], + &outarray[j],&overlap)) { + int alreadyinlist = 0; + for (int k = 0; k < newcommringlen; k++) { + if (commringlist[k] == j) { + alreadyinlist = 1; + } + } + if (!alreadyinlist) { + commringlist[newcommringlen++] = j; + commringappend = 1; + } + } + if (remap_3d_collide(&outarray[commringlist[i]], + &inarray[j],&overlap)) { + int alreadyinlist = 0; + for (int k = 0 ; k < newcommringlen; k++) { + if (commringlist[k] == j) alreadyinlist = 1; + } + if (!alreadyinlist) { + commringlist[newcommringlen++] = j; + commringappend = 1; + } + } + } + } + commringlen = newcommringlen; + } + + // sort the final commringlist + + for (i = 0 ; i < ( commringlen - 1 ); i++) { + for (j = 0 ; j < commringlen - i - 1; j++) { + if (commringlist[j] > commringlist[j+1]) { + swap = commringlist[j]; + commringlist[j] = commringlist[j+1]; + commringlist[j+1] = swap; + } + } + } + + // resize commringlist to final size + + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + + // set the plan->commringlist + + plan->commringlen = commringlen; + plan->commringlist = commringlist; + } + // plan->nrecv = # of recvs not including self // for collectives include self in the nsend list @@ -455,15 +602,83 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat free(inarray); free(outarray); - // find biggest send message (not including self) and malloc space for it + // the plan->d_sendbuf and plan->d_recvbuf are used by both the + // collective & non-collective implementations. + // For non-collective, the buffer size is MAX(send_size) for any one send + // For collective, the buffer size is SUM(send_size) for all sends - size = 0; - for (nsend = 0; nsend < plan->nsend; nsend++) - size = MAX(size,plan->send_size[nsend]); + if (!plan->usecollective) { - if (size) { - plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); - if (!plan->d_sendbuf.data()) return nullptr; + // find biggest send message (not including self) and malloc space for it + + size = 0; + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); + + if (size) { + plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); + if (!plan->d_sendbuf.data()) return nullptr; + } + } else { + + // allocate buffer for all send messages (including self) + // the method to allocate receive scratch space is sufficient + // for collectives + + size = 0; + for (nsend = 0; nsend < plan->nsend; nsend++) + size += plan->send_size[nsend]; + + if (size) { + plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); + if (!plan->d_sendbuf.data()) return nullptr; + } + + // allocate buffers for send and receive counts, displacements + + if (plan->commringlen) { + plan->sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); + plan->rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); + plan->sdispls = (int *) malloc(sizeof(int) * plan->commringlen); + plan->rdispls = (int *) malloc(sizeof(int) * plan->commringlen); + plan->nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); + + // populate buffers for send counts & displacements + + int currentSendBufferOffset = 0; + for (isend = 0; isend < plan->commringlen; isend++) { + plan->sendcnts[isend] = 0; + plan->sdispls[isend] = 0; + int foundentry = 0; + for (int i=0;(insend && !foundentry); i++) { + if (plan->send_proc[i] == plan->commringlist[isend]) { + foundentry = 1; + plan->sendcnts[isend] = plan->send_size[i]; + plan->sdispls[isend] = currentSendBufferOffset; + currentSendBufferOffset += plan->send_size[i]; + } + } + } + + // populate buffers for recv counts & displacements + + int currentRecvBufferOffset = 0; + for (irecv = 0; irecv < plan->commringlen; irecv++) { + plan->rcvcnts[irecv] = 0; + plan->rdispls[irecv] = 0; + plan->nrecvmap[irecv] = -1; + int foundentry = 0; + for (int i=0;(inrecv && !foundentry); i++) { + if (plan->recv_proc[i] == plan->commringlist[irecv]) { + foundentry = 1; + plan->rcvcnts[irecv] = plan->recv_size[i]; + plan->rdispls[irecv] = currentRecvBufferOffset; + currentRecvBufferOffset += plan->recv_size[i]; + plan->nrecvmap[irecv] = i; + } + } + } + } } // if requested, allocate internal scratch space for recvs, @@ -477,9 +692,28 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } + // if using collective and the commringlist is NOT empty create a + // communicator for the plan based off an MPI_Group created with + // ranks from the commringlist + + if ((plan->usecollective && (plan->commringlen > 0))) { + MPI_Group orig_group, new_group; + MPI_Comm_group(comm, &orig_group); + MPI_Group_incl(orig_group, plan->commringlen, + plan->commringlist, &new_group); + MPI_Comm_create(comm, new_group, &plan->comm); + } + + // if using collective and the comm ring list is empty create + // a communicator for the plan with an empty group + + else if ((plan->usecollective) && (plan->commringlen == 0)) { + MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); + } + // not using collective - dup comm - MPI_Comm_dup(comm,&plan->comm); + else MPI_Comm_dup(comm,&plan->comm); // return pointer to plan @@ -500,6 +734,17 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ if (!((plan->usecollective) && (plan->commringlen == 0))) MPI_Comm_free(&plan->comm); + if (plan->usecollective) { + if (plan->commringlist != nullptr) { + free(plan->commringlist); + free(plan->sendcnts); + free(plan->rcvcnts); + free(plan->sdispls); + free(plan->rdispls); + free(plan->nrecvmap); + } + } + // free internal arrays if (plan->nsend || plan->self) { diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index 77a3b1a37a..9dbd8d683c 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -52,9 +52,15 @@ struct remap_plan_3d_kokkos { int memory; // user provides scratch space or not MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI + int usegpu_aware; // use GPU-Aware MPI or not + // variables specific to collective MPI int commringlen; // length of commringlist int *commringlist; // ranks on communication ring of this plan - int usegpu_aware; // use GPU-Aware MPI or not + int *sendcnts; // # of elements in send buffer for each rank + int *rcvcnts; // # of elements in recv buffer for each rank + int *sdispls; // extraction location in send buffer for each rank + int *rdispls; // extraction location in recv buffer for each rank + int *nrecvmap; // maps receive index to rank index }; template From 0af4fe270281e49ee9e0cb243742d1f19bd408a7 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Sat, 20 Apr 2024 09:39:41 -0600 Subject: [PATCH 02/14] Enable collective comm for PPPMKokkos --- src/KOKKOS/pppm_kokkos.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 73e2c1f06f..5a936626d9 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -794,7 +794,6 @@ void PPPMKokkos::allocate() // 2nd FFT returns data in 3d brick decomposition // remap takes data from 3d brick to FFT decomposition - int collective_flag = 0; // not yet supported in Kokkos version int gpu_aware_flag = lmp->kokkos->gpu_aware_flag; int tmp; From f43fec417d33734a93b180b1759cab8dc34998b2 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Mon, 22 Apr 2024 10:23:14 -0400 Subject: [PATCH 03/14] Updated collective flag keyword in KOKKOS PPPM to use setting from input file --- src/KOKKOS/pppm_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 73e2c1f06f..273a53ab8f 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -794,7 +794,7 @@ void PPPMKokkos::allocate() // 2nd FFT returns data in 3d brick decomposition // remap takes data from 3d brick to FFT decomposition - int collective_flag = 0; // not yet supported in Kokkos version + int collective_flag = force->kspace->collective_flag; int gpu_aware_flag = lmp->kokkos->gpu_aware_flag; int tmp; From f9e349a2bc5f9adc9d868dc6832af9a736873c26 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Tue, 23 Apr 2024 13:42:46 -0400 Subject: [PATCH 04/14] Fix load balancing issue with 2D FFTs --- src/KSPACE/pppm.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 4fe5075f44..06cbf119e3 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -1389,10 +1389,7 @@ void PPPM::set_grid_local() // of the global FFT mesh that I own in x-pencil decomposition int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); int me_y = me % npey_fft; int me_z = me / npey_fft; From 65e8a5c981c559ef3ba5f5d1b2fb9e26667206ce Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Tue, 23 Apr 2024 14:59:42 -0400 Subject: [PATCH 05/14] Relocated send & recv initializations to occur together, since they are independent --- src/KOKKOS/remap_kokkos.cpp | 97 ++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 55 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 7fbfdd1130..404fddd7c4 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -308,28 +308,29 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat out.khi = out_khi; out.ksize = out.khi - out.klo + 1; - // combine output extents across all procs - inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); if (inarray == nullptr) return nullptr; outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); if (outarray == nullptr) return nullptr; + // combine input & output extents across all procs + + MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, + inarray,sizeof(struct extent_3d),MPI_BYTE,comm); MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, outarray,sizeof(struct extent_3d),MPI_BYTE,comm); - // count send collides, including self + // count send & recv collides, including self nsend = 0; - iproc = me; + nrecv = 0; for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - nsend += remap_3d_collide(&in,&outarray[iproc],&overlap); + nsend += remap_3d_collide(&in,&outarray[i],&overlap); + nrecv += remap_3d_collide(&out,&inarray[i],&overlap); } - // malloc space for send info + // malloc space for send & recv info if (nsend) { plan->pack = PackKokkos::pack_3d; @@ -344,6 +345,39 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; } + if (nrecv) { + if (permute == 0) + plan->unpack = PackKokkos::unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute1_2; + else + plan->unpack = PackKokkos::unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute2_2; + else + plan->unpack = PackKokkos::unpack_3d_permute2_n; + } + + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == nullptr || plan->recv_size == nullptr || + plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || + plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; + } + // store send info, with self as last entry nsend = 0; @@ -377,55 +411,8 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } else plan->nsend = nsend; - // combine input extents across all procs - - MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, - inarray,sizeof(struct extent_3d),MPI_BYTE,comm); - - // count recv collides, including self - - nrecv = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap); - } - // malloc space for recv info - if (nrecv) { - if (permute == 0) - plan->unpack = PackKokkos::unpack_3d; - else if (permute == 1) { - if (nqty == 1) - plan->unpack = PackKokkos::unpack_3d_permute1_1; - else if (nqty == 2) - plan->unpack = PackKokkos::unpack_3d_permute1_2; - else - plan->unpack = PackKokkos::unpack_3d_permute1_n; - } - else if (permute == 2) { - if (nqty == 1) - plan->unpack = PackKokkos::unpack_3d_permute2_1; - else if (nqty == 2) - plan->unpack = PackKokkos::unpack_3d_permute2_2; - else - plan->unpack = PackKokkos::unpack_3d_permute2_n; - } - - plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); - plan->recv_size = (int *) malloc(nrecv*sizeof(int)); - plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); - plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); - plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); - plan->unpackplan = (struct pack_plan_3d *) - malloc(nrecv*sizeof(struct pack_plan_3d)); - - if (plan->recv_offset == nullptr || plan->recv_size == nullptr || - plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || - plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; - } // store recv info, with self as last entry From 1a431b02aed66fba073f22458ffefb6c81021ba7 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Wed, 24 Apr 2024 09:23:29 -0400 Subject: [PATCH 06/14] Split collective and non-collective into conditionals. Multi-node test passing. Beginning optimization of collective --- src/KOKKOS/remap_kokkos.cpp | 611 ++++++++++++++++++++++-------------- 1 file changed, 382 insertions(+), 229 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 404fddd7c4..1dcf6c4938 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -321,169 +321,349 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, outarray,sizeof(struct extent_3d),MPI_BYTE,comm); - // count send & recv collides, including self + // for efficiency, handle collective & non-collective setup separately - nsend = 0; - nrecv = 0; - for (i = 0; i < nprocs; i++) { - nsend += remap_3d_collide(&in,&outarray[i],&overlap); - nrecv += remap_3d_collide(&out,&inarray[i],&overlap); - } + if (!plan->usecollective) { + // count send & recv collides, including self - // malloc space for send & recv info - - if (nsend) { - plan->pack = PackKokkos::pack_3d; - - plan->send_offset = (int *) malloc(nsend*sizeof(int)); - plan->send_size = (int *) malloc(nsend*sizeof(int)); - plan->send_proc = (int *) malloc(nsend*sizeof(int)); - plan->packplan = (struct pack_plan_3d *) - malloc(nsend*sizeof(struct pack_plan_3d)); - - if (plan->send_offset == nullptr || plan->send_size == nullptr || - plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; - } - - if (nrecv) { - if (permute == 0) - plan->unpack = PackKokkos::unpack_3d; - else if (permute == 1) { - if (nqty == 1) - plan->unpack = PackKokkos::unpack_3d_permute1_1; - else if (nqty == 2) - plan->unpack = PackKokkos::unpack_3d_permute1_2; - else - plan->unpack = PackKokkos::unpack_3d_permute1_n; - } - else if (permute == 2) { - if (nqty == 1) - plan->unpack = PackKokkos::unpack_3d_permute2_1; - else if (nqty == 2) - plan->unpack = PackKokkos::unpack_3d_permute2_2; - else - plan->unpack = PackKokkos::unpack_3d_permute2_n; + nsend = 0; + nrecv = 0; + for (i = 0; i < nprocs; i++) { + nsend += remap_3d_collide(&in,&outarray[i],&overlap); + nrecv += remap_3d_collide(&out,&inarray[i],&overlap); } - plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); - plan->recv_size = (int *) malloc(nrecv*sizeof(int)); - plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); - plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); - plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); - plan->unpackplan = (struct pack_plan_3d *) - malloc(nrecv*sizeof(struct pack_plan_3d)); + // malloc space for send & recv info - if (plan->recv_offset == nullptr || plan->recv_size == nullptr || - plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || - plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; - } + if (nsend) { + plan->pack = PackKokkos::pack_3d; - // store send info, with self as last entry + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(nsend*sizeof(int)); + plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); - nsend = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { - plan->send_proc[nsend] = iproc; - plan->send_offset[nsend] = nqty * - ((overlap.klo-in.klo)*in.jsize*in.isize + - ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); - plan->packplan[nsend].nfast = nqty*overlap.isize; - plan->packplan[nsend].nmid = overlap.jsize; - plan->packplan[nsend].nslow = overlap.ksize; - plan->packplan[nsend].nstride_line = nqty*in.isize; - plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; - plan->packplan[nsend].nqty = nqty; - plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; - nsend++; + if (plan->send_offset == nullptr || plan->send_size == nullptr || + plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; } - } - // plan->nsend = # of sends not including self - - if (nsend && plan->send_proc[nsend-1] == me) { - if (plan->usecollective) // for collectives include self in nsend list - plan->nsend = nsend; - else - plan->nsend = nsend - 1; - } else - plan->nsend = nsend; - - // malloc space for recv info - - - // store recv info, with self as last entry - - ibuf = 0; - nrecv = 0; - iproc = me; - - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; - if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - plan->recv_proc[nrecv] = iproc; - plan->recv_bufloc[nrecv] = ibuf; - - if (permute == 0) { - plan->recv_offset[nrecv] = nqty * - ((overlap.klo-out.klo)*out.jsize*out.isize + - (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); - plan->unpackplan[nrecv].nfast = nqty*overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.isize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; - plan->unpackplan[nrecv].nqty = nqty; - } + if (nrecv) { + if (permute == 0) + plan->unpack = PackKokkos::unpack_3d; else if (permute == 1) { - plan->recv_offset[nrecv] = nqty * - ((overlap.ilo-out.ilo)*out.ksize*out.jsize + - (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); - plan->unpackplan[nrecv].nfast = overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; - plan->unpackplan[nrecv].nqty = nqty; + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute1_2; + else + plan->unpack = PackKokkos::unpack_3d_permute1_n; } - else { - plan->recv_offset[nrecv] = nqty * - ((overlap.jlo-out.jlo)*out.isize*out.ksize + - (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); - plan->unpackplan[nrecv].nfast = overlap.isize; - plan->unpackplan[nrecv].nmid = overlap.jsize; - plan->unpackplan[nrecv].nslow = overlap.ksize; - plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; - plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; - plan->unpackplan[nrecv].nqty = nqty; + else if (permute == 2) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute2_2; + else + plan->unpack = PackKokkos::unpack_3d_permute2_n; } - plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; - ibuf += plan->recv_size[nrecv]; - nrecv++; + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == nullptr || plan->recv_size == nullptr || + plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || + plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; } - } - // create sub-comm rank list - if (plan->usecollective) { + // store send info, with self as last entry + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + plan->send_proc[nsend] = iproc; + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + nsend++; + } + } + + // plan->nsend = # of sends not including self + + if (nsend && plan->send_proc[nsend-1] == me) plan->nsend = nsend - 1; + else plan->nsend = nsend; + + // store recv info, with self as last entry + + ibuf = 0; + nrecv = 0; + iproc = me; + + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + plan->recv_proc[nrecv] = iproc; + plan->recv_bufloc[nrecv] = ibuf; + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } + + plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + ibuf += plan->recv_size[nrecv]; + nrecv++; + } + } + + // plan->nrecv = # of recvs not including self + + if (nrecv && plan->recv_proc[nrecv-1] == me) plan->nrecv = nrecv - 1; + else plan->nrecv = nrecv; + + // init remaining fields in remap plan + + plan->memory = memory; + + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; + + + // the plan->d_sendbuf and plan->d_recvbuf are used by both the + // collective & non-collective implementations. + // For non-collective, the buffer size is MAX(send_size) for any one send + + // find biggest send message (not including self) and malloc space for it + + size = 0; + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); + + if (size) { + plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); + if (!plan->d_sendbuf.data()) return nullptr; + } + + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) + + if (memory == 1) { + if (nrecv > 0) { + plan->d_scratch = + typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize); + if (!plan->d_scratch.data()) return nullptr; + } + } + + // Non-collectives do not use MPI Communicator Groups + + MPI_Comm_dup(comm,&plan->comm); + } else { + // count send & recv collides, including self + + nsend = 0; + nrecv = 0; + for (i = 0; i < nprocs; i++) { + nsend += remap_3d_collide(&in,&outarray[i],&overlap); + nrecv += remap_3d_collide(&out,&inarray[i],&overlap); + } + + // malloc space for send & recv info + + if (nsend) { + plan->pack = PackKokkos::pack_3d; + + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(nsend*sizeof(int)); + plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); + + if (plan->send_offset == nullptr || plan->send_size == nullptr || + plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; + } + + if (nrecv) { + if (permute == 0) + plan->unpack = PackKokkos::unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute1_2; + else + plan->unpack = PackKokkos::unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute2_2; + else + plan->unpack = PackKokkos::unpack_3d_permute2_n; + } + + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == nullptr || plan->recv_size == nullptr || + plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || + plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; + } + + // store send info, with self as last entry + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + plan->send_proc[nsend] = iproc; + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + nsend++; + } + } + + // plan->nsend = # of sends not including self + + plan->nsend = nsend; + + // store recv info, with self as last entry + + ibuf = 0; + nrecv = 0; + iproc = me; + + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + plan->recv_proc[nrecv] = iproc; + plan->recv_bufloc[nrecv] = ibuf; + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } + + plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + ibuf += plan->recv_size[nrecv]; + nrecv++; + } + } + + // plan->nrecv = # of recvs not including self + // for collectives include self in the nsend list + + plan->nrecv = nrecv; + + // create sub-comm rank list + plan->commringlist = nullptr; - + // merge recv and send rank lists // ask Steve Plimpton about method to more accurately determine // maximum number of procs contributing to pencil - + int maxcommsize = nprocs; int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); int commringlen = 0; - + for (i = 0; i < nrecv; i++) { commringlist[i] = plan->recv_proc[i]; commringlen++; } - + for (i = 0; i < nsend; i++) { int foundentry = 0; for (j = 0; j < commringlen;j++) @@ -493,9 +673,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat commringlen++; } } - + // sort initial commringlist - + int swap = 0; for (i = 0 ; i < (commringlen - 1); i++) { for (j = 0 ; j < commringlen - i - 1; j++) { @@ -506,12 +686,12 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } } - + // collide all inarray extents for the comm ring with all output // extents and all outarray extents for the comm ring with all input // extents - if there is a collison add the rank to the comm ring, // keep iterating until nothing is added to commring - + int commringappend = 1; while (commringappend) { int newcommringlen = commringlen; @@ -519,7 +699,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat for (i = 0; i < commringlen; i++) { for (j = 0; j < nprocs; j++) { if (remap_3d_collide(&inarray[commringlist[i]], - &outarray[j],&overlap)) { + &outarray[j],&overlap)) { int alreadyinlist = 0; for (int k = 0; k < newcommringlen; k++) { if (commringlist[k] == j) { @@ -532,7 +712,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } if (remap_3d_collide(&outarray[commringlist[i]], - &inarray[j],&overlap)) { + &inarray[j],&overlap)) { int alreadyinlist = 0; for (int k = 0 ; k < newcommringlen; k++) { if (commringlist[k] == j) alreadyinlist = 1; @@ -546,9 +726,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } commringlen = newcommringlen; } - + // sort the final commringlist - + for (i = 0 ; i < ( commringlen - 1 ); i++) { for (j = 0 ; j < commringlen - i - 1; j++) { if (commringlist[j] > commringlist[j+1]) { @@ -558,80 +738,52 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } } - + // resize commringlist to final size - + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); - + // set the plan->commringlist - + plan->commringlen = commringlen; plan->commringlist = commringlist; - } - - // plan->nrecv = # of recvs not including self - // for collectives include self in the nsend list - - if (nrecv && plan->recv_proc[nrecv-1] == me) { - if (plan->usecollective) plan->nrecv = nrecv; - else plan->nrecv = nrecv - 1; - } else plan->nrecv = nrecv; - - // init remaining fields in remap plan - - plan->memory = memory; - - if (nrecv == plan->nrecv) plan->self = 0; - else plan->self = 1; - - // free locally malloced space - - free(inarray); - free(outarray); - - // the plan->d_sendbuf and plan->d_recvbuf are used by both the - // collective & non-collective implementations. - // For non-collective, the buffer size is MAX(send_size) for any one send - // For collective, the buffer size is SUM(send_size) for all sends - - if (!plan->usecollective) { - - // find biggest send message (not including self) and malloc space for it - - size = 0; - for (nsend = 0; nsend < plan->nsend; nsend++) - size = MAX(size,plan->send_size[nsend]); - - if (size) { - plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); - if (!plan->d_sendbuf.data()) return nullptr; - } - } else { - + + // init remaining fields in remap plan + + plan->memory = memory; + + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; + + // the plan->d_sendbuf and plan->d_recvbuf are used by both the + // collective & non-collective implementations. + // For non-collective, the buffer size is MAX(send_size) for any one send + // For collective, the buffer size is SUM(send_size) for all sends + // allocate buffer for all send messages (including self) // the method to allocate receive scratch space is sufficient // for collectives - + size = 0; for (nsend = 0; nsend < plan->nsend; nsend++) size += plan->send_size[nsend]; - + if (size) { plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); if (!plan->d_sendbuf.data()) return nullptr; } - + // allocate buffers for send and receive counts, displacements - + if (plan->commringlen) { plan->sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); plan->rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); plan->sdispls = (int *) malloc(sizeof(int) * plan->commringlen); plan->rdispls = (int *) malloc(sizeof(int) * plan->commringlen); plan->nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); - + // populate buffers for send counts & displacements - + int currentSendBufferOffset = 0; for (isend = 0; isend < plan->commringlen; isend++) { plan->sendcnts[isend] = 0; @@ -646,9 +798,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } } - + // populate buffers for recv counts & displacements - + int currentRecvBufferOffset = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { plan->rcvcnts[irecv] = 0; @@ -666,41 +818,42 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } } - } - - // if requested, allocate internal scratch space for recvs, - // only need it if I will receive any data (including self) - - if (memory == 1) { - if (nrecv > 0) { - plan->d_scratch = - typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize); - if (!plan->d_scratch.data()) return nullptr; + + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) + + if (memory == 1) { + if (nrecv > 0) { + plan->d_scratch = + typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize); + if (!plan->d_scratch.data()) return nullptr; + } } + + // if using collective and the commringlist is NOT empty create a + // communicator for the plan based off an MPI_Group created with + // ranks from the commringlist + + if (plan->commringlen > 0) { + MPI_Group orig_group, new_group; + MPI_Comm_group(comm, &orig_group); + MPI_Group_incl(orig_group, plan->commringlen, + plan->commringlist, &new_group); + MPI_Comm_create(comm, new_group, &plan->comm); + } + + // if using collective and the comm ring list is empty create + // a communicator for the plan with an empty group + + else + MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); + } - // if using collective and the commringlist is NOT empty create a - // communicator for the plan based off an MPI_Group created with - // ranks from the commringlist + // free locally malloced space - if ((plan->usecollective && (plan->commringlen > 0))) { - MPI_Group orig_group, new_group; - MPI_Comm_group(comm, &orig_group); - MPI_Group_incl(orig_group, plan->commringlen, - plan->commringlist, &new_group); - MPI_Comm_create(comm, new_group, &plan->comm); - } - - // if using collective and the comm ring list is empty create - // a communicator for the plan with an empty group - - else if ((plan->usecollective) && (plan->commringlen == 0)) { - MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); - } - - // not using collective - dup comm - - else MPI_Comm_dup(comm,&plan->comm); + free(inarray); + free(outarray); // return pointer to plan From 71f82e70ef896f76f22f98d60f2bf396ac20763f Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Thu, 25 Apr 2024 13:16:03 -0400 Subject: [PATCH 07/14] Refactored kspace+kokkos collective remap. Need to go back and tidy up memory allocations --- src/KOKKOS/remap_kokkos.cpp | 381 ++++++++++++++---------------------- 1 file changed, 150 insertions(+), 231 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 1dcf6c4938..fafb3dc37b 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -185,22 +185,17 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d if (plan->commringlen > 0) { int isend,irecv; - // populate send data // buffers are allocated and count/displacement buffers // are populated in remap_3d_create_plan_kokkos - int currentSendBufferOffset = 0; + int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - int foundentry = 0; - for (int i=0;(insend && !foundentry); i++) { - if (plan->send_proc[i] == plan->commringlist[isend]) { - foundentry = 1; - plan->pack(d_in,plan->send_offset[i], - plan->d_sendbuf,currentSendBufferOffset, - &plan->packplan[i]); - currentSendBufferOffset += plan->send_size[i]; - } + if (plan->sendcnts[isend] > 0) { + plan->pack(d_in,plan->send_offset[numpacked], + plan->d_sendbuf,plan->sdispls[isend], + &plan->packplan[numpacked]); + numpacked++; } } if (!plan->usegpu_aware) @@ -215,13 +210,13 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d if (!plan->usegpu_aware) Kokkos::deep_copy(d_scratch,plan->h_scratch); - int currentRecvBufferOffset = 0; + numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (plan->nrecvmap[irecv] > -1) { - plan->unpack(d_scratch,currentRecvBufferOffset, - d_out,plan->recv_offset[plan->nrecvmap[irecv]], - &plan->unpackplan[plan->nrecvmap[irecv]]); - currentRecvBufferOffset += plan->recv_size[plan->nrecvmap[irecv]]; + if (plan->rcvcnts[irecv] > 0) { + plan->unpack(d_scratch,plan->rdispls[irecv], + d_out,plan->recv_offset[numpacked], + &plan->unpackplan[numpacked]); + numpacked++; } } } @@ -505,31 +500,105 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat MPI_Comm_dup(comm,&plan->comm); } else { + + // Improved approach - use an AllReduce to aggregate which ranks need to be included + // To do this, we build the local proc's send/receive list, then do an AllReduce + // to create the send/recv count for the Alltoallv + + // local arrays to be used in the allreduce + // start with max length -- nprocs. Unused entries will be removed later + + int *local_cnts = (int*) malloc(2*nprocs*sizeof(int)); + if (local_cnts == nullptr) return nullptr; + int *local_sendcnts = local_cnts; + int *local_recvcnts = (local_cnts + nprocs); + + // local arrays used to store the results of the allreduce + + int *global_cnts = (int*) malloc(2*nprocs*sizeof(int)); + if (global_cnts == nullptr) return nullptr; + int *global_sendcnts = global_cnts; + int *global_recvcnts = (global_cnts + nprocs); + // count send & recv collides, including self nsend = 0; nrecv = 0; for (i = 0; i < nprocs; i++) { - nsend += remap_3d_collide(&in,&outarray[i],&overlap); - nrecv += remap_3d_collide(&out,&inarray[i],&overlap); + local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap); + local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap); + nsend += local_sendcnts[i]; + nrecv += local_recvcnts[i]; } - // malloc space for send & recv info + // perform an AllReduce to get the counts from all other processors and build sendcnts list + + MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm); - if (nsend) { + // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv + // also builds commringlist -- which is already sorted + + int *commringlist = (int*) malloc(nprocs * sizeof(int)); + int commringlen = 0; + + for (i = 0; i < nprocs; i++) { + if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) { + commringlist[commringlen] = i; + commringlen++; + } + } + + // resize commringlist to final size + + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + + // set the plan->commringlist + + plan->commringlen = commringlen; + plan->commringlist = commringlist; + + // clean up local buffers that are finished + + local_sendcnts = nullptr; + local_recvcnts = nullptr; + global_recvcnts = nullptr; + global_sendcnts = nullptr; + free(local_cnts); + free(global_cnts); + + // malloc space for send & recv info + // if the current proc is involved in any way in the communication, allocate space + // because of the Alltoallv, both send and recv have to be initialized even if + // only one of those is performed + + if (nsend || nrecv) { + + // send space + + plan->nsend = nsend; plan->pack = PackKokkos::pack_3d; plan->send_offset = (int *) malloc(nsend*sizeof(int)); - plan->send_size = (int *) malloc(nsend*sizeof(int)); - plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(plan->commringlen*sizeof(int)); + + plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int)); + plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int)); + + // unused + plan->send_proc = (int *) malloc(plan->commringlen*sizeof(int)); + + // only used when sendcnt > 0 + plan->packplan = (struct pack_plan_3d *) malloc(nsend*sizeof(struct pack_plan_3d)); if (plan->send_offset == nullptr || plan->send_size == nullptr || plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; - } - if (nrecv) { + // recv space + + plan->nrecv = nrecv; + if (permute == 0) plan->unpack = PackKokkos::unpack_3d; else if (permute == 1) { @@ -550,10 +619,18 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); - plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(plan->commringlen*sizeof(int)); + + plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int)); + plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int)); + + // unused plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + + // only used when recvcnt > 0 + plan->unpackplan = (struct pack_plan_3d *) malloc(nrecv*sizeof(struct pack_plan_3d)); @@ -565,47 +642,56 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // store send info, with self as last entry nsend = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; + ibuf = 0; + int total_send_size = 0; + for (i = 0; i < plan->commringlen; i++) { + iproc = plan->commringlist[i]; if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { - plan->send_proc[nsend] = iproc; + //plan->send_proc[nsend] = i; + // number of entries required for this pack's 3-d coords plan->send_offset[nsend] = nqty * ((overlap.klo-in.klo)*in.jsize*in.isize + - ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); plan->packplan[nsend].nfast = nqty*overlap.isize; plan->packplan[nsend].nmid = overlap.jsize; plan->packplan[nsend].nslow = overlap.ksize; plan->packplan[nsend].nstride_line = nqty*in.isize; plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; plan->packplan[nsend].nqty = nqty; - plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + // total amount of overlap + plan->send_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->sendcnts[i] = plan->send_size[i]; + plan->sdispls[i] = ibuf; + ibuf += plan->send_size[i]; nsend++; + } else { + plan->send_size[i] = 0; + plan->sdispls[i] = ibuf; + plan->sendcnts[i] = 0; } + total_send_size += plan->send_size[i]; + } + + if (total_send_size) { + plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",total_send_size); + if (!plan->d_sendbuf.data()) return nullptr; } - - // plan->nsend = # of sends not including self - - plan->nsend = nsend; // store recv info, with self as last entry ibuf = 0; nrecv = 0; - iproc = me; - for (i = 0; i < nprocs; i++) { - iproc++; - if (iproc == nprocs) iproc = 0; + for (i = 0; i < plan->commringlen; i++) { + iproc = plan->commringlist[i]; if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - plan->recv_proc[nrecv] = iproc; + //plan->recv_proc[nrecv] = iproc; plan->recv_bufloc[nrecv] = ibuf; if (permute == 0) { plan->recv_offset[nrecv] = nqty * ((overlap.klo-out.klo)*out.jsize*out.isize + - (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); plan->unpackplan[nrecv].nfast = nqty*overlap.isize; plan->unpackplan[nrecv].nmid = overlap.jsize; plan->unpackplan[nrecv].nslow = overlap.ksize; @@ -616,7 +702,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat else if (permute == 1) { plan->recv_offset[nrecv] = nqty * ((overlap.ilo-out.ilo)*out.ksize*out.jsize + - (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); plan->unpackplan[nrecv].nfast = overlap.isize; plan->unpackplan[nrecv].nmid = overlap.jsize; plan->unpackplan[nrecv].nslow = overlap.ksize; @@ -627,7 +713,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat else { plan->recv_offset[nrecv] = nqty * ((overlap.jlo-out.jlo)*out.isize*out.ksize + - (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); plan->unpackplan[nrecv].nfast = overlap.isize; plan->unpackplan[nrecv].nmid = overlap.jsize; plan->unpackplan[nrecv].nslow = overlap.ksize; @@ -636,192 +722,26 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->unpackplan[nrecv].nqty = nqty; } - plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; - ibuf += plan->recv_size[nrecv]; + plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + plan->rcvcnts[i] = plan->recv_size[i]; + plan->rdispls[i] = ibuf; + ibuf += plan->recv_size[i]; nrecv++; + } else { + plan->recv_size[i] = 0; + plan->rcvcnts[i] = 0; + plan->rdispls[i] = ibuf; } } - - // plan->nrecv = # of recvs not including self - // for collectives include self in the nsend list - - plan->nrecv = nrecv; - - // create sub-comm rank list - - plan->commringlist = nullptr; - - // merge recv and send rank lists - // ask Steve Plimpton about method to more accurately determine - // maximum number of procs contributing to pencil - - int maxcommsize = nprocs; - int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); - int commringlen = 0; - - for (i = 0; i < nrecv; i++) { - commringlist[i] = plan->recv_proc[i]; - commringlen++; - } - - for (i = 0; i < nsend; i++) { - int foundentry = 0; - for (j = 0; j < commringlen;j++) - if (commringlist[j] == plan->send_proc[i]) foundentry = 1; - if (!foundentry) { - commringlist[commringlen] = plan->send_proc[i]; - commringlen++; - } - } - - // sort initial commringlist - - int swap = 0; - for (i = 0 ; i < (commringlen - 1); i++) { - for (j = 0 ; j < commringlen - i - 1; j++) { - if (commringlist[j] > commringlist[j+1]) { - swap = commringlist[j]; - commringlist[j] = commringlist[j+1]; - commringlist[j+1] = swap; - } - } - } - - // collide all inarray extents for the comm ring with all output - // extents and all outarray extents for the comm ring with all input - // extents - if there is a collison add the rank to the comm ring, - // keep iterating until nothing is added to commring - - int commringappend = 1; - while (commringappend) { - int newcommringlen = commringlen; - commringappend = 0; - for (i = 0; i < commringlen; i++) { - for (j = 0; j < nprocs; j++) { - if (remap_3d_collide(&inarray[commringlist[i]], - &outarray[j],&overlap)) { - int alreadyinlist = 0; - for (int k = 0; k < newcommringlen; k++) { - if (commringlist[k] == j) { - alreadyinlist = 1; - } - } - if (!alreadyinlist) { - commringlist[newcommringlen++] = j; - commringappend = 1; - } - } - if (remap_3d_collide(&outarray[commringlist[i]], - &inarray[j],&overlap)) { - int alreadyinlist = 0; - for (int k = 0 ; k < newcommringlen; k++) { - if (commringlist[k] == j) alreadyinlist = 1; - } - if (!alreadyinlist) { - commringlist[newcommringlen++] = j; - commringappend = 1; - } - } - } - } - commringlen = newcommringlen; - } - - // sort the final commringlist - - for (i = 0 ; i < ( commringlen - 1 ); i++) { - for (j = 0 ; j < commringlen - i - 1; j++) { - if (commringlist[j] > commringlist[j+1]) { - swap = commringlist[j]; - commringlist[j] = commringlist[j+1]; - commringlist[j+1] = swap; - } - } - } - - // resize commringlist to final size - - commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); - - // set the plan->commringlist - - plan->commringlen = commringlen; - plan->commringlist = commringlist; - + // init remaining fields in remap plan - + plan->memory = memory; - - if (nrecv == plan->nrecv) plan->self = 0; - else plan->self = 1; - - // the plan->d_sendbuf and plan->d_recvbuf are used by both the - // collective & non-collective implementations. - // For non-collective, the buffer size is MAX(send_size) for any one send - // For collective, the buffer size is SUM(send_size) for all sends - - // allocate buffer for all send messages (including self) - // the method to allocate receive scratch space is sufficient - // for collectives - - size = 0; - for (nsend = 0; nsend < plan->nsend; nsend++) - size += plan->send_size[nsend]; - - if (size) { - plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); - if (!plan->d_sendbuf.data()) return nullptr; - } - - // allocate buffers for send and receive counts, displacements - - if (plan->commringlen) { - plan->sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); - plan->rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); - plan->sdispls = (int *) malloc(sizeof(int) * plan->commringlen); - plan->rdispls = (int *) malloc(sizeof(int) * plan->commringlen); - plan->nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); - - // populate buffers for send counts & displacements - - int currentSendBufferOffset = 0; - for (isend = 0; isend < plan->commringlen; isend++) { - plan->sendcnts[isend] = 0; - plan->sdispls[isend] = 0; - int foundentry = 0; - for (int i=0;(insend && !foundentry); i++) { - if (plan->send_proc[i] == plan->commringlist[isend]) { - foundentry = 1; - plan->sendcnts[isend] = plan->send_size[i]; - plan->sdispls[isend] = currentSendBufferOffset; - currentSendBufferOffset += plan->send_size[i]; - } - } - } - - // populate buffers for recv counts & displacements - - int currentRecvBufferOffset = 0; - for (irecv = 0; irecv < plan->commringlen; irecv++) { - plan->rcvcnts[irecv] = 0; - plan->rdispls[irecv] = 0; - plan->nrecvmap[irecv] = -1; - int foundentry = 0; - for (int i=0;(inrecv && !foundentry); i++) { - if (plan->recv_proc[i] == plan->commringlist[irecv]) { - foundentry = 1; - plan->rcvcnts[irecv] = plan->recv_size[i]; - plan->rdispls[irecv] = currentRecvBufferOffset; - currentRecvBufferOffset += plan->recv_size[i]; - plan->nrecvmap[irecv] = i; - } - } - } - } - + plan->self = 0; + // if requested, allocate internal scratch space for recvs, // only need it if I will receive any data (including self) - + if (memory == 1) { if (nrecv > 0) { plan->d_scratch = @@ -829,25 +749,24 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat if (!plan->d_scratch.data()) return nullptr; } } - + // if using collective and the commringlist is NOT empty create a // communicator for the plan based off an MPI_Group created with // ranks from the commringlist - + if (plan->commringlen > 0) { MPI_Group orig_group, new_group; MPI_Comm_group(comm, &orig_group); MPI_Group_incl(orig_group, plan->commringlen, - plan->commringlist, &new_group); + plan->commringlist, &new_group); MPI_Comm_create(comm, new_group, &plan->comm); } - + // if using collective and the comm ring list is empty create // a communicator for the plan with an empty group - + else MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); - } // free locally malloced space @@ -881,7 +800,7 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->rcvcnts); free(plan->sdispls); free(plan->rdispls); - free(plan->nrecvmap); + //free(plan->nrecvmap); } } From 3f9d96d38db993c6dbc91145017d4b9fe0bc3b79 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 24 Apr 2024 01:55:13 -0400 Subject: [PATCH 08/14] make pip install packages in virtual environment --- .github/workflows/unittest-macos.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/unittest-macos.yml b/.github/workflows/unittest-macos.yml index f9c2a838d6..0a9d31bd84 100644 --- a/.github/workflows/unittest-macos.yml +++ b/.github/workflows/unittest-macos.yml @@ -43,6 +43,8 @@ jobs: working-directory: build run: | ccache -z + python3 -m venv macosenv + source macosenv/bin/activate python3 -m pip install numpy python3 -m pip install pyyaml cmake -C ../cmake/presets/clang.cmake \ From 8c3dab03b7969d3c52291f1407e15608393de2bd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 24 Apr 2024 02:45:36 -0400 Subject: [PATCH 09/14] downgrade macOS to version 13 --- .github/workflows/unittest-macos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest-macos.yml b/.github/workflows/unittest-macos.yml index 0a9d31bd84..b0bc4b2727 100644 --- a/.github/workflows/unittest-macos.yml +++ b/.github/workflows/unittest-macos.yml @@ -15,7 +15,7 @@ jobs: build: name: MacOS Unit Test if: ${{ github.repository == 'lammps/lammps' }} - runs-on: macos-latest + runs-on: macos-13 env: CCACHE_DIR: ${{ github.workspace }}/.ccache From bd52e31128f19b6f7b87200f13f8ac9f8f8f9db6 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Sat, 11 May 2024 20:58:47 -0400 Subject: [PATCH 10/14] Added optimization to collective to local copy to self --- src/KOKKOS/remap_kokkos.cpp | 46 ++++++++++++++++++++++++++++++++++--- src/KOKKOS/remap_kokkos.h | 3 +++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index fafb3dc37b..39b3aff4fa 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -105,6 +105,8 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d { typename FFT_AT::t_FFT_SCALAR_1d d_scratch; + int me; + MPI_Comm_rank(plan->comm,&me); if (plan->memory == 0) d_scratch = d_buf; @@ -191,13 +193,17 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d int numpacked = 0; for (isend = 0; isend < plan->commringlen; isend++) { - if (plan->sendcnts[isend] > 0) { + if (plan->sendcnts[isend]) { plan->pack(d_in,plan->send_offset[numpacked], plan->d_sendbuf,plan->sdispls[isend], &plan->packplan[numpacked]); numpacked++; } + else if (plan->commringlist[isend] == me && plan->self) { + numpacked++; + } } + if (!plan->usegpu_aware) Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); @@ -210,14 +216,28 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d if (!plan->usegpu_aware) Kokkos::deep_copy(d_scratch,plan->h_scratch); + // copy in -> scratch -> out for self data + + if (plan->self) { + plan->pack(d_in,plan->send_offset[plan->selfnsendloc], + plan->d_sendbuf,plan->sdispls[plan->selfcommringloc], + &plan->packplan[plan->selfnsendloc]); + plan->unpack(plan->d_sendbuf,plan->sdispls[plan->selfcommringloc], + d_out,plan->recv_offset[plan->selfnrecvloc], + &plan->unpackplan[plan->selfnrecvloc]); + } + numpacked = 0; for (irecv = 0; irecv < plan->commringlen; irecv++) { - if (plan->rcvcnts[irecv] > 0) { + if (plan->rcvcnts[irecv]) { plan->unpack(d_scratch,plan->rdispls[irecv], d_out,plan->recv_offset[numpacked], &plan->unpackplan[numpacked]); numpacked++; } + else if (plan->commringlist[irecv] == me && plan->self) { + numpacked++; + } } } } @@ -575,6 +595,10 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // send space + plan->selfcommringloc = -1; + plan->selfnsendloc = -1; + plan->selfnrecvloc = -1; + plan->nsend = nsend; plan->pack = PackKokkos::pack_3d; @@ -646,6 +670,10 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat int total_send_size = 0; for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; + if (iproc == me) { + plan->selfcommringloc = i; + plan->selfnsendloc = nsend; + } if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { //plan->send_proc[nsend] = i; // number of entries required for this pack's 3-d coords @@ -684,6 +712,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; + if (iproc == me) { + plan->selfnrecvloc = nrecv; + } if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { //plan->recv_proc[nrecv] = iproc; plan->recv_bufloc[nrecv] = ibuf; @@ -737,7 +768,16 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // init remaining fields in remap plan plan->memory = memory; - plan->self = 0; + //plan->self = 0; + if (plan->sendcnts[plan->selfcommringloc]) { + plan->self = 1; + plan->sendcnts[plan->selfcommringloc] = 0; + plan->rcvcnts[plan->selfcommringloc] = 0; + } + else { + plan->self = 0; + } + // if requested, allocate internal scratch space for recvs, // only need it if I will receive any data (including self) diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index 9dbd8d683c..737b45a398 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -61,6 +61,9 @@ struct remap_plan_3d_kokkos { int *sdispls; // extraction location in send buffer for each rank int *rdispls; // extraction location in recv buffer for each rank int *nrecvmap; // maps receive index to rank index + int selfcommringloc; + int selfnsendloc; + int selfnrecvloc; }; template From c9049c090dc8d7a6b5d59d8517739e20861162b8 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Mon, 13 May 2024 15:47:25 -0400 Subject: [PATCH 11/14] Cleaned up memory deallocation logic, improved comments on remap_kokkos.h for collective fields --- src/KOKKOS/remap_kokkos.cpp | 54 ++++++++++++++++++++----------------- src/KOKKOS/remap_kokkos.h | 10 +++---- 2 files changed, 34 insertions(+), 30 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 39b3aff4fa..7ebe35dddb 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -608,9 +608,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int)); plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int)); - // unused - plan->send_proc = (int *) malloc(plan->commringlen*sizeof(int)); - // only used when sendcnt > 0 plan->packplan = (struct pack_plan_3d *) @@ -648,11 +645,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int)); plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int)); - // unused - plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); - plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); - plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); - // only used when recvcnt > 0 plan->unpackplan = (struct pack_plan_3d *) @@ -768,7 +760,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat // init remaining fields in remap plan plan->memory = memory; - //plan->self = 0; + if (plan->sendcnts[plan->selfcommringloc]) { plan->self = 1; plan->sendcnts[plan->selfcommringloc] = 0; @@ -840,26 +832,38 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->rcvcnts); free(plan->sdispls); free(plan->rdispls); - //free(plan->nrecvmap); } - } - // free internal arrays + if (plan->nsend) { + free(plan->send_offset); + free(plan->send_size); + free(plan->packplan); + } - if (plan->nsend || plan->self) { - free(plan->send_offset); - free(plan->send_size); - free(plan->send_proc); - free(plan->packplan); - } + if (plan->nrecv) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->unpackplan); + } + } else { + + // free arrays used in pt2pt communication - if (plan->nrecv || plan->self) { - free(plan->recv_offset); - free(plan->recv_size); - free(plan->recv_proc); - free(plan->recv_bufloc); - free(plan->request); - free(plan->unpackplan); + if (plan->nsend || plan->self) { + free(plan->send_offset); + free(plan->send_size); + free(plan->send_proc); + free(plan->packplan); + } + + if (plan->nrecv || plan->self) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->recv_proc); + free(plan->recv_bufloc); + free(plan->request); + free(plan->unpackplan); + } } // free plan itself diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index 737b45a398..b0ccdb342d 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -44,6 +44,7 @@ struct remap_plan_3d_kokkos { int *recv_size; // size of each recv message int *recv_proc; // proc to recv each message from int *recv_bufloc; // offset in scratch buf for each recv + int *nrecvmap; // maps receive index to rank index MPI_Request *request; // MPI request for each posted recv struct pack_plan_3d *unpackplan; // unpack plan for each recv message int nrecv; // # of recvs from other procs @@ -53,17 +54,16 @@ struct remap_plan_3d_kokkos { MPI_Comm comm; // group of procs performing remap int usecollective; // use collective or point-to-point MPI int usegpu_aware; // use GPU-Aware MPI or not - // variables specific to collective MPI + // variables for collective MPI only int commringlen; // length of commringlist int *commringlist; // ranks on communication ring of this plan int *sendcnts; // # of elements in send buffer for each rank int *rcvcnts; // # of elements in recv buffer for each rank int *sdispls; // extraction location in send buffer for each rank int *rdispls; // extraction location in recv buffer for each rank - int *nrecvmap; // maps receive index to rank index - int selfcommringloc; - int selfnsendloc; - int selfnrecvloc; + int selfcommringloc; // current proc's location in commringlist + int selfnsendloc; // current proc's location in send lists + int selfnrecvloc; // current proc's location in recv lists }; template From d630fc67abb8a1402924b20d61e1b15d1b7a30e4 Mon Sep 17 00:00:00 2001 From: Nick Hagerty Date: Tue, 2 Jul 2024 15:10:44 -0400 Subject: [PATCH 12/14] Updated remap kokkos to remove unused recv_proc and recv_bufloc buffers --- src/KOKKOS/remap_kokkos.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 7ebe35dddb..50d61cc1a3 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -614,7 +614,8 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat malloc(nsend*sizeof(struct pack_plan_3d)); if (plan->send_offset == nullptr || plan->send_size == nullptr || - plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr; + plan->sendcnts == nullptr || plan->sdispls == nullptr || + plan->packplan == nullptr) return nullptr; // recv space @@ -651,8 +652,8 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat malloc(nrecv*sizeof(struct pack_plan_3d)); if (plan->recv_offset == nullptr || plan->recv_size == nullptr || - plan->recv_proc == nullptr || plan->recv_bufloc == nullptr || - plan->request == nullptr || plan->unpackplan == nullptr) return nullptr; + plan->rcvcnts == nullptr || plan->rdispls == nullptr || + plan->unpackplan == nullptr) return nullptr; } // store send info, with self as last entry @@ -708,8 +709,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->selfnrecvloc = nrecv; } if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - //plan->recv_proc[nrecv] = iproc; - plan->recv_bufloc[nrecv] = ibuf; if (permute == 0) { plan->recv_offset[nrecv] = nqty * From 9513c0edac92f67f5ed18d4842f97822e40e502b Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 2 Jul 2024 13:28:46 -0600 Subject: [PATCH 13/14] small cleanup --- src/KOKKOS/remap_kokkos.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 50d61cc1a3..573f4c2508 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -283,7 +283,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat struct remap_plan_3d_kokkos *plan; struct extent_3d *inarray, *outarray; struct extent_3d in,out,overlap; - int i,j,iproc,nsend,nrecv,ibuf,size,me,nprocs,isend,irecv; + int i,iproc,nsend,nrecv,ibuf,size,me,nprocs; // query MPI info @@ -552,7 +552,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } // perform an AllReduce to get the counts from all other processors and build sendcnts list - + MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm); // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv @@ -569,11 +569,11 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } // resize commringlist to final size - + commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); - + // set the plan->commringlist - + plan->commringlen = commringlen; plan->commringlist = commringlist; @@ -655,9 +655,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->rcvcnts == nullptr || plan->rdispls == nullptr || plan->unpackplan == nullptr) return nullptr; } - + // store send info, with self as last entry - + nsend = 0; ibuf = 0; int total_send_size = 0; @@ -697,19 +697,19 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",total_send_size); if (!plan->d_sendbuf.data()) return nullptr; } - + // store recv info, with self as last entry - + ibuf = 0; nrecv = 0; - + for (i = 0; i < plan->commringlen; i++) { iproc = plan->commringlist[i]; if (iproc == me) { plan->selfnrecvloc = nrecv; } if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { - + if (permute == 0) { plan->recv_offset[nrecv] = nqty * ((overlap.klo-out.klo)*out.jsize*out.isize + @@ -743,7 +743,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; plan->unpackplan[nrecv].nqty = nqty; } - + plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize; plan->rcvcnts[i] = plan->recv_size[i]; plan->rdispls[i] = ibuf; @@ -845,7 +845,7 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->unpackplan); } } else { - + // free arrays used in pt2pt communication if (plan->nsend || plan->self) { From 12abaf83cca8f6e57144b2624d25339865179282 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 10 Sep 2024 16:25:06 -0600 Subject: [PATCH 14/14] Changes from @sjplimp --- src/KSPACE/pppm.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 06cbf119e3..9665d2392d 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -1384,11 +1384,19 @@ void PPPM::set_grid_local() // npey_fft,npez_fft = # of procs in y,z dims // if nprocs is small enough, proc can own 1 or more entire xy planes, // else proc owns 2d sub-blocks of yz plane + // NOTE: commented out lines support this + // need to ensure fft3d.cpp and remap.cpp support 2D planes // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions // nlo_fft,nhi_fft = lower/upper limit of the section // of the global FFT mesh that I own in x-pencil decomposition int npey_fft,npez_fft; + + //if (nz_pppm >= nprocs) { + // npey_fft = 1; + // npez_fft = nprocs; + //} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); int me_y = me % npey_fft;