Merge pull request #4143 from hagertnl/hagerty_issue4140_kokkos_collective

Implement kspace_modify collective yes in KOKKOS package
2024-09-11 00:07:10 -04:00
parent 94e3351e4f 12abaf83cc
commit fde43d5079
4 changed files with 617 additions and 256 deletions
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@ -794,7 +794,7 @@ void PPPMKokkos<DeviceType>::allocate()
  // 2nd FFT returns data in 3d brick decomposition
  // remap takes data from 3d brick to FFT decomposition

-  int collective_flag = 0; // not yet supported in Kokkos version
+  int collective_flag = force->kspace->collective_flag;
  int gpu_aware_flag = lmp->kokkos->gpu_aware_flag;
  int tmp;

--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@ -103,12 +103,10 @@ template<class DeviceType>
 void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, typename FFT_AT::t_FFT_SCALAR_1d d_buf,
              struct remap_plan_3d_kokkos<DeviceType> *plan)
 {
-  // collective flag not yet supported

-  // use point-to-point communication
-
-  int i,isend,irecv;
  typename FFT_AT::t_FFT_SCALAR_1d d_scratch;
+  int me;
+  MPI_Comm_rank(plan->comm,&me);

  if (plan->memory == 0)
    d_scratch = d_buf;
@ -116,70 +114,132 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
    d_scratch = plan->d_scratch;

  // post all recvs into scratch space
+  // If not using GPU-aware MPI, mirror data to host

  FFT_SCALAR* v_scratch = d_scratch.data();
-  if (!plan->usegpu_aware) {
-    plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
-    v_scratch = plan->h_scratch.data();
-  }
-
-  for (irecv = 0; irecv < plan->nrecv; irecv++) {
-    FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
-    MPI_Irecv(scratch,plan->recv_size[irecv],
-              MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
-              plan->comm,&plan->request[irecv]);
-  }
-
  FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
  if (!plan->usegpu_aware) {
+    plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
    plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
+    v_scratch = plan->h_scratch.data();
    v_sendbuf = plan->h_sendbuf.data();
  }

-  // send all messages to other procs
+  // use point-to-point communication

-  for (isend = 0; isend < plan->nsend; isend++) {
-    int in_offset = plan->send_offset[isend];
-    plan->pack(d_in,in_offset,
-               plan->d_sendbuf,0,&plan->packplan[isend]);
+  if (!plan->usecollective) {
+    int i,isend,irecv;

-    if (!plan->usegpu_aware)
-      Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);

-    MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
-             plan->send_proc[isend],0,plan->comm);
-  }
+    for (irecv = 0; irecv < plan->nrecv; irecv++) {
+      FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
+      MPI_Irecv(scratch,plan->recv_size[irecv],
+                MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
+                plan->comm,&plan->request[irecv]);
+    }

-  // copy in -> scratch -> out for self data
+    // send all messages to other procs

-  if (plan->self) {
-    isend = plan->nsend;
-    irecv = plan->nrecv;
+    for (isend = 0; isend < plan->nsend; isend++) {
+      int in_offset = plan->send_offset[isend];
+      plan->pack(d_in,in_offset,
+                 plan->d_sendbuf,0,&plan->packplan[isend]);

-    int in_offset = plan->send_offset[isend];
-    int scratch_offset = plan->recv_bufloc[irecv];
-    int out_offset = plan->recv_offset[irecv];
+      if (!plan->usegpu_aware)
+        Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);

-    plan->pack(d_in,in_offset,
-               d_scratch,scratch_offset,
-               &plan->packplan[isend]);
-    plan->unpack(d_scratch,scratch_offset,
-                 d_out,out_offset,&plan->unpackplan[irecv]);
-  }
+      MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
+               plan->send_proc[isend],0,plan->comm);
+    }

-  // unpack all messages from scratch -> out
+    // copy in -> scratch -> out for self data

-  for (i = 0; i < plan->nrecv; i++) {
-    MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
+    if (plan->self) {
+      isend = plan->nsend;
+      irecv = plan->nrecv;

-    int scratch_offset = plan->recv_bufloc[irecv];
-    int out_offset = plan->recv_offset[irecv];
+      int in_offset = plan->send_offset[isend];
+      int scratch_offset = plan->recv_bufloc[irecv];
+      int out_offset = plan->recv_offset[irecv];

-    if (!plan->usegpu_aware)
-      Kokkos::deep_copy(d_scratch,plan->h_scratch);
+      plan->pack(d_in,in_offset,
+                 d_scratch,scratch_offset,
+                 &plan->packplan[isend]);
+      plan->unpack(d_scratch,scratch_offset,
+                   d_out,out_offset,&plan->unpackplan[irecv]);
+    }

-    plan->unpack(d_scratch,scratch_offset,
-                 d_out,out_offset,&plan->unpackplan[irecv]);
+    // unpack all messages from scratch -> out
+
+    for (i = 0; i < plan->nrecv; i++) {
+      MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
+
+      int scratch_offset = plan->recv_bufloc[irecv];
+      int out_offset = plan->recv_offset[irecv];
+
+      if (!plan->usegpu_aware)
+        Kokkos::deep_copy(d_scratch,plan->h_scratch);
+
+      plan->unpack(d_scratch,scratch_offset,
+                   d_out,out_offset,&plan->unpackplan[irecv]);
+    }
+  } else {
+    if (plan->commringlen > 0) {
+      int isend,irecv;
+
+      // populate send data
+      // buffers are allocated and count/displacement buffers
+      // are populated in remap_3d_create_plan_kokkos
+
+      int numpacked = 0;
+      for (isend = 0; isend < plan->commringlen; isend++) {
+        if (plan->sendcnts[isend]) {
+          plan->pack(d_in,plan->send_offset[numpacked],
+                      plan->d_sendbuf,plan->sdispls[isend],
+                      &plan->packplan[numpacked]);
+          numpacked++;
+        }
+        else if (plan->commringlist[isend] == me && plan->self) {
+          numpacked++;
+        }
+      }
+
+      if (!plan->usegpu_aware)
+        Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
+
+      MPI_Alltoallv(v_sendbuf, plan->sendcnts, plan->sdispls,
+                    MPI_FFT_SCALAR, v_scratch, plan->rcvcnts,
+                    plan->rdispls, MPI_FFT_SCALAR, plan->comm);
+
+      // unpack the data from the recv buffer into out
+
+      if (!plan->usegpu_aware)
+        Kokkos::deep_copy(d_scratch,plan->h_scratch);
+
+      // copy in -> scratch -> out for self data
+
+      if (plan->self) {
+        plan->pack(d_in,plan->send_offset[plan->selfnsendloc],
+                  plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
+                  &plan->packplan[plan->selfnsendloc]);
+        plan->unpack(plan->d_sendbuf,plan->sdispls[plan->selfcommringloc],
+                    d_out,plan->recv_offset[plan->selfnrecvloc],
+                    &plan->unpackplan[plan->selfnrecvloc]);
+      }
+
+      numpacked = 0;
+      for (irecv = 0; irecv < plan->commringlen; irecv++) {
+        if (plan->rcvcnts[irecv]) {
+          plan->unpack(d_scratch,plan->rdispls[irecv],
+                       d_out,plan->recv_offset[numpacked],
+                       &plan->unpackplan[numpacked]);
+          numpacked++;
+        }
+        else if (plan->commringlist[irecv] == me && plan->self) {
+          numpacked++;
+        }
+      }
+    }
  }
 }

@ -263,224 +323,488 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
  out.khi = out_khi;
  out.ksize = out.khi - out.klo + 1;

-  // combine output extents across all procs
-
  inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
  if (inarray == nullptr) return nullptr;

  outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
  if (outarray == nullptr) return nullptr;

-  MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
-                outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
-
-  // count send collides, including self
-
-  nsend = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
-  }
-
-  // malloc space for send info
-
-  if (nsend) {
-    plan->pack = PackKokkos<DeviceType>::pack_3d;
-
-    plan->send_offset = (int *) malloc(nsend*sizeof(int));
-    plan->send_size = (int *) malloc(nsend*sizeof(int));
-    plan->send_proc = (int *) malloc(nsend*sizeof(int));
-    plan->packplan = (struct pack_plan_3d *)
-      malloc(nsend*sizeof(struct pack_plan_3d));
-
-    if (plan->send_offset == nullptr || plan->send_size == nullptr ||
-        plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
-  }
-
-  // store send info, with self as last entry
-
-  nsend = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
-      plan->send_proc[nsend] = iproc;
-      plan->send_offset[nsend] = nqty *
-        ((overlap.klo-in.klo)*in.jsize*in.isize +
-         ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
-      plan->packplan[nsend].nfast = nqty*overlap.isize;
-      plan->packplan[nsend].nmid = overlap.jsize;
-      plan->packplan[nsend].nslow = overlap.ksize;
-      plan->packplan[nsend].nstride_line = nqty*in.isize;
-      plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
-      plan->packplan[nsend].nqty = nqty;
-      plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
-      nsend++;
-    }
-  }
-
-  // plan->nsend = # of sends not including self
-
-  if (nsend && plan->send_proc[nsend-1] == me) {
-    if (plan->usecollective) // for collectives include self in nsend list
-      plan->nsend = nsend;
-    else
-      plan->nsend = nsend - 1;
-  } else
-    plan->nsend = nsend;
-
-  // combine input extents across all procs
+  // combine input & output extents across all procs

  MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
                inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
+  MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
+                outarray,sizeof(struct extent_3d),MPI_BYTE,comm);

-  // count recv collides, including self
+  // for efficiency, handle collective & non-collective setup separately

-  nrecv = 0;
-  iproc = me;
-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
-  }
+  if (!plan->usecollective) {
+    // count send & recv collides, including self

-  // malloc space for recv info
-
-  if (nrecv) {
-    if (permute == 0)
-      plan->unpack = PackKokkos<DeviceType>::unpack_3d;
-    else if (permute == 1) {
-      if (nqty == 1)
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
-      else if (nqty == 2)
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
-      else
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
-    }
-    else if (permute == 2) {
-      if (nqty == 1)
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
-      else if (nqty == 2)
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
-      else
-        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
+    nsend = 0;
+    nrecv = 0;
+    for (i = 0; i < nprocs; i++) {
+      nsend += remap_3d_collide(&in,&outarray[i],&overlap);
+      nrecv += remap_3d_collide(&out,&inarray[i],&overlap);
    }

-    plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_size = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
-    plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
-    plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
-    plan->unpackplan = (struct pack_plan_3d *)
-      malloc(nrecv*sizeof(struct pack_plan_3d));
+    // malloc space for send & recv info

-    if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
-        plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
-        plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
-  }
+    if (nsend) {
+      plan->pack = PackKokkos<DeviceType>::pack_3d;

-  // store recv info, with self as last entry
+      plan->send_offset = (int *) malloc(nsend*sizeof(int));
+      plan->send_size = (int *) malloc(nsend*sizeof(int));
+      plan->send_proc = (int *) malloc(nsend*sizeof(int));
+      plan->packplan = (struct pack_plan_3d *)
+        malloc(nsend*sizeof(struct pack_plan_3d));

-  ibuf = 0;
-  nrecv = 0;
-  iproc = me;
+      if (plan->send_offset == nullptr || plan->send_size == nullptr ||
+          plan->send_proc == nullptr || plan->packplan == nullptr) return nullptr;
+    }

-  for (i = 0; i < nprocs; i++) {
-    iproc++;
-    if (iproc == nprocs) iproc = 0;
-    if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
-      plan->recv_proc[nrecv] = iproc;
-      plan->recv_bufloc[nrecv] = ibuf;
-
-      if (permute == 0) {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.klo-out.klo)*out.jsize*out.isize +
-           (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
-        plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
-        plan->unpackplan[nrecv].nqty = nqty;
-      }
+    if (nrecv) {
+      if (permute == 0)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d;
      else if (permute == 1) {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
-           (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
-        plan->unpackplan[nrecv].nfast = overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
-        plan->unpackplan[nrecv].nqty = nqty;
+        if (nqty == 1)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
+        else if (nqty == 2)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
+        else
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
      }
-      else {
-        plan->recv_offset[nrecv] = nqty *
-          ((overlap.jlo-out.jlo)*out.isize*out.ksize +
-           (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
-        plan->unpackplan[nrecv].nfast = overlap.isize;
-        plan->unpackplan[nrecv].nmid = overlap.jsize;
-        plan->unpackplan[nrecv].nslow = overlap.ksize;
-        plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
-        plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
-        plan->unpackplan[nrecv].nqty = nqty;
+      else if (permute == 2) {
+        if (nqty == 1)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
+        else if (nqty == 2)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
+        else
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
      }

-      plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
-      ibuf += plan->recv_size[nrecv];
-      nrecv++;
+      plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_size = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
+      plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
+      plan->unpackplan = (struct pack_plan_3d *)
+        malloc(nrecv*sizeof(struct pack_plan_3d));
+
+      if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
+          plan->recv_proc == nullptr || plan->recv_bufloc == nullptr ||
+          plan->request == nullptr || plan->unpackplan == nullptr) return nullptr;
    }
+
+    // store send info, with self as last entry
+
+    nsend = 0;
+    iproc = me;
+    for (i = 0; i < nprocs; i++) {
+      iproc++;
+      if (iproc == nprocs) iproc = 0;
+      if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
+        plan->send_proc[nsend] = iproc;
+        plan->send_offset[nsend] = nqty *
+          ((overlap.klo-in.klo)*in.jsize*in.isize +
+          ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
+        plan->packplan[nsend].nfast = nqty*overlap.isize;
+        plan->packplan[nsend].nmid = overlap.jsize;
+        plan->packplan[nsend].nslow = overlap.ksize;
+        plan->packplan[nsend].nstride_line = nqty*in.isize;
+        plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
+        plan->packplan[nsend].nqty = nqty;
+        plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        nsend++;
+      }
+    }
+
+    // plan->nsend = # of sends not including self
+
+    if (nsend && plan->send_proc[nsend-1] == me) plan->nsend = nsend - 1;
+    else plan->nsend = nsend;
+
+    // store recv info, with self as last entry
+
+    ibuf = 0;
+    nrecv = 0;
+    iproc = me;
+
+    for (i = 0; i < nprocs; i++) {
+      iproc++;
+      if (iproc == nprocs) iproc = 0;
+      if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
+        plan->recv_proc[nrecv] = iproc;
+        plan->recv_bufloc[nrecv] = ibuf;
+
+        if (permute == 0) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.klo-out.klo)*out.jsize*out.isize +
+            (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
+          plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else if (permute == 1) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
+            (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.jlo-out.jlo)*out.isize*out.ksize +
+            (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+
+        plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        ibuf += plan->recv_size[nrecv];
+        nrecv++;
+      }
+    }
+
+    // plan->nrecv = # of recvs not including self
+
+    if (nrecv && plan->recv_proc[nrecv-1] == me) plan->nrecv = nrecv - 1;
+    else plan->nrecv = nrecv;
+
+    // init remaining fields in remap plan
+
+    plan->memory = memory;
+
+    if (nrecv == plan->nrecv) plan->self = 0;
+    else plan->self = 1;
+
+
+    // the plan->d_sendbuf and plan->d_recvbuf are used by both the
+    // collective & non-collective implementations.
+    // For non-collective, the buffer size is MAX(send_size) for any one send
+
+    // find biggest send message (not including self) and malloc space for it
+
+    size = 0;
+    for (nsend = 0; nsend < plan->nsend; nsend++)
+      size = MAX(size,plan->send_size[nsend]);
+
+    if (size) {
+      plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
+      if (!plan->d_sendbuf.data()) return nullptr;
+    }
+
+    // if requested, allocate internal scratch space for recvs,
+    // only need it if I will receive any data (including self)
+
+    if (memory == 1) {
+      if (nrecv > 0) {
+        plan->d_scratch =
+          typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
+        if (!plan->d_scratch.data()) return nullptr;
+      }
+    }
+
+    // Non-collectives do not use MPI Communicator Groups
+
+    MPI_Comm_dup(comm,&plan->comm);
+  } else {
+
+    // Improved approach - use an AllReduce to aggregate which ranks need to be included
+    // To do this, we build the local proc's send/receive list, then do an AllReduce
+    // to create the send/recv count for the Alltoallv
+
+    // local arrays to be used in the allreduce
+    // start with max length -- nprocs. Unused entries will be removed later
+
+    int *local_cnts = (int*) malloc(2*nprocs*sizeof(int));
+    if (local_cnts == nullptr) return nullptr;
+    int *local_sendcnts = local_cnts;
+    int *local_recvcnts = (local_cnts + nprocs);
+
+    // local arrays used to store the results of the allreduce
+
+    int *global_cnts = (int*) malloc(2*nprocs*sizeof(int));
+    if (global_cnts == nullptr) return nullptr;
+    int *global_sendcnts = global_cnts;
+    int *global_recvcnts = (global_cnts + nprocs);
+
+    // count send & recv collides, including self
+
+    nsend = 0;
+    nrecv = 0;
+    for (i = 0; i < nprocs; i++) {
+      local_sendcnts[i] = remap_3d_collide(&in,&outarray[i],&overlap);
+      local_recvcnts[i] = remap_3d_collide(&out,&inarray[i],&overlap);
+      nsend += local_sendcnts[i];
+      nrecv += local_recvcnts[i];
+    }
+
+    // perform an AllReduce to get the counts from all other processors and build sendcnts list
+
+    MPI_Allreduce(local_cnts, global_cnts, 2*nprocs, MPI_INT, MPI_SUM, comm);
+
+    // now remove procs that are 0 in send or recv to create minimized sendcnts/recvcnts for AlltoAllv
+    // also builds commringlist -- which is already sorted
+
+    int *commringlist = (int*) malloc(nprocs * sizeof(int));
+    int commringlen = 0;
+
+    for (i = 0; i < nprocs; i++) {
+      if (global_sendcnts[i] > 0 || global_recvcnts[i] > 0) {
+        commringlist[commringlen] = i;
+        commringlen++;
+      }
+    }
+
+    // resize commringlist to final size
+
+    commringlist = (int *) realloc(commringlist, commringlen*sizeof(int));
+
+    // set the plan->commringlist
+
+    plan->commringlen = commringlen;
+    plan->commringlist = commringlist;
+
+    // clean up local buffers that are finished
+
+    local_sendcnts = nullptr;
+    local_recvcnts = nullptr;
+    global_recvcnts = nullptr;
+    global_sendcnts = nullptr;
+    free(local_cnts);
+    free(global_cnts);
+
+    // malloc space for send & recv info
+    // if the current proc is involved in any way in the communication, allocate space
+    // because of the Alltoallv, both send and recv have to be initialized even if
+    // only one of those is performed
+
+    if (nsend || nrecv) {
+
+      // send space
+
+      plan->selfcommringloc = -1;
+      plan->selfnsendloc = -1;
+      plan->selfnrecvloc = -1;
+
+      plan->nsend = nsend;
+      plan->pack = PackKokkos<DeviceType>::pack_3d;
+
+      plan->send_offset = (int *) malloc(nsend*sizeof(int));
+      plan->send_size = (int *) malloc(plan->commringlen*sizeof(int));
+
+      plan->sendcnts = (int *) malloc(plan->commringlen*sizeof(int));
+      plan->sdispls = (int *) malloc(plan->commringlen*sizeof(int));
+
+      // only used when sendcnt > 0
+
+      plan->packplan = (struct pack_plan_3d *)
+        malloc(nsend*sizeof(struct pack_plan_3d));
+
+      if (plan->send_offset == nullptr || plan->send_size == nullptr ||
+          plan->sendcnts == nullptr || plan->sdispls == nullptr ||
+          plan->packplan == nullptr) return nullptr;
+
+      // recv space
+
+      plan->nrecv = nrecv;
+
+      if (permute == 0)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d;
+      else if (permute == 1) {
+        if (nqty == 1)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
+        else if (nqty == 2)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
+        else
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
+      }
+      else if (permute == 2) {
+        if (nqty == 1)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
+        else if (nqty == 2)
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
+        else
+          plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
+      }
+
+      plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
+      plan->recv_size = (int *) malloc(plan->commringlen*sizeof(int));
+
+      plan->rcvcnts = (int *) malloc(plan->commringlen*sizeof(int));
+      plan->rdispls = (int *) malloc(plan->commringlen*sizeof(int));
+
+      // only used when recvcnt > 0
+
+      plan->unpackplan = (struct pack_plan_3d *)
+        malloc(nrecv*sizeof(struct pack_plan_3d));
+
+      if (plan->recv_offset == nullptr || plan->recv_size == nullptr ||
+          plan->rcvcnts == nullptr || plan->rdispls == nullptr ||
+          plan->unpackplan == nullptr) return nullptr;
+    }
+
+    // store send info, with self as last entry
+
+    nsend = 0;
+    ibuf = 0;
+    int total_send_size = 0;
+    for (i = 0; i < plan->commringlen; i++) {
+      iproc = plan->commringlist[i];
+      if (iproc == me) {
+        plan->selfcommringloc = i;
+        plan->selfnsendloc = nsend;
+      }
+      if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
+        //plan->send_proc[nsend] = i;
+        // number of entries required for this pack's 3-d coords
+        plan->send_offset[nsend] = nqty *
+          ((overlap.klo-in.klo)*in.jsize*in.isize +
+            ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
+        plan->packplan[nsend].nfast = nqty*overlap.isize;
+        plan->packplan[nsend].nmid = overlap.jsize;
+        plan->packplan[nsend].nslow = overlap.ksize;
+        plan->packplan[nsend].nstride_line = nqty*in.isize;
+        plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
+        plan->packplan[nsend].nqty = nqty;
+        // total amount of overlap
+        plan->send_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->sendcnts[i] = plan->send_size[i];
+        plan->sdispls[i] = ibuf;
+        ibuf += plan->send_size[i];
+        nsend++;
+      } else {
+        plan->send_size[i] = 0;
+        plan->sdispls[i] = ibuf;
+        plan->sendcnts[i] = 0;
+      }
+      total_send_size += plan->send_size[i];
+    }
+
+    if (total_send_size) {
+      plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",total_send_size);
+      if (!plan->d_sendbuf.data()) return nullptr;
+    }
+
+    // store recv info, with self as last entry
+
+    ibuf = 0;
+    nrecv = 0;
+
+    for (i = 0; i < plan->commringlen; i++) {
+      iproc = plan->commringlist[i];
+      if (iproc == me) {
+        plan->selfnrecvloc = nrecv;
+      }
+      if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
+
+        if (permute == 0) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.klo-out.klo)*out.jsize*out.isize +
+              (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
+          plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else if (permute == 1) {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
+              (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+        else {
+          plan->recv_offset[nrecv] = nqty *
+            ((overlap.jlo-out.jlo)*out.isize*out.ksize +
+              (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
+          plan->unpackplan[nrecv].nfast = overlap.isize;
+          plan->unpackplan[nrecv].nmid = overlap.jsize;
+          plan->unpackplan[nrecv].nslow = overlap.ksize;
+          plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
+          plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
+          plan->unpackplan[nrecv].nqty = nqty;
+        }
+
+        plan->recv_size[i] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+        plan->rcvcnts[i] = plan->recv_size[i];
+        plan->rdispls[i] = ibuf;
+        ibuf += plan->recv_size[i];
+        nrecv++;
+      } else {
+        plan->recv_size[i] = 0;
+        plan->rcvcnts[i] = 0;
+        plan->rdispls[i] = ibuf;
+      }
+    }
+
+    // init remaining fields in remap plan
+
+    plan->memory = memory;
+
+    if (plan->sendcnts[plan->selfcommringloc]) {
+      plan->self = 1;
+      plan->sendcnts[plan->selfcommringloc] = 0;
+      plan->rcvcnts[plan->selfcommringloc] = 0;
+    }
+    else {
+      plan->self = 0;
+    }
+
+
+    // if requested, allocate internal scratch space for recvs,
+    // only need it if I will receive any data (including self)
+
+    if (memory == 1) {
+      if (nrecv > 0) {
+        plan->d_scratch =
+          typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
+        if (!plan->d_scratch.data()) return nullptr;
+      }
+    }
+
+    // if using collective and the commringlist is NOT empty create a
+    // communicator for the plan based off an MPI_Group created with
+    // ranks from the commringlist
+
+    if (plan->commringlen > 0) {
+      MPI_Group orig_group, new_group;
+      MPI_Comm_group(comm, &orig_group);
+      MPI_Group_incl(orig_group, plan->commringlen,
+                      plan->commringlist, &new_group);
+      MPI_Comm_create(comm, new_group, &plan->comm);
+    }
+
+    // if using collective and the comm ring list is empty create
+    // a communicator for the plan with an empty group
+
+    else
+      MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm);
  }

-  // plan->nrecv = # of recvs not including self
-  // for collectives include self in the nsend list
-
-  if (nrecv && plan->recv_proc[nrecv-1] == me) {
-    if (plan->usecollective) plan->nrecv = nrecv;
-    else plan->nrecv = nrecv - 1;
-  } else plan->nrecv = nrecv;
-
-  // init remaining fields in remap plan
-
-  plan->memory = memory;
-
-  if (nrecv == plan->nrecv) plan->self = 0;
-  else plan->self = 1;
-
  // free locally malloced space

  free(inarray);
  free(outarray);

-  // find biggest send message (not including self) and malloc space for it
-
-  size = 0;
-  for (nsend = 0; nsend < plan->nsend; nsend++)
-    size = MAX(size,plan->send_size[nsend]);
-
-  if (size) {
-    plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
-    if (!plan->d_sendbuf.data()) return nullptr;
-  }
-
-  // if requested, allocate internal scratch space for recvs,
-  // only need it if I will receive any data (including self)
-
-  if (memory == 1) {
-    if (nrecv > 0) {
-      plan->d_scratch =
-        typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
-      if (!plan->d_scratch.data()) return nullptr;
-    }
-  }
-
-  // not using collective - dup comm
-
-  MPI_Comm_dup(comm,&plan->comm);
-
  // return pointer to plan

  return plan;
@ -500,22 +824,45 @@ void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_
  if (!((plan->usecollective) && (plan->commringlen == 0)))
    MPI_Comm_free(&plan->comm);

-  // free internal arrays
+  if (plan->usecollective) {
+    if (plan->commringlist != nullptr) {
+      free(plan->commringlist);
+      free(plan->sendcnts);
+      free(plan->rcvcnts);
+      free(plan->sdispls);
+      free(plan->rdispls);
+    }

-  if (plan->nsend || plan->self) {
-    free(plan->send_offset);
-    free(plan->send_size);
-    free(plan->send_proc);
-    free(plan->packplan);
-  }
+    if (plan->nsend) {
+      free(plan->send_offset);
+      free(plan->send_size);
+      free(plan->packplan);
+    }

-  if (plan->nrecv || plan->self) {
-    free(plan->recv_offset);
-    free(plan->recv_size);
-    free(plan->recv_proc);
-    free(plan->recv_bufloc);
-    free(plan->request);
-    free(plan->unpackplan);
+    if (plan->nrecv) {
+      free(plan->recv_offset);
+      free(plan->recv_size);
+      free(plan->unpackplan);
+    }
+  } else {
+
+    // free arrays used in pt2pt communication
+
+    if (plan->nsend || plan->self) {
+      free(plan->send_offset);
+      free(plan->send_size);
+      free(plan->send_proc);
+      free(plan->packplan);
+    }
+
+    if (plan->nrecv || plan->self) {
+      free(plan->recv_offset);
+      free(plan->recv_size);
+      free(plan->recv_proc);
+      free(plan->recv_bufloc);
+      free(plan->request);
+      free(plan->unpackplan);
+    }
  }

  // free plan itself
--- a/src/KOKKOS/remap_kokkos.h
+++ b/src/KOKKOS/remap_kokkos.h
@ -44,6 +44,7 @@ struct remap_plan_3d_kokkos {
  int *recv_size;                   // size of each recv message
  int *recv_proc;                   // proc to recv each message from
  int *recv_bufloc;                 // offset in scratch buf for each recv
+  int *nrecvmap;                    // maps receive index to rank index
  MPI_Request *request;             // MPI request for each posted recv
  struct pack_plan_3d *unpackplan;  // unpack plan for each recv message
  int nrecv;                        // # of recvs from other procs
@ -52,9 +53,17 @@ struct remap_plan_3d_kokkos {
  int memory;                       // user provides scratch space or not
  MPI_Comm comm;                    // group of procs performing remap
  int usecollective;                // use collective or point-to-point MPI
+  int usegpu_aware;                 // use GPU-Aware MPI or not
+  // variables for collective MPI only
  int commringlen;                  // length of commringlist
  int *commringlist;                // ranks on communication ring of this plan
-  int usegpu_aware;                 // use GPU-Aware MPI or not
+  int *sendcnts;                    // # of elements in send buffer for each rank
+  int *rcvcnts;                     // # of elements in recv buffer for each rank
+  int *sdispls;                     // extraction location in send buffer for each rank
+  int *rdispls;                     // extraction location in recv buffer for each rank
+  int selfcommringloc;              // current proc's location in commringlist
+  int selfnsendloc;                 // current proc's location in send lists
+  int selfnrecvloc;                 // current proc's location in recv lists
 };

 template<class DeviceType>
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@ -1384,15 +1384,20 @@ void PPPM::set_grid_local()
  // npey_fft,npez_fft = # of procs in y,z dims
  // if nprocs is small enough, proc can own 1 or more entire xy planes,
  //   else proc owns 2d sub-blocks of yz plane
+  //   NOTE: commented out lines support this
+  //     need to ensure fft3d.cpp and remap.cpp support 2D planes
  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
  // nlo_fft,nhi_fft = lower/upper limit of the section
  //   of the global FFT mesh that I own in x-pencil decomposition

  int npey_fft,npez_fft;
-  if (nz_pppm >= nprocs) {
-    npey_fft = 1;
-    npez_fft = nprocs;
-  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  //if (nz_pppm >= nprocs) {
+  //  npey_fft = 1;
+  //  npez_fft = nprocs;
+  //} else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);

  int me_y = me % npey_fft;
  int me_z = me / npey_fft;