From 82be3ee32c24b033b75f4516672a67ac412ae28c Mon Sep 17 00:00:00 2001
From: Stan Moore
Date: Tue, 9 Apr 2019 09:17:07 -0600
Subject: [PATCH] Only use team with full neigh list

---
 doc/src/package.txt        |  12 ++--
 src/KOKKOS/comm_kokkos.cpp | 128 ++++++++++++++++++-------------------
 src/KOKKOS/kokkos.h        |   4 +-
 src/KOKKOS/pair_kokkos.h   |   2 +-
 4 files changed, 75 insertions(+), 71 deletions(-)

diff --git a/doc/src/package.txt b/doc/src/package.txt
index aef35d8d13..b6759bf2e9 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -450,10 +450,14 @@ qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of
 If the {neigh/thread} keyword is set to {off}, then the KOKKOS package
 threads only over atoms. However, for small systems, this may not expose
 enough parallelism to keep a GPU busy. When this keyword is set to {on},
-the KOKKOS package threads over both atoms and neighbors of atoms.
-Using {neigh/thread} {on} may be slower for large systems, so this this
-option is turned on by default only when there are 16K atoms or less
-owned by an MPI rank.
+the KOKKOS package threads over both atoms and neighbors of atoms. When
+using {neigh/thread} {on}, a full neighbor list must also be used. Using
+{neigh/thread} {on} may be slower for large systems, so this option is
+turned on by default only when an MPI rank owns 16K atoms or fewer and
+a full neighbor list is in use. Not all KOKKOS-enabled potentials
+support this keyword yet; those that do not thread only over atoms.
+Many simple pairwise potentials such as Lennard-Jones do support
+threading over both atoms and neighbors.
 
 The {newton} keyword sets the Newton flags for pairwise and bonded
 interactions to {off} or {on}, the same as the "newton"_newton.html
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 720a79617f..c496065ea0 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -194,75 +194,75 @@ void CommKokkos::forward_comm_device(int dummy)
                              k_firstrecv,k_pbc_flag,k_pbc,k_g2l);
   } else {
-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me) {
-      if (comm_x_only) {
-        if (size_forward_recv[iswap]) {
-          buf = atomKK->k_x.view<DeviceType>().data() +
-            firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
-          MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
-            recvproc[iswap],0,world,&request);
+    for (int iswap = 0; iswap < nswap; iswap++) {
+      if (sendproc[iswap] != me) {
+        if (comm_x_only) {
+          if (size_forward_recv[iswap]) {
+            buf = atomKK->k_x.view<DeviceType>().data() +
+              firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
+            MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
+              recvproc[iswap],0,world,&request);
+          }
+          n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
+            iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n) {
+            MPI_Send(k_buf_send.view<DeviceType>().data(),
+              n,MPI_DOUBLE,sendproc[iswap],0,world);
+          }
+
+          if (size_forward_recv[iswap]) {
+            MPI_Wait(&request,MPI_STATUS_IGNORE);
+            atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+              space,X_MASK);
+          }
+        } else if (ghost_velocity) {
+          if (size_forward_recv[iswap]) {
+            MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
+              size_forward_recv[iswap],MPI_DOUBLE,
+              recvproc[iswap],0,world,&request);
+          }
+          n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
+            k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n) {
+            MPI_Send(k_buf_send.view<DeviceType>().data(),n,
+              MPI_DOUBLE,sendproc[iswap],0,world);
+          }
+          if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+          avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
+          DeviceType::fence();
+        } else {
+          if (size_forward_recv[iswap])
+            MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
+              size_forward_recv[iswap],MPI_DOUBLE,
+              recvproc[iswap],0,world,&request);
+          n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
+            k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          if (n)
+            MPI_Send(k_buf_send.view<DeviceType>().data(),n,
+              MPI_DOUBLE,sendproc[iswap],0,world);
+          if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+          avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
+          DeviceType::fence();
         }
-        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
-          iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n) {
-          MPI_Send(k_buf_send.view<DeviceType>().data(),
-            n,MPI_DOUBLE,sendproc[iswap],0,world);
-        }
-
-        if (size_forward_recv[iswap]) {
-          MPI_Wait(&request,MPI_STATUS_IGNORE);
-          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-            space,X_MASK);
-        }
-      } else if (ghost_velocity) {
-        if (size_forward_recv[iswap]) {
-          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
-            size_forward_recv[iswap],MPI_DOUBLE,
-            recvproc[iswap],0,world,&request);
-        }
-        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
-          k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n) {
-          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
-            MPI_DOUBLE,sendproc[iswap],0,world);
-        }
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
-        DeviceType::fence();
       } else {
-        if (size_forward_recv[iswap])
-          MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
-            size_forward_recv[iswap],MPI_DOUBLE,
-            recvproc[iswap],0,world,&request);
-        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
-          k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        if (n)
-          MPI_Send(k_buf_send.view<DeviceType>().data(),n,
-            MPI_DOUBLE,sendproc[iswap],0,world);
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
-        DeviceType::fence();
-      }
-    } else {
-      if (!ghost_velocity) {
-        if (sendnum[iswap])
-          n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
-            firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-      } else {
-        n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
-          k_buf_send,pbc_flag[iswap],pbc[iswap]);
-        DeviceType::fence();
-        avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
-        DeviceType::fence();
+        if (!ghost_velocity) {
+          if (sendnum[iswap])
+            n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
+              firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+        } else {
+          n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
+            k_buf_send,pbc_flag[iswap],pbc[iswap]);
+          DeviceType::fence();
+          avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
+          DeviceType::fence();
+        }
       }
     }
   }
-  }
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 3804d24040..ad41c83949 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -89,8 +89,8 @@ U: Must use Kokkos half/thread or full neighbor list with threads or GPUs
 
 Using Kokkos half-neighbor lists with threading is not allowed.
 
-E: Must use KOKKOS package option 'neigh full' with 'neigh_thread on'
+E: Must use KOKKOS package option 'neigh full' with 'neigh/thread on'
 
-The 'neigh_thread on' option requires a full neighbor list
+The 'neigh/thread on' option requires a full neighbor list
 
 */
diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 04d756932a..9ca5d9578d 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -868,7 +868,7 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, typename Kokkos::Impl::enable
   EV_FLOAT ev;
 
   if (!fpair->lmp->kokkos->neigh_thread_set)
-    if (list->inum <= 16384)
+    if (list->inum <= 16384 && NEIGHFLAG == FULL)
      fpair->lmp->kokkos->neigh_thread = 1;
 
   if (fpair->lmp->kokkos->neigh_thread) {
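
Illustrative note, not part of the patch itself: the pair_kokkos.h change means the team-based "thread over atoms and neighbors" mode is now only selected by default when the user has not set neigh/thread explicitly, the rank owns at most 16384 atoms, and a full neighbor list is in use. A minimal standalone C++ sketch of that selection rule follows; the helper names and the small driver are hypothetical, not LAMMPS source.

  // Sketch of the default rule: team threading over atoms and neighbors is
  // enabled only for small per-rank systems that use a full neighbor list.
  #include <cstdio>

  enum NeighFlag { HALF, HALFTHREAD, FULL };

  bool use_neigh_thread(bool user_set, bool user_value,
                        int inum, NeighFlag neighflag)
  {
    if (user_set) return user_value;            // an explicit package setting wins
    return inum <= 16384 && neighflag == FULL;  // default this patch introduces
  }

  int main()
  {
    // 10000 owned atoms with a full list: team threading is on by default
    std::printf("%d\n", use_neigh_thread(false, false, 10000, FULL));
    // the same size with a half list: stays off, which is the point of the patch
    std::printf("%d\n", use_neigh_thread(false, false, 10000, HALF));
    return 0;
  }

To request the behavior explicitly rather than relying on the default, the documented package options would be combined along the lines of "package kokkos neigh full neigh/thread on" in the input script (or "-pk kokkos neigh full neigh/thread on" on the command line), since neigh/thread on requires a full neighbor list.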