From e400e5b6f7be6530db7ca63ca4f422d6dc4e8fee Mon Sep 17 00:00:00 2001 From: Emily Kahl Date: Wed, 14 Jul 2021 14:46:54 +1000 Subject: [PATCH] Fixed bug in PPPMKokkos::setup_triclinic for MPI calculations. This fix should probably be considered a temporary fix - it relies on a 3-dimensional Kokkos range which seems to be disfavoured in the rest of LAMMPS' codebase. --- src/KOKKOS/pppm_kokkos.cpp | 122 +++---------------------------------- src/KOKKOS/pppm_kokkos.h | 3 +- 2 files changed, 10 insertions(+), 115 deletions(-) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 98b6492e48..3fe93f6825 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -455,7 +455,6 @@ void PPPMKokkos::operator()(TagPPPM_setup4, const int &n) const template void PPPMKokkos::setup_triclinic() { - int i,j,k,n; double *prd; // volume-dependent factors @@ -480,13 +479,9 @@ void PPPMKokkos::setup_triclinic() delzinv = nz_pppm; delvolinv = delxinv*delyinv*delzinv/volume; - // merge three outer loops into one for better threading - numz_fft = nzhi_fft-nzlo_fft + 1; - numy_fft = nyhi_fft-nylo_fft + 1; - numx_fft = nxhi_fft-nxlo_fft + 1; - const int inum_fft = numx_fft*numy_fft*numz_fft; copymode = 1; - Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); + Kokkos::parallel_for(Kokkos::MDRangePolicy, DeviceType, TagPPPM_setup_triclinic1>\ + ({nzlo_fft, nylo_fft, nxlo_fft}, {nzhi_fft+1, nyhi_fft+1, nxhi_fft+1}),*this); copymode = 0; // virial coefficients @@ -500,14 +495,14 @@ void PPPMKokkos::setup_triclinic() template KOKKOS_INLINE_FUNCTION -void PPPMKokkos::operator()(TagPPPM_setup_triclinic1, const int &n) const +void PPPMKokkos::operator()(TagPPPM_setup_triclinic1, const int &k, const int &j, const int& i) const { - const int k = n/(numy_fft*numx_fft); - const int j = (n - k*numy_fft*numx_fft) / numx_fft; - const int i = n - k*numy_fft*numx_fft - j*numx_fft; double per_k = k - nz_pppm*(2*k/nz_pppm); double per_j = j - ny_pppm*(2*j/ny_pppm); double per_i = i - nx_pppm*(2*i/nx_pppm); + // n corresponds to the "number" of this iteration if we were to execute the loop monotonically and in serial + int n = (nxhi_fft - nxlo_fft + 1)*(nyhi_fft - nylo_fft + 1)*(k - nzlo_fft)+ + (nxhi_fft - nxlo_fft + 1)*(j - nylo_fft) + (i - nxlo_fft); double unitk_lamda[3]; unitk_lamda[0] = 2.0*MY_PI*per_i; @@ -1244,7 +1239,7 @@ void PPPMKokkos::compute_gf_ik() numz_fft = nzhi_fft-nzlo_fft + 1; numy_fft = nyhi_fft-nylo_fft + 1; numx_fft = nxhi_fft-nxlo_fft + 1; - const int inum_fft = numz_fft*numy_fft*numx_fft; + const int inum_fft = numx_fft*numy_fft*numz_fft; copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); @@ -1883,76 +1878,12 @@ void PPPMKokkos::operator()(TagPPPM_poisson_ik10, const int &ii) con template void PPPMKokkos::poisson_ik_triclinic() { -/************************************************** - int i,j,k,n; - // compute gradients of V(r) in each of 3 dims by transforming ik*V(k) // FFT leaves data in 3d brick decomposition // copy it into inner portion of vdx,vdy,vdz arrays // x direction gradient - n = 0; - for (i = 0; i < nfft; i++) { // parallel_for1 - d_work2[n] = -d_fkx[i]*d_work1[n+1]; - d_work2[n+1] = d_fkx[i]*d_work1[n]; - n += 2; - } - - fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - - n = 0; - for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2 - - - // y direction gradient - - n = 0; - for (i = 0; i < nfft; i++) { // parallel_for3 - d_work2[n] = -d_fky[i]*d_work1[n+1]; - d_work2[n+1] = d_fky[i]*d_work1[n]; - n += 2; - } - - fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - - n = 0; - for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4 - for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++) - for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) { - d_vdy_brick(k,j,i) = d_work2[n]; - n += 2; - } - - // z direction gradient - - n = 0; - for (i = 0; i < nfft; i++) { // parallel_for5 - d_work2[n] = -d_fkz[i]*d_work1[n+1]; - d_work2[n+1] = d_fkz[i]*d_work1[n]; - n += 2; - } - - fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - - n = 0; - for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6 -******************************/ - int i,j,k,n; - - // compute gradients of V(r) in each of 3 dims by transforming ik*V(k) - // FFT leaves data in 3d brick decomposition - // copy it into inner portion of vdx,vdy,vdz arrays - - // x direction gradient - - //n = 0; - //for (i = 0; i < nfft; i++) { - // d_work2[n] = -d_fkx[i]*d_work1[n+1]; - // d_work2[n+1] = d_fkx[i]*d_work1[n]; - // n += 2; - //} - // merge three outer loops into one for better threading numz_fft = nzhi_fft-nzlo_fft + 1; @@ -1971,68 +1902,30 @@ void PPPMKokkos::poisson_ik_triclinic() fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - //n = 0; - //for (k = nzlo_in; k <= nzhi_in; k++) - // for (j = nylo_in; j <= nyhi_in; j++) - // for (i = nxlo_in; i <= nxhi_in; i++) { - // d_vdx_brick(k,j,i) = d_work2[n]; - // n += 2; - // } - copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_inout),*this); copymode = 0; // y direction gradient - //n = 0; - //for (i = 0; i < nfft; i++) { - // d_work2[n] = -d_fky[i]*d_work1[n+1]; - // d_work2[n+1] = d_fky[i]*d_work1[n]; - // n += 2; - //} - copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - //n = 0; - //for (k = nzlo_in; k <= nzhi_in; k++) - // for (j = nylo_in; j <= nyhi_in; j++) - // for (i = nxlo_in; i <= nxhi_in; i++) { - // d_vdy_brick(k,j,i) = d_work2[n]; - // n += 2; - // } - copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_inout),*this); copymode = 0; // z direction gradient - //n = 0; - //for (i = 0; i < nfft; i++) { - // d_work2[n] = -d_fkz[i]*d_work1[n+1]; - // d_work2[n+1] = d_fkz[i]*d_work1[n]; - // n += 2; - //} - copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; fft2->compute(d_work2,d_work2,FFT3dKokkos::BACKWARD); - //n = 0; - //for (k = nzlo_in; k <= nzhi_in; k++) - // for (j = nylo_in; j <= nyhi_in; j++) - // for (i = nxlo_in; i <= nxhi_in; i++) { - // d_vdz_brick(k,j,i) = d_work2[n]; - // n += 2; - // } - copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_inout),*this); copymode = 0; @@ -3040,3 +2933,4 @@ template class PPPMKokkos; template class PPPMKokkos; #endif } + diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index c92210b748..e7c55b1726 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -141,7 +141,7 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT { void operator()(TagPPPM_setup4, const int&) const; KOKKOS_INLINE_FUNCTION - void operator()(TagPPPM_setup_triclinic1, const int&) const; + void operator()(TagPPPM_setup_triclinic1, const int&, const int&, const int&) const; KOKKOS_INLINE_FUNCTION void operator()(TagPPPM_setup_triclinic2, const int&) const; @@ -623,3 +623,4 @@ accuracy. This error should not occur for typical problems. Please send an email to the developers. */ +