From f02fe15ec23c127db2448049ec71bda9c75f32f1 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 12 Jun 2019 15:47:33 -0600 Subject: [PATCH 01/48] Add Kokkos version of FFTs --- src/KOKKOS/Install.sh | 7 + src/KOKKOS/fft3d_kokkos.cpp | 673 +++++++++++++++++++++++++++++++ src/KOKKOS/fft3d_kokkos.h | 99 +++++ src/KOKKOS/fftdata_kokkos.h | 95 +++++ src/KOKKOS/kissfft_kokkos.cpp | 537 +++++++++++++++++++++++++ src/KOKKOS/kissfft_kokkos.h | 572 ++++++++++++++++++++++++++ src/KOKKOS/pack_kokkos.h | 424 ++++++++++++++++++++ src/KOKKOS/pppm_kokkos.cpp | 117 ++---- src/KOKKOS/pppm_kokkos.h | 6 +- src/KOKKOS/remap_kokkos.cpp | 733 ++++++++++++++++++++++++++++++++++ src/KOKKOS/remap_kokkos.h | 87 ++++ 11 files changed, 3265 insertions(+), 85 deletions(-) create mode 100644 src/KOKKOS/fft3d_kokkos.cpp create mode 100644 src/KOKKOS/fft3d_kokkos.h create mode 100644 src/KOKKOS/fftdata_kokkos.h create mode 100644 src/KOKKOS/kissfft_kokkos.cpp create mode 100644 src/KOKKOS/kissfft_kokkos.h create mode 100644 src/KOKKOS/pack_kokkos.h create mode 100644 src/KOKKOS/remap_kokkos.cpp create mode 100644 src/KOKKOS/remap_kokkos.h diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 7c465128d8..941cecad9b 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -95,6 +95,9 @@ action dihedral_opls_kokkos.cpp dihedral_opls.cpp action dihedral_opls_kokkos.h dihedral_opls.h action domain_kokkos.cpp action domain_kokkos.h +action fftdata_kokkos.h fft3d.h +action fft3d_kokkos.cpp fft3d.cpp +action fft3d_kokkos.h fft3d.h action fix_deform_kokkos.cpp action fix_deform_kokkos.h action fix_enforce2d_kokkos.cpp @@ -149,6 +152,7 @@ action improper_class2_kokkos.cpp improper_class2.cpp action improper_class2_kokkos.h improper_class2.h action improper_harmonic_kokkos.cpp improper_harmonic.cpp action improper_harmonic_kokkos.h improper_harmonic.h +action kissfft_kokkos.h kissfft.h action kokkos.cpp action kokkos.h action kokkos_base.h @@ -179,6 +183,7 @@ action nbin_ssa_kokkos.cpp nbin_ssa.cpp action nbin_ssa_kokkos.h nbin_ssa.h action math_special_kokkos.cpp action math_special_kokkos.h +action pack_kokkos.h pack.h action pair_buck_coul_cut_kokkos.cpp action pair_buck_coul_cut_kokkos.h action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp @@ -275,6 +280,8 @@ action rand_pool_wrap_kokkos.cpp action rand_pool_wrap_kokkos.h action region_block_kokkos.cpp action region_block_kokkos.h +action remap_kokkos.cpp remap.cpp +action remap_kokkos.h remap.h action sna_kokkos.h sna.h action sna_kokkos_impl.h sna.cpp action verlet_kokkos.cpp diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp new file mode 100644 index 0000000000..c5120f2bda --- /dev/null +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -0,0 +1,673 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include "fft3d_kokkos.h" +#include "remap_kokkos.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf, int usecollective) : + Pointers(lmp) +{ + plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + scaled,permute,nbuf,usecollective); + if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); +} + +/* ---------------------------------------------------------------------- */ + +template +FFT3dKokkos::~FFT3dKokkos() +{ + fft_3d_destroy_plan_kokkos(plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void FFT3dKokkos::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag) +{ + typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2); + typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2); + + fft_3d_kokkos(d_in_data,d_out_data,flag,plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag) +{ + typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()); + + fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan); +} + +#include +#include +#include +#include +#include "fft3d_kokkos.h" +#include "remap_kokkos.h" + +// include kissfft implementation + +#include "kissfft_kokkos.h" + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/* ---------------------------------------------------------------------- + Data layout for 3d FFTs: + + data set of Nfast x Nmid x Nslow elements is owned by P procs + on input, each proc owns a subsection of the elements + on output, each proc will own a (possibly different) subsection + my subsection must not overlap with any other proc's subsection, + i.e. the union of all proc's input (or output) subsections must + exactly tile the global Nfast x Nmid x Nslow data set + when called from C, all subsection indices are + C-style from 0 to N-1 where N = Nfast or Nmid or Nslow + when called from F77, all subsection indices are + F77-style from 1 to N where N = Nfast or Nmid or Nslow + a proc can own 0 elements on input or output + by specifying hi index < lo index + on both input and output, data is stored contiguously on a processor + with a fast-varying, mid-varying, and slow-varying index +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Perform 3d FFT + + Arguments: + in starting address of input data on this proc + out starting address of where output data for this proc + will be placed (can be same as in) + flag 1 for forward FFT, -1 for inverse FFT + plan plan returned by previous call to fft_3d_create_plan +------------------------------------------------------------------------- */ + +#include "kokkos_type.h" + +template +struct kiss_fft_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_DATA_1d_um d_data,d_tmp; + kiss_fft_state_kokkos st; + int length; + + kiss_fft_functor() {} + + kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos &st_, int length_): + d_data(d_data_), + d_tmp(d_tmp_), + st(st_) + { + length = length_; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + const int offset = i*length; + KissFFTKokkos::kiss_fft_kokkos(st,d_data,d_tmp,offset); + } +}; + +template +struct norm_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_DATA_1d_um d_out; + int norm; + + norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): + d_out(d_out_) + { + norm = norm_; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + d_out(i,0) *= norm; + d_out(i,1) *= norm; + } +}; + +template +void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_in, typename ArrayTypes::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) +{ + int i,total,length,offset,num; + FFT_SCALAR norm; + typename ArrayTypes::t_FFT_DATA_1d d_data,d_copy; + typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + typename ArrayTypes::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; + kiss_fft_functor f; + + // pre-remap to prepare for 1st FFTs if needed + // copy = loc for remap result + + if (plan->pre_plan) { + if (plan->pre_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_in_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_in.data(),d_in.size()); + d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar, + d_scratch_scalar, plan->pre_plan); + + d_data = d_copy; + } else d_data = d_in; + + // 1d FFTs along fast axis + + total = plan->total1; + length = plan->length1; + + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + + d_data = d_tmp; + d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + + // 1st mid-remap to prepare for 2nd FFTs + // copy = loc for remap result + + if (plan->mid1_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2); + d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2); + d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2); + + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + d_scratch_scalar, plan->mid1_plan); + + d_data = d_copy; + + // 1d FFTs along mid axis + + total = plan->total2; + length = plan->length2; + + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + + d_data = d_tmp; + d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + + // 2nd mid-remap to prepare for 3rd FFTs + // copy = loc for remap result + + if (plan->mid2_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + d_scratch_scalar, plan->mid2_plan); + + d_data = d_copy; + + // 1d FFTs along slow axis + + total = plan->total3; + length = plan->length3; + + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + + d_data = d_tmp; + + // post-remap to put data in output format if needed + // destination is always out + + if (plan->post_plan) { + d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_out_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_out.data(),d_out.size()); + d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar, + d_scratch_scalar, plan->post_plan); + } + + // scaling if required + + if (flag == 1 && plan->scaled) { + norm = plan->norm; + num = plan->normnum; + norm_functor f(d_out,norm); + Kokkos::parallel_for(num,f); + DeviceType::fence(); + } + +} + +/* ---------------------------------------------------------------------- + Create plan for performing a 3d FFT + + Arguments: + comm MPI communicator for the P procs which own the d_data + nfast,nmid,nslow size of global 3d matrix + in_ilo,in_ihi input bounds of data I own in fast index + in_jlo,in_jhi input bounds of data I own in mid index + in_klo,in_khi input bounds of data I own in slow index + out_ilo,out_ihi output bounds of data I own in fast index + out_jlo,out_jhi output bounds of data I own in mid index + out_klo,out_khi output bounds of data I own in slow index + scaled 0 = no scaling of result, 1 = scaling + permute permutation in storage order of indices on output + 0 = no permutation + 1 = permute once = mid->fast, slow->mid, fast->slow + 2 = permute twice = slow->fast, fast->mid, mid->slow + nbuf returns size of internal storage buffers used by FFT + usecollective use collective MPI operations for remapping data +------------------------------------------------------------------------- */ + +template +struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_plan_kokkos ( + MPI_Comm comm, int nfast, int nmid, int nslow, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf, int usecollective) +{ + struct fft_plan_3d_kokkos *plan; + int me,nprocs; + int i,num,flag,remapflag,fftflag; + int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; + int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; + int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; + int out_size,first_size,second_size,third_size,copy_size,scratch_size; + int np1,np2,ip1,ip2; + int list[50]; + + // query MPI info + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&nprocs); + + // compute division of procs in 2 dimensions not on-processor + + bifactor(nprocs,&np1,&np2); + ip1 = me % np1; + ip2 = me/np1; + + // allocate memory for plan data struct + + plan = new struct fft_plan_3d_kokkos; + remapKK = new RemapKokkos(lmp); + kissfftKK = new KissFFTKokkos(); + if (plan == NULL) return NULL; + + // remap from initial distribution to layout needed for 1st set of 1d FFTs + // not needed if all procs own entire fast axis initially + // first indices = distribution after 1st set of FFTs + + if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + first_ilo = in_ilo; + first_ihi = in_ihi; + first_jlo = in_jlo; + first_jhi = in_jhi; + first_klo = in_klo; + first_khi = in_khi; + plan->pre_plan = NULL; + } + else { + first_ilo = 0; + first_ihi = nfast - 1; + first_jlo = ip1*nmid/np1; + first_jhi = (ip1+1)*nmid/np1 - 1; + first_klo = ip2*nslow/np2; + first_khi = (ip2+1)*nslow/np2 - 1; + plan->pre_plan = + remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi,2,0,0,FFT_PRECISION,0); + if (plan->pre_plan == NULL) return NULL; + } + + // 1d FFTs along fast axis + + plan->length1 = nfast; + plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); + + // remap from 1st to 2nd FFT + // choose which axis is split over np1 vs np2 to minimize communication + // second indices = distribution after 2nd set of FFTs + + second_ilo = ip1*nfast/np1; + second_ihi = (ip1+1)*nfast/np1 - 1; + second_jlo = 0; + second_jhi = nmid - 1; + second_klo = ip2*nslow/np2; + second_khi = (ip2+1)*nslow/np2 - 1; + plan->mid1_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi, + second_ilo,second_ihi,second_jlo,second_jhi, + second_klo,second_khi,2,1,0,FFT_PRECISION, + usecollective); + if (plan->mid1_plan == NULL) return NULL; + + // 1d FFTs along mid axis + + plan->length2 = nmid; + plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1); + + // remap from 2nd to 3rd FFT + // if final distribution is permute=2 with all procs owning entire slow axis + // then this remapping goes directly to final distribution + // third indices = distribution after 3rd set of FFTs + + if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + third_ilo = out_ilo; + third_ihi = out_ihi; + third_jlo = out_jlo; + third_jhi = out_jhi; + third_klo = out_klo; + third_khi = out_khi; + } + else { + third_ilo = ip1*nfast/np1; + third_ihi = (ip1+1)*nfast/np1 - 1; + third_jlo = ip2*nmid/np2; + third_jhi = (ip2+1)*nmid/np2 - 1; + third_klo = 0; + third_khi = nslow - 1; + } + + plan->mid2_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + second_jlo,second_jhi,second_klo,second_khi, + second_ilo,second_ihi, + third_jlo,third_jhi,third_klo,third_khi, + third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); + if (plan->mid2_plan == NULL) return NULL; + + // 1d FFTs along slow axis + + plan->length3 = nslow; + plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow; + + // remap from 3rd FFT to final distribution + // not needed if permute = 2 and third indices = out indices on all procs + + if (permute == 2 && + out_ilo == third_ilo && out_ihi == third_ihi && + out_jlo == third_jlo && out_jhi == third_jhi && + out_klo == third_klo && out_khi == third_khi) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) + plan->post_plan = NULL; + else { + plan->post_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + third_klo,third_khi,third_ilo,third_ihi, + third_jlo,third_jhi, + out_klo,out_khi,out_ilo,out_ihi, + out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); + if (plan->post_plan == NULL) return NULL; + } + + // configure plan memory pointers and allocate work space + // out_size = amount of memory given to FFT by user + // first/second/third_size = amount of memory needed after + // pre,mid1,mid2 remaps + // copy_size = amount needed internally for extra copy of data + // scratch_size = amount needed internally for remap scratch space + // for each remap: + // out space used for result if big enough, else require copy buffer + // accumulate largest required remap scratch space + + out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); + first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * + (first_khi-first_klo+1); + second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * + (second_khi-second_klo+1); + third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * + (third_khi-third_klo+1); + + copy_size = 0; + scratch_size = 0; + + if (plan->pre_plan) { + if (first_size <= out_size) + plan->pre_target = 0; + else { + plan->pre_target = 1; + copy_size = MAX(copy_size,first_size); + } + scratch_size = MAX(scratch_size,first_size); + } + + if (plan->mid1_plan) { + if (second_size <= out_size) + plan->mid1_target = 0; + else { + plan->mid1_target = 1; + copy_size = MAX(copy_size,second_size); + } + scratch_size = MAX(scratch_size,second_size); + } + + if (plan->mid2_plan) { + if (third_size <= out_size) + plan->mid2_target = 0; + else { + plan->mid2_target = 1; + copy_size = MAX(copy_size,third_size); + } + scratch_size = MAX(scratch_size,third_size); + } + + if (plan->post_plan) + scratch_size = MAX(scratch_size,out_size); + + *nbuf = copy_size + scratch_size; + + if (copy_size) { + plan->d_copy = typename ArrayTypes::t_FFT_DATA_1d("fft3d:copy",copy_size); + } + + if (scratch_size) { + plan->d_scratch = typename ArrayTypes::t_FFT_DATA_1d("fft3d:scratch",scratch_size); + } + + // system specific pre-computation of 1d FFT coeffs + // and scaling normalization + + plan->cfg_fast_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL); + plan->cfg_fast_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL); + + if (nmid == nfast) { + plan->cfg_mid_forward = plan->cfg_fast_forward; + plan->cfg_mid_backward = plan->cfg_fast_backward; + } + else { + plan->cfg_mid_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nmid,0,NULL,NULL); + plan->cfg_mid_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nmid,1,NULL,NULL); + } + + if (nslow == nfast) { + plan->cfg_slow_forward = plan->cfg_fast_forward; + plan->cfg_slow_backward = plan->cfg_fast_backward; + } + else if (nslow == nmid) { + plan->cfg_slow_forward = plan->cfg_mid_forward; + plan->cfg_slow_backward = plan->cfg_mid_backward; + } + else { + plan->cfg_slow_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL); + plan->cfg_slow_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL); + } + + if (scaled == 0) + plan->scaled = 0; + else { + plan->scaled = 1; + plan->norm = 1.0/(nfast*nmid*nslow); + plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); + } + + return plan; +} + +/* ---------------------------------------------------------------------- + Destroy a 3d fft plan +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *plan) +{ + if (plan->pre_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->pre_plan); + if (plan->mid1_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid1_plan); + if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan); + if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan); + + delete plan; + delete remapKK; + delete kissfftKK; +} + +/* ---------------------------------------------------------------------- + perform just the 1d FFTs needed by a 3d FFT, no data movement + used for timing purposes + + Arguments: + in starting address of input data on this proc, all set to 0.0 + nsize size of in + flag 1 for forward FFT, -1 for inverse FFT + plan plan returned by previous call to fft_3d_create_plan +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_data, int nsize, int flag, + struct fft_plan_3d_kokkos *plan) +{ + int i,total,length,offset,num; + typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); + kiss_fft_functor f; + + // total = size of data needed in each dim + // length = length of 1d FFT in each dim + // total/length = # of 1d FFTs in each dim + // if total > nsize, limit # of 1d FFTs to available size of data + + int total1 = plan->total1; + int length1 = plan->length1; + int total2 = plan->total2; + int length2 = plan->length2; + int total3 = plan->total3; + int length3 = plan->length3; + + // fftw3 and Dfti in MKL encode the number of transforms + // into the plan, so we cannot operate on a smaller data set + + if (total1 > nsize) total1 = (nsize/length1) * length1; + if (total2 > nsize) total2 = (nsize/length2) * length2; + if (total3 > nsize) total3 = (nsize/length3) * length3; + + // perform 1d FFTs in each of 3 dimensions + // data is just an array of 0.0 + + if (flag == -1) { + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length1); + Kokkos::parallel_for(total1/length1,f); + DeviceType::fence(); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length2); + Kokkos::parallel_for(total2/length2,f); + DeviceType::fence(); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length3); + Kokkos::parallel_for(total3/length3,f); + DeviceType::fence(); + } else { + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length1); + Kokkos::parallel_for(total1/length1,f); + DeviceType::fence(); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length2); + Kokkos::parallel_for(total2/length2,f); + DeviceType::fence(); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length3); + Kokkos::parallel_for(total3/length3,f); + DeviceType::fence(); + } + + // scaling if required + // limit num to size of data + + if (flag == 1 && plan->scaled) { + FFT_SCALAR norm = plan->norm; + num = MIN(plan->normnum,nsize); + norm_functor f(d_data,norm); + Kokkos::parallel_for(num,f); + DeviceType::fence(); + } +} + +namespace LAMMPS_NS { +template class FFT3dKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class FFT3dKokkos; +#endif +} diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h new file mode 100644 index 0000000000..6fcf140228 --- /dev/null +++ b/src/KOKKOS/fft3d_kokkos.h @@ -0,0 +1,99 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_FFT3D_KOKKOS_H +#define LMP_FFT3D_KOKKOS_H + +#include "pointers.h" +#include "fft3d.h" +#include "kokkos_type.h" +#include "remap_kokkos.h" +#include "kissfft_kokkos.h" + +namespace LAMMPS_NS { + +// ------------------------------------------------------------------------- + +// plan for how to perform a 3d FFT + +template +struct fft_plan_3d_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + + struct remap_plan_3d_kokkos *pre_plan; // remap from input -> 1st FFTs + struct remap_plan_3d_kokkos *mid1_plan; // remap from 1st -> 2nd FFTs + struct remap_plan_3d_kokkos *mid2_plan; // remap from 2nd -> 3rd FFTs + struct remap_plan_3d_kokkos *post_plan; // remap from 3rd FFTs -> output + typename AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed) + typename AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps + int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length) + int length1,length2,length3; // length of 1st,2nd,3rd FFTs + int pre_target; // where to put remap results + int mid1_target,mid2_target; + int scaled; // whether to scale FFT results + int normnum; // # of values to rescale + double norm; // normalization factor for rescaling + + // system specific 1d FFT info + kiss_fft_state_kokkos cfg_fast_forward; + kiss_fft_state_kokkos cfg_fast_backward; + kiss_fft_state_kokkos cfg_mid_forward; + kiss_fft_state_kokkos cfg_mid_backward; + kiss_fft_state_kokkos cfg_slow_forward; + kiss_fft_state_kokkos cfg_slow_backward; +}; + +template +class FFT3dKokkos : protected Pointers { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + FFT3dKokkos(class LAMMPS *, MPI_Comm, + int,int,int,int,int,int,int,int,int,int,int,int,int,int,int, + int,int,int *,int); + ~FFT3dKokkos(); + void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int); + void timing1d(typename AT::t_FFT_SCALAR_1d, int, int); + + private: + struct fft_plan_3d_kokkos *plan; + RemapKokkos *remapKK; + KissFFTKokkos *kissfftKK; + + void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos *); + + struct fft_plan_3d_kokkos *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, + int, int, int, int, int, + int, int, int, int, int, int, int, + int, int, int *, int); + + void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); + + void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos *); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Could not create 3d FFT plan + +The FFT setup for the PPPM solver failed, typically due +to lack of memory. This is an unusual error. Check the +size of the FFT grid you are requesting. + +*/ diff --git a/src/KOKKOS/fftdata_kokkos.h b/src/KOKKOS/fftdata_kokkos.h new file mode 100644 index 0000000000..64254870ca --- /dev/null +++ b/src/KOKKOS/fftdata_kokkos.h @@ -0,0 +1,95 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + + +// data types for 2d/3d FFTs + +#ifndef FFT_DATA_KOKKOS_H +#define FFT_DATA_KOKKOS_H + +#include "kokkos_type.h" + +// User-settable FFT precision + +// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) +// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag) + +#ifdef FFT_SINGLE +#define FFT_PRECISION 1 +#define MPI_FFT_SCALAR MPI_FLOAT +typedef float FFT_SCALAR; +#else +#define FFT_PRECISION 2 +#define MPI_FFT_SCALAR MPI_DOUBLE +typedef double FFT_SCALAR; +#endif + +// ------------------------------------------------------------------------- + +// Data types for single-precision complex + +#if FFT_PRECISION == 1 + +// use a stripped down version of kiss fft as default fft + +#ifndef FFT_KISSFFT +#define FFT_KISSFFT +#endif +#define kiss_fft_scalar_kokkos float +//typedef struct { +// kiss_fft_scalar re; +// kiss_fft_scalar im; +//} FFT_DATA; + +// ------------------------------------------------------------------------- + +// Data types for double-precision complex + +#elif FFT_PRECISION == 2 + +// use a stripped down version of kiss fft as default fft + +#ifndef FFT_KISSFFT +#define FFT_KISSFFT +#endif +#define kiss_fft_scalar_kokkos double +//typedef struct { +// kiss_fft_scalar re; +// kiss_fft_scalar im; +//} FFT_DATA; + +// ------------------------------------------------------------------------- + +#else +#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)" +#endif + +// ------------------------------------------------------------------------- + +#define MAXFACTORS 32 +/* e.g. an fft of length 128 has 4 factors + as far as kissfft is concerned: 4*4*4*2 */ +template +struct kiss_fft_state_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + int nfft; + int inverse; + typename AT::t_int_64 d_factors; + typename AT::t_FFT_DATA_1d d_twiddles; + typename AT::t_FFT_DATA_1d d_scratch; +}; + +#endif diff --git a/src/KOKKOS/kissfft_kokkos.cpp b/src/KOKKOS/kissfft_kokkos.cpp new file mode 100644 index 0000000000..6b878596ef --- /dev/null +++ b/src/KOKKOS/kissfft_kokkos.cpp @@ -0,0 +1,537 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* + we use a stripped down KISS FFT as default FFT for LAMMPS + this code is adapted from kiss_fft_v1_2_9 + homepage: http://kissfft.sf.net/ + + changes 2008-2011 by Axel Kohlmeyer +*/ + +#include +#include +#include +#include "kissfft_kokkos.h" +#include "fftdata_kokkos.h" + +#ifndef M_PI +#define M_PI 3.141592653589793238462643383279502884197169399375105820974944 +#endif + +using namespace LAMMPS_NS; + +/* + Copyright (c) 2003-2010, Mark Borgerding + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the author nor the names of any contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + Explanation of macros dealing with complex math: + + C_MUL(m,a,b) : m = a*b + C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise + C_SUB( res, a,b) : res = a - b + C_SUBFROM( res , a) : res -= a + C_ADDTO( res , a) : res += a + C_EQ( res , a) : res = a + * */ + +#define S_MUL(a,b) ( (a)*(b) ) + +#define C_MUL(m,a,a_index,b,b_index) \ + do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ + (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) + +#define C_FIXDIV(c,div) /* NOOP */ + + + +#define C_MULBYSCALAR( c, s ) \ + do{ (c)[0] *= (s);\ + (c)[1] *= (s); }while(0) + +#define C_ADD( res, a,b)\ + do { \ + (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ + }while(0) + +#define C_SUB( res, a,b)\ + do { \ + (res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \ + }while(0) + +#define C_ADDTO( res , a)\ + do { \ + (res)[0] += (a)[0]; (res)[1] += (a)[1];\ + }while(0) + +#define C_SUBFROM( res , a)\ + do {\ + (res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \ + }while(0) + +#define C_EQ(res, a)\ + do {\ + (res)[0] = (a)[0]; (res)[1] = (a)[1]; \ + }while(0) + + +#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase) +#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase) +#define HALF_OF(x) ((x)*.5) + +#define kf_cexp(x,x_index,phase) \ + do{ \ + (x)(x_index,0) = KISS_FFT_COS(phase);\ + (x)(x_index,1) = KISS_FFT_SIN(phase);\ + }while(0) + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) +{ + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Fout2_count; + int tw1_count = 0; + + Fout2_count = Fout_count + m; + do { + //C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2); + + C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count); + tw1_count += fstride; + //C_SUB(*Fout2,*Fout,t); + d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0]; + d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1]; + //C_ADDTO(d_Fout[Fout_count],t); + d_Fout(Fout_count,0) += t[0]; + d_Fout(Fout_count,1) += t[1]; + ++Fout2_count; + ++Fout_count; + } while(--m); +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, const size_t m, int Fout_count) +{ + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[6][2]; + size_t k=m; + const size_t m2=2*m; + const size_t m3=3*m; + + int tw3_count,tw2_count,tw1_count; + tw3_count = tw2_count = tw1_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4); + + C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count); + + //C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] ); + scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0]; + scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1]; + //C_ADDTO(d_Fout[Fout_count], scratch[1]); + d_Fout(Fout_count,0) += scratch[1][0]; + d_Fout(Fout_count,1) += scratch[1][1]; + //C_ADD(scratch[3],scratch[0],scratch[2]); + scratch[3][0] = scratch[0][0] + scratch[2][0]; + scratch[3][1] = scratch[0][1] + scratch[2][1]; + //C_SUB( scratch[4] , scratch[0] , scratch[2] ); + scratch[4][0] = scratch[0][0] - scratch[2][0]; + scratch[4][1] = scratch[0][1] - scratch[2][1]; + //C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1]; + + tw1_count += fstride; + tw2_count += fstride*2; + tw3_count += fstride*3; + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + if (st.inverse) { + d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0]; + } else{ + d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0]; + } + ++Fout_count; + } while(--k); +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, size_t m, int Fout_count) +{ + size_t k=m; + const size_t m2 = 2*m; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[5][2]; + FFT_SCALAR epi3[2]; + //C_EQ(epi3,d_twiddles[fstride*m]); + epi3[0] = d_twiddles(fstride*m,0); + epi3[1] = d_twiddles(fstride*m,1); + + int tw1_count,tw2_count; + tw1_count = tw2_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3); + + C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + + //C_ADD(scratch[3],scratch[1],scratch[2]); + scratch[3][0] = scratch[1][0] + scratch[2][0]; + scratch[3][1] = scratch[1][1] + scratch[2][1]; + //C_SUB(scratch[0],scratch[1],scratch[2]); + scratch[0][0] = scratch[1][0] - scratch[2][0]; + scratch[0][1] = scratch[1][1] - scratch[2][1]; + tw1_count += fstride; + tw2_count += fstride*2; + + d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]); + d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]); + + //C_MULBYSCALAR(scratch[0],epi3[1]); + scratch[0][0] *= epi3[1]; + scratch[0][1] *= epi3[1]; + + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0]; + + d_Fout(Fout_count + m,0) -= scratch[0][1]; + d_Fout(Fout_count + m,1) += scratch[0][0]; + + ++Fout_count; + } while(--k); +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) +{ + int u; + FFT_SCALAR scratch[13][2]; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR ya[2],yb[2]; + //C_EQ(ya,d_twiddles[fstride*m]); + ya[1] = d_twiddles(fstride*m,1); + ya[0] = d_twiddles(fstride*m,0); + //C_EQ(yb,d_twiddles[fstride*2*m]); + yb[1] = d_twiddles(fstride*2*m,1); + yb[0] = d_twiddles(fstride*2*m,0); + + int Fout0_count=Fout_count; + int Fout1_count=Fout0_count+m; + int Fout2_count=Fout0_count+2*m; + int Fout3_count=Fout0_count+3*m; + int Fout4_count=Fout0_count+4*m; + + for ( u=0; u +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int p, int Fout_count) +{ + int u,k,q1,q; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Norig = st.nfft; + + typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch; + for ( u=0; u=Norig) twidx-=Norig; + C_MUL(t,d_scratch,q,d_twiddles,twidx); + //C_ADDTO(d_Fout[k],t); + d_Fout(Fout_count + k,0) += t[0]; + d_Fout(Fout_count + k,1) += t[1]; + } + k += m; + } + } + //KISS_FFT_TMP_FREE(d_scratch); +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f, + const size_t fstride, int in_stride, + const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos &st, int Fout_count, int f_count, int factors_count) +{ + const int beg = Fout_count; + const int p = d_factors[factors_count++]; /* the radix */ + const int m = d_factors[factors_count++]; /* stage's fft length/p */ + const int end = Fout_count + p*m; + + if (m == 1) { + do { + //C_EQ(d_Fout[Fout_count],d_f[f_count]); + d_Fout(Fout_count,0) = d_f(f_count,0); + d_Fout(Fout_count,1) = d_f(f_count,1); + f_count += fstride*in_stride; + } while (++Fout_count != end); + } else { + do { + /* recursive call: + DFT of size m*p performed by doing + p instances of smaller DFTs of size m, + each one takes a decimated version of the input */ + kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count); + f_count += fstride*in_stride; + } while( (Fout_count += m) != end); + } + + Fout_count=beg; + + /* recombine the p smaller DFTs */ + switch (p) { + case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break; + case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break; + case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break; + case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break; + default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break; + } +} + +/* facbuf is populated by p1,m1,p2,m2, ... + where + p[i] * m[i] = m[i-1] + m0 = n */ + +template +int KissFFTKokkos::kf_factor(int n, HAT::t_int_64 h_facbuf) +{ + int p=4, nf=0; + double floor_sqrt; + floor_sqrt = floor( sqrt((double)n) ); + int facbuf_count = 0; + int p_max = 0; + + /* factor out the remaining powers of 4, powers of 2, + and then any other remaining primes */ + do { + if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */ + while (n % p) { + switch (p) { + case 4: p = 2; break; + case 2: p = 3; break; + default: p += 2; break; + } + if (p > floor_sqrt) + p = n; /* no more factors, skip to end */ + } + n /= p; + h_facbuf[facbuf_count++] = p; + h_facbuf[facbuf_count++] = n; + p_max = MAX(p,p_max); + ++nf; + } while (n > 1); + return p_max; +} + +/* + * User-callable function to allocate all necessary storage space for the fft. + * + * The return value is a contiguous block of memory, allocated with malloc. As such, + * It can be freed with free(), rather than a kiss_fft-specific function. + */ + +template +kiss_fft_state_kokkos KissFFTKokkos::kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem) +{ + kiss_fft_state_kokkos st; + int i; + st.nfft = nfft; + st.inverse = inverse_fft; + + typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64(); + typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d(); + + if (nfft > 0) { + k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2); + k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft); + + for (i=0;i(); + k_factors.template sync(); + st.d_factors = k_factors.template view(); + + k_twiddles.template modify(); + k_twiddles.template sync(); + st.d_twiddles = k_twiddles.template view(); + + return st; +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) +{ + //if (d_fin == d_fout) { + // // NOTE: this is not really an in-place FFT algorithm. + // // It just performs an out-of-place FFT into a temp buffer + // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",st.nfft); + // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + // memcpy(d_fout,d_tmpbuf,sizeof(FFT_DATA)*st.nfft); + //} else { + kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + //} +} + +template +KOKKOS_INLINE_FUNCTION +void KissFFTKokkos::kiss_fft_kokkos(const kiss_fft_state_kokkos &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset) +{ + kiss_fft_stride(cfg,d_fin,d_fout,1,offset); +} + + +namespace LAMMPS_NS { +template class KissFFTKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class KissFFTKokkos; +#endif +} diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h new file mode 100644 index 0000000000..a85f0fc9d5 --- /dev/null +++ b/src/KOKKOS/kissfft_kokkos.h @@ -0,0 +1,572 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* + we use a stripped down KISS FFT as default FFT for LAMMPS + this code is adapted from kiss_fft_v1_2_9 + homepage: http://kissfft.sf.net/ + + changes 2008-2011 by Axel Kohlmeyer +*/ + +#ifndef LMP_KISSFFT_KOKKOS_H +#define LMP_KISSFFT_KOKKOS_H + +#include "fftdata_kokkos.h" +#include "kokkos_type.h" + +/* + Copyright (c) 2003-2010, Mark Borgerding + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the author nor the names of any contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include "kissfft_kokkos.h" +#include "fftdata_kokkos.h" + +#ifndef M_PI +#define M_PI 3.141592653589793238462643383279502884197169399375105820974944 +#endif + +/* + Copyright (c) 2003-2010, Mark Borgerding + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the author nor the names of any contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + Explanation of macros dealing with complex math: + + C_MUL(m,a,b) : m = a*b + C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise + C_SUB( res, a,b) : res = a - b + C_SUBFROM( res , a) : res -= a + C_ADDTO( res , a) : res += a + C_EQ( res , a) : res = a + * */ + +#define S_MUL(a,b) ( (a)*(b) ) + +#define C_MUL(m,a,a_index,b,b_index) \ + do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ + (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) + +#define C_FIXDIV(c,div) /* NOOP */ + + + +#define C_MULBYSCALAR( c, s ) \ + do{ (c)[0] *= (s);\ + (c)[1] *= (s); }while(0) + +#define C_ADD( res, a,b)\ + do { \ + (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ + }while(0) + +#define C_SUB( res, a,b)\ + do { \ + (res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \ + }while(0) + +#define C_ADDTO( res , a)\ + do { \ + (res)[0] += (a)[0]; (res)[1] += (a)[1];\ + }while(0) + +#define C_SUBFROM( res , a)\ + do {\ + (res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \ + }while(0) + +#define C_EQ(res, a)\ + do {\ + (res)[0] = (a)[0]; (res)[1] = (a)[1]; \ + }while(0) + + +#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase) +#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase) +#define HALF_OF(x) ((x)*.5) + +#define kf_cexp(x,x_index,phase) \ + do{ \ + (x)(x_index,0) = KISS_FFT_COS(phase);\ + (x)(x_index,1) = KISS_FFT_SIN(phase);\ + }while(0) + +namespace LAMMPS_NS { + +template +class KissFFTKokkos { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + KOKKOS_INLINE_FUNCTION + static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) + { + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Fout2_count; + int tw1_count = 0; + + Fout2_count = Fout_count + m; + do { + //C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2); + + C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count); + tw1_count += fstride; + //C_SUB(*Fout2,*Fout,t); + d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0]; + d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1]; + //C_ADDTO(d_Fout[Fout_count],t); + d_Fout(Fout_count,0) += t[0]; + d_Fout(Fout_count,1) += t[1]; + ++Fout2_count; + ++Fout_count; + } while(--m); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, const size_t m, int Fout_count) + { + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[6][2]; + size_t k=m; + const size_t m2=2*m; + const size_t m3=3*m; + + int tw3_count,tw2_count,tw1_count; + tw3_count = tw2_count = tw1_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4); + + C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count); + + //C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] ); + scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0]; + scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1]; + //C_ADDTO(d_Fout[Fout_count], scratch[1]); + d_Fout(Fout_count,0) += scratch[1][0]; + d_Fout(Fout_count,1) += scratch[1][1]; + //C_ADD(scratch[3],scratch[0],scratch[2]); + scratch[3][0] = scratch[0][0] + scratch[2][0]; + scratch[3][1] = scratch[0][1] + scratch[2][1]; + //C_SUB( scratch[4] , scratch[0] , scratch[2] ); + scratch[4][0] = scratch[0][0] - scratch[2][0]; + scratch[4][1] = scratch[0][1] - scratch[2][1]; + //C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1]; + + tw1_count += fstride; + tw2_count += fstride*2; + tw3_count += fstride*3; + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + if (st.inverse) { + d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0]; + } else{ + d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0]; + } + ++Fout_count; + } while(--k); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, size_t m, int Fout_count) + { + size_t k=m; + const size_t m2 = 2*m; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[5][2]; + FFT_SCALAR epi3[2]; + //C_EQ(epi3,d_twiddles[fstride*m]); + epi3[0] = d_twiddles(fstride*m,0); + epi3[1] = d_twiddles(fstride*m,1); + + int tw1_count,tw2_count; + tw1_count = tw2_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3); + + C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + + //C_ADD(scratch[3],scratch[1],scratch[2]); + scratch[3][0] = scratch[1][0] + scratch[2][0]; + scratch[3][1] = scratch[1][1] + scratch[2][1]; + //C_SUB(scratch[0],scratch[1],scratch[2]); + scratch[0][0] = scratch[1][0] - scratch[2][0]; + scratch[0][1] = scratch[1][1] - scratch[2][1]; + tw1_count += fstride; + tw2_count += fstride*2; + + d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]); + d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]); + + //C_MULBYSCALAR(scratch[0],epi3[1]); + scratch[0][0] *= epi3[1]; + scratch[0][1] *= epi3[1]; + + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0]; + + d_Fout(Fout_count + m,0) -= scratch[0][1]; + d_Fout(Fout_count + m,1) += scratch[0][0]; + + ++Fout_count; + } while(--k); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) + { + int u; + FFT_SCALAR scratch[13][2]; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR ya[2],yb[2]; + //C_EQ(ya,d_twiddles[fstride*m]); + ya[1] = d_twiddles(fstride*m,1); + ya[0] = d_twiddles(fstride*m,0); + //C_EQ(yb,d_twiddles[fstride*2*m]); + yb[1] = d_twiddles(fstride*2*m,1); + yb[0] = d_twiddles(fstride*2*m,0); + + int Fout0_count=Fout_count; + int Fout1_count=Fout0_count+m; + int Fout2_count=Fout0_count+2*m; + int Fout3_count=Fout0_count+3*m; + int Fout4_count=Fout0_count+4*m; + + for ( u=0; u &st, int m, int p, int Fout_count) + { + int u,k,q1,q; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Norig = st.nfft; + + typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch; + for ( u=0; u=Norig) twidx-=Norig; + C_MUL(t,d_scratch,q,d_twiddles,twidx); + //C_ADDTO(d_Fout[k],t); + d_Fout(Fout_count + k,0) += t[0]; + d_Fout(Fout_count + k,1) += t[1]; + } + k += m; + } + } + //KISS_FFT_TMP_FREE(d_scratch); + } + + KOKKOS_INLINE_FUNCTION + static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f, + const size_t fstride, int in_stride, + const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos &st, int Fout_count, int f_count, int factors_count) + { + const int beg = Fout_count; + const int p = d_factors[factors_count++]; /* the radix */ + const int m = d_factors[factors_count++]; /* stage's fft length/p */ + const int end = Fout_count + p*m; + + if (m == 1) { + do { + //C_EQ(d_Fout[Fout_count],d_f[f_count]); + d_Fout(Fout_count,0) = d_f(f_count,0); + d_Fout(Fout_count,1) = d_f(f_count,1); + f_count += fstride*in_stride; + } while (++Fout_count != end); + } else { + do { + /* recursive call: + DFT of size m*p performed by doing + p instances of smaller DFTs of size m, + each one takes a decimated version of the input */ + kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count); + f_count += fstride*in_stride; + } while( (Fout_count += m) != end); + } + + Fout_count=beg; + + /* recombine the p smaller DFTs */ + switch (p) { + case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break; + case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break; + case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break; + case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break; + default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break; + } + } + + /* facbuf is populated by p1,m1,p2,m2, ... + where + p[i] * m[i] = m[i-1] + m0 = n */ + + static int kf_factor(int n, HAT::t_int_64 h_facbuf) + { + int p=4, nf=0; + double floor_sqrt; + floor_sqrt = floor( sqrt((double)n) ); + int facbuf_count = 0; + int p_max = 0; + + /* factor out the remaining powers of 4, powers of 2, + and then any other remaining primes */ + do { + if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */ + while (n % p) { + switch (p) { + case 4: p = 2; break; + case 2: p = 3; break; + default: p += 2; break; + } + if (p > floor_sqrt) + p = n; /* no more factors, skip to end */ + } + n /= p; + h_facbuf[facbuf_count++] = p; + h_facbuf[facbuf_count++] = n; + p_max = MAX(p,p_max); + ++nf; + } while (n > 1); + return p_max; + } + + /* + * User-callable function to allocate all necessary storage space for the fft. + * + * The return value is a contiguous block of memory, allocated with malloc. As such, + * It can be freed with free(), rather than a kiss_fft-specific function. + */ + + static kiss_fft_state_kokkos kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem) + { + kiss_fft_state_kokkos st; + int i; + st.nfft = nfft; + st.inverse = inverse_fft; + + typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64(); + typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d(); + + if (nfft > 0) { + k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2); + k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft); + + for (i=0;i(); + k_factors.template sync(); + st.d_factors = k_factors.template view(); + + k_twiddles.template modify(); + k_twiddles.template sync(); + st.d_twiddles = k_twiddles.template view(); + + return st; + } + + KOKKOS_INLINE_FUNCTION + static void kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) + { + //if (d_fin == d_fout) { + // // NOTE: this is not really an in-place FFT algorithm. + // // It just performs an out-of-place FFT into a temp buffer + // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",st.nfft); + // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + // memcpy(d_fout,d_tmpbuf,sizeof(FFT_DATA)*st.nfft); + //} else { + kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + //} + } + + + KOKKOS_INLINE_FUNCTION + static void kiss_fft_kokkos(const kiss_fft_state_kokkos &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset) + { + kiss_fft_stride(cfg,d_fin,d_fout,1,offset); + } + +}; + +} + +#endif \ No newline at end of file diff --git a/src/KOKKOS/pack_kokkos.h b/src/KOKKOS/pack_kokkos.h new file mode 100644 index 0000000000..38c610fcd3 --- /dev/null +++ b/src/KOKKOS/pack_kokkos.h @@ -0,0 +1,424 @@ +/* ---------------------------------------------------------------------- + SPARTA - Stochastic PArallel Rarefied-gas Time-accurate Analyzer + http://sparta.sandia.gov + Steve Plimpton, sjplimp@sandia.gov, Michael Gallis, magalli@sandia.gov + Sandia National Laboratories + + Copyright (2014) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level SPARTA directory. +------------------------------------------------------------------------- */ + +// loop counters for doing a pack/unpack + +#include "pack.h" + +/* ---------------------------------------------------------------------- + Pack and unpack functions: + + pack routines copy strided values from data into contiguous locs in buf + unpack routines copy contiguous values from buf into strided locs in data + different versions of unpack depending on permutation + and # of values/element +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + pack from data -> buf +------------------------------------------------------------------------- */ + +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template +class PackKokkos { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + +struct pack_3d_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int in = slow*nmid*nfast; + const int plane = slow*nstride_plane; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int out = plane + mid*nstride_line; + for (int fast = 0; fast < nfast; fast++) + d_buf[buf_offset + in++] = d_data[data_offset + out++]; + } + } +}; + +static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data +------------------------------------------------------------------------- */ + +struct unpack_3d_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast; + const int plane = slow*nstride_plane; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int in = plane + mid*nstride_line; + for (int fast = 0; fast < nfast; fast++) + d_data[data_offset + in++] = d_buf[buf_offset + out++]; + } + } +}; + +static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, 1 value/element +------------------------------------------------------------------------- */ + + +struct unpack_3d_permute1_1_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast; + const int plane = slow*nstride_line; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int in = plane + mid; + for (int fast = 0; fast < nfast; fast++, in += nstride_plane) + d_data[data_offset + in] = d_buf[buf_offset + out++]; + } + } +}; + +static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, 2 values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute1_2_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast*2; + const int plane = slow*nstride_line; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int in = plane + 2*mid; + for (int fast = 0; fast < nfast; fast++, in += nstride_plane) { + d_data[data_offset + in] = d_buf[buf_offset + out++]; + d_data[data_offset + in+1] = d_buf[buf_offset + out++]; + } + } + } +}; + +static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, nqty values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute1_n_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + int nqty; // # of values/element + + unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + nqty = plan->nqty ; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast*nqty; + const int plane = slow*nstride_line; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int instart = plane + nqty*mid; + for (int fast = 0; fast < nfast; fast++, instart += nstride_plane) { + int in = instart; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; + } + } + } +}; + +static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, 1 value/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_1_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int in = slow + mid*nstride_plane; + for (int fast = 0; fast < nfast; fast++, in += nstride_line) + d_data[data_offset + in] = d_buf[buf_offset + out++]; + } + } +}; + +static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, 2 values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_2_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast*2; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int in = 2*slow + mid*nstride_plane; + for (int fast = 0; fast < nfast; fast++, in += nstride_line) { + d_data[data_offset + in] = d_buf[buf_offset + out++]; + d_data[data_offset + in+1] = d_buf[buf_offset + out++]; + } + } + } +}; + +static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, nqty values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_n_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + int nqty; // # of values/element + + unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + nqty = plan->nqty ; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &slow) const { + int out = slow*nmid*nfast*nqty; + for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop + int instart = nqty*slow + mid*nstride_plane; + for (int fast = 0; fast < nfast; fast++, instart += nstride_line) { + int in = instart; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; + } + } + } +}; + +static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow,f); + DeviceType::fence(); +} + +}; + +} diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 7c01adc510..f25ea2313f 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -30,8 +30,8 @@ #include "bond.h" #include "angle.h" #include "domain.h" -#include "fft3d_wrap.h" -#include "remap_wrap.h" +#include "fft3d_kokkos.h" +#include "remap_kokkos.h" #include "memory_kokkos.h" #include "error.h" #include "atom_masks.h" @@ -849,19 +849,19 @@ void PPPMKokkos::allocate() int tmp; - fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); - fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); - remap = new Remap(lmp,world, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + remap = new RemapKokkos(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1790,12 +1790,7 @@ void PPPMKokkos::brick2fft() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_inout),*this); copymode = 0; - k_density_fft.template modify(); - k_density_fft.template sync(); - remap->perform(density_fft,density_fft,work1); - k_density_fft.template modify(); - k_density_fft.template sync(); - + remap->perform(d_density_fft,d_density_fft,d_work1); } template @@ -1837,11 +1832,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work1.template modify(); - k_work1.template sync(); - fft1->compute(work1,work1,1); - k_work1.template modify(); - k_work1.template sync(); + fft1->compute(d_work1,d_work1,1); // global energy and virial contribution @@ -1905,11 +1896,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -1923,11 +1910,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -1940,11 +1923,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2200,11 +2179,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2221,11 +2196,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2237,11 +2208,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2253,11 +2220,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2268,11 +2231,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2284,11 +2243,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2300,11 +2255,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -3044,10 +2995,10 @@ int PPPMKokkos::timing_1d(int n, double &time1d) time1 = MPI_Wtime(); for (int i = 0; i < n; i++) { - fft1->timing1d(work1,nfft_both,1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); + fft1->timing1d(d_work1,nfft_both,1); + fft2->timing1d(d_work1,nfft_both,-1); + fft2->timing1d(d_work1,nfft_both,-1); + fft2->timing1d(d_work1,nfft_both,-1); } MPI_Barrier(world); @@ -3081,10 +3032,10 @@ int PPPMKokkos::timing_3d(int n, double &time3d) time1 = MPI_Wtime(); for (int i = 0; i < n; i++) { - fft1->compute(work1,work1,1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); + fft1->compute(d_work1,d_work1,1); + fft2->compute(d_work1,d_work1,-1); + fft2->compute(d_work1,d_work1,-1); + fft2->compute(d_work1,d_work1,-1); } MPI_Barrier(world); diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 21896eb714..5dd99af5fb 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -24,6 +24,8 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) #include "pppm.h" #include "gridcomm_kokkos.h" +#include "remap_kokkos.h" +#include "fft3d_kokkos.h" #include "kokkos_base.h" #include "kokkos_type.h" @@ -331,8 +333,8 @@ class PPPMKokkos : public PPPM, public KokkosBase { //double **acons; typename Kokkos::DualView::t_host acons; - class FFT3d *fft1,*fft2; - class Remap *remap; + FFT3dKokkos *fft1,*fft2; + RemapKokkos *remap; GridCommKokkos *cg; GridCommKokkos *cg_peratom; diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp new file mode 100644 index 0000000000..048d04ed28 --- /dev/null +++ b/src/KOKKOS/remap_kokkos.cpp @@ -0,0 +1,733 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include "remap_kokkos.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +template +RemapKokkos::RemapKokkos(LAMMPS *lmp) : Pointers(lmp) +{ + plan = NULL; +} + +template +RemapKokkos::RemapKokkos(LAMMPS *lmp, MPI_Comm comm, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int nqty, int permute, int memory, + int precision, int usecollective) : Pointers(lmp) +{ + plan = remap_3d_create_plan_kokkos(comm, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + nqty,permute,memory,precision,usecollective); + if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan"); +} + +/* ---------------------------------------------------------------------- */ + +template +RemapKokkos::~RemapKokkos() +{ + remap_3d_destroy_plan_kokkos(plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void RemapKokkos::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf) +{ + remap_3d_kokkos(d_in,d_out,d_buf,plan); +} + +#include +#include + +#include "pack_kokkos.h" + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/* ---------------------------------------------------------------------- + Data layout for 3d remaps: + + data set of Nfast x Nmid x Nslow elements is owned by P procs + each element = nqty contiguous datums + on input, each proc owns a subsection of the elements + on output, each proc will own a (presumably different) subsection + my subsection must not overlap with any other proc's subsection, + i.e. the union of all proc's input (or output) subsections must + exactly tile the global Nfast x Nmid x Nslow data set + when called from C, all subsection indices are + C-style from 0 to N-1 where N = Nfast or Nmid or Nslow + when called from F77, all subsection indices are + F77-style from 1 to N where N = Nfast or Nmid or Nslow + a proc can own 0 elements on input or output + by specifying hi index < lo index + on both input and output, data is stored contiguously on a processor + with a fast-varying, mid-varying, and slow-varying index +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Perform 3d remap + + Arguments: + in starting address of input data on this proc + out starting address of where output data for this proc + will be placed (can be same as in) + buf extra memory required for remap + if memory=0 was used in call to remap_3d_create_plan + then buf must be big enough to hold output result + i.e. nqty * (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1) + if memory=1 was used in call to remap_3d_create_plan + then buf is not used, can just be a dummy pointer + plan plan returned by previous call to remap_3d_create_plan +------------------------------------------------------------------------- */ + +template +void RemapKokkos::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf, + struct remap_plan_3d_kokkos *plan) +{ + // use point-to-point communication + + //if (!plan->usecollective) { + int i,isend,irecv; + typename AT::t_FFT_SCALAR_1d d_scratch; + + if (plan->memory == 0) + d_scratch = d_buf; + else + d_scratch = plan->d_scratch; + + // post all recvs into scratch space + + for (irecv = 0; irecv < plan->nrecv; irecv++) { + double* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; + MPI_Irecv(scratch,plan->recv_size[irecv], + MPI_FFT_SCALAR,plan->recv_proc[irecv],0, + plan->comm,&plan->request[irecv]); + } + + // send all messages to other procs + + for (isend = 0; isend < plan->nsend; isend++) { + int in_offset = plan->send_offset[isend]; + plan->pack(d_in,in_offset, + plan->d_sendbuf,0,&plan->packplan[isend]); + MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } + + // copy in -> scratch -> out for self data + + if (plan->self) { + isend = plan->nsend; + irecv = plan->nrecv; + + int in_offset = plan->send_offset[isend]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->pack(d_in,in_offset, + d_scratch,scratch_offset, + &plan->packplan[isend]); + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } + + // unpack all messages from scratch -> out + + for (i = 0; i < plan->nrecv; i++) { + MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); + + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } + + // use All2Allv collective for remap communication + + //} else { + // if (plan->commringlen > 0) { + // int isend,irecv; + // + // // create send and recv buffers for alltoallv collective + // + // int sendBufferSize = 0; + // int recvBufferSize = 0; + // for (int i=0;insend;i++) + // sendBufferSize += plan->send_size[i]; + // for (int i=0;inrecv;i++) + // recvBufferSize += plan->recv_size[i]; + // + // typename AT::t_FFT_SCALAR_1d packedSendBuffer + // = (typename AT::t_FFT_SCALAR_1d ) malloc(sizeof(FFT_SCALAR) * sendBufferSize); + // typename AT::t_FFT_SCALAR_1d packedRecvBuffer + // = (typename AT::t_FFT_SCALAR_1d ) malloc(sizeof(FFT_SCALAR) * recvBufferSize); + // + // int *sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); + // int *rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); + // int *sdispls = (int *) malloc(sizeof(int) * plan->commringlen); + // int *rdispls = (int *) malloc(sizeof(int) * plan->commringlen); + // int *nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); + // + // // create and populate send data, count and displacement buffers + // + // int currentSendBufferOffset = 0; + // for (isend = 0; isend < plan->commringlen; isend++) { + // sendcnts[isend] = 0; + // sdispls[isend] = 0; + // int foundentry = 0; + // for (int i=0;(insend && !foundentry); i++) { + // if (plan->send_proc[i] == plan->commringlist[isend]) { + // foundentry = 1; + // sendcnts[isend] = plan->send_size[i]; + // sdispls[isend] = currentSendBufferOffset; + // plan->pack(&d_in[plan->send_offset[i]], + // &packedSendBuffer[currentSendBufferOffset], + // &plan->packplan[i]); + // currentSendBufferOffset += plan->send_size[i]; + // } + // } + // } + // + // // create and populate recv count and displacement buffers + // + // int currentRecvBufferOffset = 0; + // for (irecv = 0; irecv < plan->commringlen; irecv++) { + // rcvcnts[irecv] = 0; + // rdispls[irecv] = 0; + // nrecvmap[irecv] = -1; + // int foundentry = 0; + // for (int i=0;(inrecv && !foundentry); i++) { + // if (plan->recv_proc[i] == plan->commringlist[irecv]) { + // foundentry = 1; + // rcvcnts[irecv] = plan->recv_size[i]; + // rdispls[irecv] = currentRecvBufferOffset; + // currentRecvBufferOffset += plan->recv_size[i]; + // nrecvmap[irecv] = i; + // } + // } + // } + // + // MPI_Alltoallv(packedSendBuffer, sendcnts, sdispls, + // MPI_FFT_SCALAR, packedRecvBuffer, rcvcnts, + // rdispls, MPI_FFT_SCALAR, plan->comm); + // + // // unpack the data from the recv buffer into out + // + // currentRecvBufferOffset = 0; + // for (irecv = 0; irecv < plan->commringlen; irecv++) { + // if (nrecvmap[irecv] > -1) { + // plan->unpack(&packedRecvBuffer[currentRecvBufferOffset], + // &d_out[plan->recv_offset[nrecvmap[irecv]]], + // &plan->unpackplan[nrecvmap[irecv]]); + // currentRecvBufferOffset += plan->recv_size[nrecvmap[irecv]]; + // } + // } + // + // // free temporary data structures + // + // free(sendcnts); + // free(rcvcnts); + // free(sdispls); + // free(rdispls); + // free(nrecvmap); + // free(packedSendBuffer); + // free(packedRecvBuffer); + // } + //} +} + +/* ---------------------------------------------------------------------- + Create plan for performing a 3d remap + + Arguments: + comm MPI communicator for the P procs which own the data + in_ilo,in_ihi input bounds of data I own in fast index + in_jlo,in_jhi input bounds of data I own in mid index + in_klo,in_khi input bounds of data I own in slow index + out_ilo,out_ihi output bounds of data I own in fast index + out_jlo,out_jhi output bounds of data I own in mid index + out_klo,out_khi output bounds of data I own in slow index + nqty # of datums per element + permute permutation in storage order of indices on output + 0 = no permutation + 1 = permute once = mid->fast, slow->mid, fast->slow + 2 = permute twice = slow->fast, fast->mid, mid->slow + memory user provides buffer memory for remap or system does + 0 = user provides memory + 1 = system provides memory + precision precision of data + 1 = single precision (4 bytes per datum) + 2 = double precision (8 bytes per datum) + usecollective whether to use collective MPI or point-to-point +------------------------------------------------------------------------- */ + +template +struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_create_plan_kokkos( + MPI_Comm comm, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int nqty, int permute, int memory, int precision, int usecollective) +{ + + struct remap_plan_3d_kokkos *plan; + struct extent_3d *inarray, *outarray; + struct extent_3d in,out,overlap; + int i,iproc,nsend,nrecv,ibuf,size,me,nprocs; + + // query MPI info + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&nprocs); + + // allocate memory for plan data struct + + plan = new struct remap_plan_3d_kokkos; + if (plan == NULL) return NULL; + plan->usecollective = usecollective; + + // store parameters in local data structs + + in.ilo = in_ilo; + in.ihi = in_ihi; + in.isize = in.ihi - in.ilo + 1; + + in.jlo = in_jlo; + in.jhi = in_jhi; + in.jsize = in.jhi - in.jlo + 1; + + in.klo = in_klo; + in.khi = in_khi; + in.ksize = in.khi - in.klo + 1; + + out.ilo = out_ilo; + out.ihi = out_ihi; + out.isize = out.ihi - out.ilo + 1; + + out.jlo = out_jlo; + out.jhi = out_jhi; + out.jsize = out.jhi - out.jlo + 1; + + out.klo = out_klo; + out.khi = out_khi; + out.ksize = out.khi - out.klo + 1; + + // combine output extents across all procs + + inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (inarray == NULL) return NULL; + + outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (outarray == NULL) return NULL; + + MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, + outarray,sizeof(struct extent_3d),MPI_BYTE,comm); + + // count send collides, including self + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + nsend += remap_3d_collide(&in,&outarray[iproc],&overlap); + } + + // malloc space for send info + + if (nsend) { + plan->pack = PackKokkos::pack_3d; + + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(nsend*sizeof(int)); + plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); + + if (plan->send_offset == NULL || plan->send_size == NULL || + plan->send_proc == NULL || plan->packplan == NULL) return NULL; + } + + // store send info, with self as last entry + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + plan->send_proc[nsend] = iproc; + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + nsend++; + } + } + + // plan->nsend = # of sends not including self + + if (nsend && plan->send_proc[nsend-1] == me) { + if (plan->usecollective) // for collectives include self in nsend list + plan->nsend = nsend; + else + plan->nsend = nsend - 1; + } else + plan->nsend = nsend; + + // combine input extents across all procs + + MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, + inarray,sizeof(struct extent_3d),MPI_BYTE,comm); + + // count recv collides, including self + + nrecv = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap); + } + + // malloc space for recv info + + if (nrecv) { + if (permute == 0) + plan->unpack = PackKokkos::unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute1_2; + else + plan->unpack = PackKokkos::unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute2_2; + else + plan->unpack = PackKokkos::unpack_3d_permute2_n; + } + + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == NULL || plan->recv_size == NULL || + plan->recv_proc == NULL || plan->recv_bufloc == NULL || + plan->request == NULL || plan->unpackplan == NULL) return NULL; + } + + // store recv info, with self as last entry + + ibuf = 0; + nrecv = 0; + iproc = me; + + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + plan->recv_proc[nrecv] = iproc; + plan->recv_bufloc[nrecv] = ibuf; + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } + + plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + ibuf += plan->recv_size[nrecv]; + nrecv++; + } + } + + // create sub-comm rank list + + //if (plan->usecollective) { + // plan->commringlist = NULL; + // + // // merge recv and send rank lists + // // ask Steve Plimpton about method to more accurately determine + // // maximum number of procs contributing to pencil + // + // int maxcommsize = nprocs; + // int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); + // int commringlen = 0; + // + // for (int i = 0; i < nrecv; i++) { + // commringlist[i] = plan->recv_proc[i]; + // commringlen++; + // } + // + // for (int i = 0; i < nsend; i++) { + // int foundentry = 0; + // for (int j=0;jsend_proc[i]) foundentry = 1; + // if (!foundentry) { + // commringlist[commringlen] = plan->send_proc[i]; + // commringlen++; + // } + // } + // + // // sort initial commringlist + // + // int swap = 0; + // for (int c = 0 ; c < (commringlen - 1); c++) { + // for (int d = 0 ; d < commringlen - c - 1; d++) { + // if (commringlist[d] > commringlist[d+1]) { + // swap = commringlist[d]; + // commringlist[d] = commringlist[d+1]; + // commringlist[d+1] = swap; + // } + // } + // } + // + // // collide all inarray extents for the comm ring with all output + // // extents and all outarray extents for the comm ring with all input + // // extents - if there is a collison add the rank to the comm ring, + // // keep iterating until nothing is added to commring + // + // int commringappend = 1; + // while (commringappend) { + // int newcommringlen = commringlen; + // commringappend = 0; + // for (int i=0;i commringlist[d+1]) { + // swap = commringlist[d]; + // commringlist[d] = commringlist[d+1]; + // commringlist[d+1] = swap; + // } + // } + // } + // + // // resize commringlist to final size + // + // commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); + // + // // set the plan->commringlist + // + // plan->commringlen = commringlen; + // plan->commringlist = commringlist; + //} + + // plan->nrecv = # of recvs not including self + // for collectives include self in the nsend list + + if (nrecv && plan->recv_proc[nrecv-1] == me) { + if (plan->usecollective) plan->nrecv = nrecv; + else plan->nrecv = nrecv - 1; + } else plan->nrecv = nrecv; + + // init remaining fields in remap plan + + plan->memory = memory; + + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; + + // free locally malloced space + + free(inarray); + free(outarray); + + // find biggest send message (not including self) and malloc space for it + + size = 0; + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); + + if (size) { + plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); + if (!plan->d_sendbuf.data()) return NULL; + } + + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) + + if (memory == 1) { + if (nrecv > 0) { + plan->d_scratch = + typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize); + if (!plan->d_scratch.data()) return NULL; + } + } + + // if using collective and the commringlist is NOT empty create a + // communicator for the plan based off an MPI_Group created with + // ranks from the commringlist + + //if ((plan->usecollective && (plan->commringlen > 0))) { + // MPI_Group orig_group, new_group; + // MPI_Comm_group(comm, &orig_group); + // MPI_Group_incl(orig_group, plan->commringlen, + // plan->commringlist, &new_group); + // MPI_Comm_create(comm, new_group, &plan->comm); + //} + + // if using collective and the comm ring list is empty create + // a communicator for the plan with an empty group + + //else if ((plan->usecollective) && (plan->commringlen == 0)) { + // MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); + //} + + // not using collective - dup comm + + /*else*/ MPI_Comm_dup(comm,&plan->comm); + + // return pointer to plan + + return plan; +} + +/* ---------------------------------------------------------------------- + Destroy a 3d remap plan +------------------------------------------------------------------------- */ + +template +void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *plan) +{ + if (plan == NULL) return; + + // free MPI communicator + + if (!((plan->usecollective) && (plan->commringlen == 0))) + MPI_Comm_free(&plan->comm); + + //if (plan->usecollective) { + // if (plan->commringlist != NULL) + // free(plan->commringlist); + //} + + // free internal arrays + + if (plan->nsend || plan->self) { + free(plan->send_offset); + free(plan->send_size); + free(plan->send_proc); + free(plan->packplan); + //if (plan->d_sendbuf) free(plan->d_sendbuf); + } + + if (plan->nrecv || plan->self) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->recv_proc); + free(plan->recv_bufloc); + free(plan->request); + free(plan->unpackplan); + //if (plan->scratch) free(plan->scratch); + } + + // free plan itself + + delete plan; +} + +namespace LAMMPS_NS { +template class RemapKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class RemapKokkos; +#endif +} diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h new file mode 100644 index 0000000000..fe8df20f16 --- /dev/null +++ b/src/KOKKOS/remap_kokkos.h @@ -0,0 +1,87 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_REMAP_KOKKOS_H +#define LMP_REMAP_KOKKOS_H + +#include "pointers.h" +#include +#include "fftdata_kokkos.h" +#include "remap.h" + +namespace LAMMPS_NS { + +// details of how to do a 3d remap + +template +struct remap_plan_3d_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends + typename AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs + void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); + // which pack function to use + void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); + // which unpack function to use + int *send_offset; // extraction loc for each send + int *send_size; // size of each send message + int *send_proc; // proc to send each message to + struct pack_plan_3d *packplan; // pack plan for each send message + int *recv_offset; // insertion loc for each recv + int *recv_size; // size of each recv message + int *recv_proc; // proc to recv each message from + int *recv_bufloc; // offset in scratch buf for each recv + MPI_Request *request; // MPI request for each posted recv + struct pack_plan_3d *unpackplan; // unpack plan for each recv message + int nrecv; // # of recvs from other procs + int nsend; // # of sends to other procs + int self; // whether I send/recv with myself + int memory; // user provides scratch space or not + MPI_Comm comm; // group of procs performing remap + int usecollective; // use collective or point-to-point MPI + int commringlen; // length of commringlist + int *commringlist; // ranks on communication ring of this plan +}; + +template +class RemapKokkos : protected Pointers { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + RemapKokkos(class LAMMPS *); + RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, + int,int,int,int,int,int,int,int,int,int,int); + ~RemapKokkos(); + void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d); + + struct remap_plan_3d_kokkos *plan; + + void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos *); + struct remap_plan_3d_kokkos *remap_3d_create_plan_kokkos(MPI_Comm, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int); + void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Could not create 3d remap plan + +The FFT setup in pppm failed. + +*/ From ab6be65ab3a02a75fa9068c834f7620edbd4d03f Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 14 Jun 2019 09:20:15 -0600 Subject: [PATCH 02/48] Flatten loops in pack_kokkos --- src/KOKKOS/pack_kokkos.h | 194 ++++++++++++++++++++++++--------------- 1 file changed, 122 insertions(+), 72 deletions(-) diff --git a/src/KOKKOS/pack_kokkos.h b/src/KOKKOS/pack_kokkos.h index 38c610fcd3..5283e0d3fc 100644 --- a/src/KOKKOS/pack_kokkos.h +++ b/src/KOKKOS/pack_kokkos.h @@ -47,6 +47,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -58,27 +59,33 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int in = slow*nmid*nfast; + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int in_orig = slow*nmid*nfast; const int plane = slow*nstride_plane; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int out = plane + mid*nstride_line; - for (int fast = 0; fast < nfast; fast++) - d_buf[buf_offset + in++] = d_data[data_offset + out++]; - } + const int out_orig = plane + mid*nstride_line; + const int in = in_orig + mid*nfast + fast; + const int out = out_orig + fast; + d_buf[buf_offset + in] = d_data[data_offset + out]; } }; static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } @@ -94,6 +101,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -105,27 +113,33 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast; + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; const int plane = slow*nstride_plane; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int in = plane + mid*nstride_line; - for (int fast = 0; fast < nfast; fast++) - d_data[data_offset + in++] = d_buf[buf_offset + out++]; - } + const int in_orig = plane + mid*nstride_line; + const int in = in_orig + fast; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; } }; static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } @@ -142,6 +156,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -153,27 +168,33 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast; + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; const int plane = slow*nstride_line; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int in = plane + mid; - for (int fast = 0; fast < nfast; fast++, in += nstride_plane) - d_data[data_offset + in] = d_buf[buf_offset + out++]; - } + const int in_orig = plane + mid; + const int in = in_orig + fast*nstride_plane; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; } }; static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } /* ---------------------------------------------------------------------- @@ -188,6 +209,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -199,29 +221,34 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast*2; + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*2; const int plane = slow*nstride_line; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int in = plane + 2*mid; - for (int fast = 0; fast < nfast; fast++, in += nstride_plane) { - d_data[data_offset + in] = d_buf[buf_offset + out++]; - d_data[data_offset + in+1] = d_buf[buf_offset + out++]; - } - } + const int in_orig = plane + 2*mid; + const int in = in_orig + fast*nstride_plane; + const int out = out_orig + 2*mid*nfast + 2*fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + d_data[data_offset + in+1] = d_buf[buf_offset + out+1]; } }; static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } @@ -237,6 +264,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices int nqty; // # of values/element @@ -249,30 +277,34 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; nqty = plan->nqty ; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast*nqty; + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*nqty; const int plane = slow*nstride_line; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int instart = plane + nqty*mid; - for (int fast = 0; fast < nfast; fast++, instart += nstride_plane) { - int in = instart; - for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; - } - } + const int instart = plane + nqty*mid; + int in = instart + nqty*fast*nstride_plane; + int out = out_orig + nqty*mid*nfast + nqty*fast; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; } }; static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } @@ -288,6 +320,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -299,26 +332,32 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int in = slow + mid*nstride_plane; - for (int fast = 0; fast < nfast; fast++, in += nstride_line) - d_data[data_offset + in] = d_buf[buf_offset + out++]; - } + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; + const int in_orig = slow + mid*nstride_plane; + const int in = in_orig + fast*nstride_line; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; } }; static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } @@ -334,6 +373,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices @@ -345,28 +385,33 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast*2; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int in = 2*slow + mid*nstride_plane; - for (int fast = 0; fast < nfast; fast++, in += nstride_line) { - d_data[data_offset + in] = d_buf[buf_offset + out++]; - d_data[data_offset + in+1] = d_buf[buf_offset + out++]; - } - } + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*2; + const int in_orig = 2*slow + mid*nstride_plane; + const int in = in_orig + fast*nstride_line; + const int out = out_orig + 2*mid*nfast + 2*fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + d_data[data_offset + in+1] = d_buf[buf_offset + out+1]; } }; static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } /* ---------------------------------------------------------------------- @@ -381,6 +426,7 @@ public: int buf_offset,data_offset; int nfast; // # of elements in fast index int nmid; // # of elements in mid index + int nslow; // # of elements in slow index int nstride_line; // stride between successive mid indices int nstride_plane; // stride between successive slow indices int nqty; // # of values/element @@ -393,29 +439,33 @@ public: data_offset = data_offset_ ; nfast = plan->nfast ; nmid = plan->nmid ; + nslow = plan->nslow ; nstride_line = plan->nstride_line ; nstride_plane = plan->nstride_plane; nqty = plan->nqty ; } KOKKOS_INLINE_FUNCTION - void operator() (const int &slow) const { - int out = slow*nmid*nfast*nqty; - for (int mid = 0; mid < nmid; mid++) { // TODO: use thread team or flatten loop - int instart = nqty*slow + mid*nstride_plane; - for (int fast = 0; fast < nfast; fast++, instart += nstride_line) { - int in = instart; - for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; - } - } + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*nqty; + const int instart = nqty*slow + mid*nstride_plane; + int in = instart + nqty*fast*nstride_line; + int out = out_orig + nqty*mid*nfast + nqty*fast; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; } }; static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) { const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); - Kokkos::parallel_for(nslow,f); + Kokkos::parallel_for(nslow*nmid*nfast,f); DeviceType::fence(); } From 0322ebd0933942608a2f6aadfb946536db251db8 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 09:19:40 -0600 Subject: [PATCH 03/48] WIP: add cuFFT support --- src/KOKKOS/fft3d_kokkos.cpp | 187 +++++++++++++++++++++++++++--------- src/KOKKOS/fft3d_kokkos.h | 39 +++++++- src/KOKKOS/kokkos_type.h | 84 ++++++++-------- 3 files changed, 224 insertions(+), 86 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index c5120f2bda..bd0f46a8d2 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -12,10 +12,15 @@ ------------------------------------------------------------------------- */ #include +#include +#include +#include #include "fft3d_kokkos.h" #include "remap_kokkos.h" +#include "kokkos_type.h" #include "error.h" + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ @@ -65,17 +70,6 @@ void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int ns fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan); } -#include -#include -#include -#include -#include "fft3d_kokkos.h" -#include "remap_kokkos.h" - -// include kissfft implementation - -#include "kissfft_kokkos.h" - #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) @@ -109,8 +103,7 @@ void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int ns plan plan returned by previous call to fft_3d_create_plan ------------------------------------------------------------------------- */ -#include "kokkos_type.h" - +#ifdef FFT_KISSFFT template struct kiss_fft_functor { public: @@ -157,16 +150,20 @@ public: d_out(i,1) *= norm; } }; +#endif + +const int forward = 0; +const int backward = 1; template void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_in, typename ArrayTypes::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) { - int i,total,length,offset,num; + int i,total,length,offset,num,dim; FFT_SCALAR norm; typename ArrayTypes::t_FFT_DATA_1d d_data,d_copy; - typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); typename ArrayTypes::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; - kiss_fft_functor f; + + int dir = (flag == -1) ? forward : backward; // pre-remap to prepare for 1st FFTs if needed // copy = loc for remap result @@ -189,16 +186,25 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total1; length = plan->length1; + dim = 0; - if (flag == -1) - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); - else - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); - Kokkos::parallel_for(total/length,f); - DeviceType::fence(); + #if defined(FFT_FFTW3) + fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + #elif defined(FFT_CUFFT) + cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + #else + typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + kiss_fft_functor f; + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + d_data = d_tmp; + d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + #endif - d_data = d_tmp; - d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); // 1st mid-remap to prepare for 2nd FFTs // copy = loc for remap result @@ -219,16 +225,22 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total2; length = plan->length2; + dim = 1; - if (flag == -1) - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); - else - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); - Kokkos::parallel_for(total/length,f); - DeviceType::fence(); - - d_data = d_tmp; - d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + #if defined(FFT_FFTW3) + fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + #elif defined(FFT_CUFFT) + cufftExecZ2Z(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + #else + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + d_data = d_tmp; + d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + #endif // 2nd mid-remap to prepare for 3rd FFTs // copy = loc for remap result @@ -249,15 +261,23 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total3; length = plan->length3; + dim = 2; - if (flag == -1) - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); - else - f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length); - Kokkos::parallel_for(total/length,f); - DeviceType::fence(); + #if defined(FFT_FFTW3) + fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + #elif defined(FFT_CUFFT) + cufftExecZ2Z(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + // CUDA + #else + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length); + Kokkos::parallel_for(total/length,f); + DeviceType::fence(); + d_data = d_tmp; + #endif - d_data = d_tmp; // post-remap to put data in output format if needed // destination is always out @@ -276,9 +296,13 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F if (flag == 1 && plan->scaled) { norm = plan->norm; num = plan->normnum; + #if defined(FFT_CUFFT) + //scale(ptr, norm, num); ////////////////////// + #else norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); + #endif } } @@ -322,6 +346,13 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; int list[50]; + + #ifdef FFT_FFTW + if (nthreads > 1) { ///////////// + std::cout << "fftw_init_threads: " << fftw_init_threads() << std::endl;; + fftw_plan_with_nthreads(nthreads); + } + #endif // query MPI info @@ -338,13 +369,13 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl plan = new struct fft_plan_3d_kokkos; remapKK = new RemapKokkos(lmp); - kissfftKK = new KissFFTKokkos(); if (plan == NULL) return NULL; // remap from initial distribution to layout needed for 1st set of 1d FFTs // not needed if all procs own entire fast axis initially // first indices = distribution after 1st set of FFTs + if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; else flag = 1; @@ -529,9 +560,41 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl plan->d_scratch = typename ArrayTypes::t_FFT_DATA_1d("fft3d:scratch",scratch_size); } + + kissfftKK = new KissFFTKokkos(); + // system specific pre-computation of 1d FFT coeffs // and scaling normalization +#if defined(FFT_FFTW3) + + for (int dim = 0; dim < 3; dim++) { + for (int dir = 0; dir < 2; dir++) { + plan->plan_1D[dim][dir] = + fftw_plan_many_dft(1, &n[dim],plan->totals[dim]/plan->lengths[dim], + NULL,&n[dim],1,plan->lengths[dim], + NULL,&n[dim],1,plan->lengths[dim], + (dir == forward) ? FFTW_FORWARD : FFTW_BACKWARD, + FFTW_ESTIMATE); + } + } + +#elif defined(FFT_CUFFT) + cufftPlanMany(&(plan->plan_fast), 1, &nfast, + &nfast,1,plan->length1, + &nfast,1,plan->length1, + CUFFT_Z2Z,plan->total1/plan->length1); + + cufftPlanMany(&(plan->plan_mid), 1, &nmid, + &nmid,1,plan->length2, + &nmid,1,plan->length2, + CUFFT_Z2Z,plan->total2/plan->length2); + + cufftPlanMany(&(plan->plan_slow), 1, &nslow, + &nslow,1,plan->length3, + &nslow,1,plan->length3, + CUFFT_Z2Z,plan->total3/plan->length3); +#else plan->cfg_fast_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL); plan->cfg_fast_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL); @@ -556,6 +619,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl plan->cfg_slow_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL); plan->cfg_slow_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL); } +#endif if (scaled == 0) plan->scaled = 0; @@ -583,9 +647,31 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk delete plan; delete remapKK; + delete kissfftKK; } +/* ---------------------------------------------------------------------- + divide n into 2 factors of as equal size as possible +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::bifactor(int n, int *factor1, int *factor2) +{ + int n1,n2,facmax; + + facmax = static_cast (sqrt((double) n)); + + for (n1 = facmax; n1 > 0; n1--) { + n2 = n/n1; + if (n1*n2 == n) { + *factor1 = n1; + *factor2 = n2; + return; + } + } +} + /* ---------------------------------------------------------------------- perform just the 1d FFTs needed by a 3d FFT, no data movement used for timing purposes @@ -602,8 +688,6 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes *plan) { int i,total,length,offset,num; - typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); - kiss_fft_functor f; // total = size of data needed in each dim // length = length of 1d FFT in each dim @@ -627,6 +711,18 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesplan_1D[dim][dir],data,data); +#elif defined(FFT_CUFFT) + cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); +#else + kiss_fft_functor f; + typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); if (flag == -1) { f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length1); Kokkos::parallel_for(total1/length1,f); @@ -652,6 +748,7 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes::fft_3d_1d_only_kokkos(typename ArrayTypesscaled) { FFT_SCALAR norm = plan->norm; num = MIN(plan->normnum,nsize); + #if defined(FFT_CUFFT) + //scale(ptr, norm, num); /////////////// + #else norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); + #endif } } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index 6fcf140228..a85fc61705 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -15,11 +15,31 @@ #define LMP_FFT3D_KOKKOS_H #include "pointers.h" -#include "fft3d.h" #include "kokkos_type.h" #include "remap_kokkos.h" #include "kissfft_kokkos.h" +#define FFT_PRECISION 2 + +typedef double FFT_SCALAR; + +#if defined(FFT_FFTW3) + #include "fftw3.h" + #if defined(FFT_MKL) + #include "fftw/fftw3_mkl.h" + #endif + typedef fftw_complex FFT_DATA; +#elif defined(FFT_CUFFT) + #include "cufft.h" + void scale(double * ptr, double value, int n); + typedef cufftDoubleComplex FFT_DATA; +#else + #include "kissfft_kokkos.h" + #ifndef FFT_KISSFFT + #define FFT_KISSFFT + #endif +#endif + namespace LAMMPS_NS { // ------------------------------------------------------------------------- @@ -46,12 +66,25 @@ struct fft_plan_3d_kokkos { double norm; // normalization factor for rescaling // system specific 1d FFT info +#if defined(FFT_FFTW3) + FFTW_API(plan) plan_fast_forward; + FFTW_API(plan) plan_fast_backward; + FFTW_API(plan) plan_mid_forward; + FFTW_API(plan) plan_mid_backward; + FFTW_API(plan) plan_slow_forward; + FFTW_API(plan) plan_slow_backward; +#elif defined(FFT_CUFFT) + cufftHandle plan_fast; + cufftHandle plan_mid; + cufftHandle plan_slow; +#else kiss_fft_state_kokkos cfg_fast_forward; kiss_fft_state_kokkos cfg_fast_backward; kiss_fft_state_kokkos cfg_mid_forward; kiss_fft_state_kokkos cfg_mid_backward; kiss_fft_state_kokkos cfg_slow_forward; kiss_fft_state_kokkos cfg_slow_backward; +#endif }; template @@ -70,7 +103,9 @@ class FFT3dKokkos : protected Pointers { private: struct fft_plan_3d_kokkos *plan; RemapKokkos *remapKK; +#ifdef FFT_KISSFFT KissFFTKokkos *kissfftKK; +#endif void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos *); @@ -82,6 +117,8 @@ class FFT3dKokkos : protected Pointers { void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos *); + + void bifactor(int, int *, int *); }; } diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 942c2af241..4c1acf47e1 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -174,20 +174,20 @@ t_scalar3 operator * return t_scalar3(a.x*b,a.y*b,a.z*b); } -#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__) - struct double2 { - double x, y; - }; - struct float2 { - float x, y; - }; - struct float4 { - float x, y, z, w; - }; - struct double4 { - double x, y, z, w; - }; -#endif +//#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__) +// struct double2 { +// double x, y; +// }; +// struct float2 { +// float x, y; +// }; +// struct float4 { +// float x, y, z, w; +// }; +// struct double4 { +// double x, y, z, w; +// }; +//#endif // set LMPHostype and LMPDeviceType from Kokkos Default Types typedef Kokkos::DefaultExecutionSpace LMPDeviceType; typedef Kokkos::HostSpace::execution_space LMPHostType; @@ -310,14 +310,14 @@ public: #endif #if PRECISION==1 typedef float LMP_FLOAT; -typedef float2 LMP_FLOAT2; -typedef lmp_float3 LMP_FLOAT3; -typedef float4 LMP_FLOAT4; +//typedef float2 LMP_FLOAT2; +//typedef lmp_float3 LMP_FLOAT3; +//typedef float4 LMP_FLOAT4; #else typedef double LMP_FLOAT; -typedef double2 LMP_FLOAT2; -typedef lmp_double3 LMP_FLOAT3; -typedef double4 LMP_FLOAT4; +//typedef double2 LMP_FLOAT2; +//typedef lmp_double3 LMP_FLOAT3; +//typedef double4 LMP_FLOAT4; #endif #ifndef PREC_FORCE @@ -326,14 +326,14 @@ typedef double4 LMP_FLOAT4; #if PREC_FORCE==1 typedef float F_FLOAT; -typedef float2 F_FLOAT2; -typedef lmp_float3 F_FLOAT3; -typedef float4 F_FLOAT4; +//typedef float2 F_FLOAT2; +//typedef lmp_float3 F_FLOAT3; +//typedef float4 F_FLOAT4; #else typedef double F_FLOAT; -typedef double2 F_FLOAT2; -typedef lmp_double3 F_FLOAT3; -typedef double4 F_FLOAT4; +//typedef double2 F_FLOAT2; +//typedef lmp_double3 F_FLOAT3; +//typedef double4 F_FLOAT4; #endif #ifndef PREC_ENERGY @@ -342,12 +342,12 @@ typedef double4 F_FLOAT4; #if PREC_ENERGY==1 typedef float E_FLOAT; -typedef float2 E_FLOAT2; -typedef float4 E_FLOAT4; +//typedef float2 E_FLOAT2; +//typedef float4 E_FLOAT4; #else typedef double E_FLOAT; -typedef double2 E_FLOAT2; -typedef double4 E_FLOAT4; +//typedef double2 E_FLOAT2; +//typedef double4 E_FLOAT4; #endif struct s_EV_FLOAT { @@ -500,12 +500,12 @@ typedef struct s_FEV_FLOAT FEV_FLOAT; #if PREC_POS==1 typedef float X_FLOAT; -typedef float2 X_FLOAT2; -typedef float4 X_FLOAT4; +//typedef float2 X_FLOAT2; +//typedef float4 X_FLOAT4; #else typedef double X_FLOAT; -typedef double2 X_FLOAT2; -typedef double4 X_FLOAT4; +//typedef double2 X_FLOAT2; +//typedef double4 X_FLOAT4; #endif #ifndef PREC_VELOCITIES @@ -514,22 +514,22 @@ typedef double4 X_FLOAT4; #if PREC_VELOCITIES==1 typedef float V_FLOAT; -typedef float2 V_FLOAT2; -typedef float4 V_FLOAT4; +//typedef float2 V_FLOAT2; +//typedef float4 V_FLOAT4; #else typedef double V_FLOAT; -typedef double2 V_FLOAT2; -typedef double4 V_FLOAT4; +//typedef double2 V_FLOAT2; +//typedef double4 V_FLOAT4; #endif #if PREC_KSPACE==1 typedef float K_FLOAT; -typedef float2 K_FLOAT2; -typedef float4 K_FLOAT4; +//typedef float2 K_FLOAT2; +//typedef float4 K_FLOAT4; #else typedef double K_FLOAT; -typedef double2 K_FLOAT2; -typedef double4 K_FLOAT4; +//typedef double2 K_FLOAT2; +//typedef double4 K_FLOAT4; #endif typedef int T_INT; From ca5aa1f907f46e64d569a363e04d1bb23e95da41 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 13:18:26 -0600 Subject: [PATCH 04/48] Add cuFFT norm functor --- src/KOKKOS/fft3d_kokkos.cpp | 42 +++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index bd0f46a8d2..77b8da054b 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -103,6 +103,28 @@ void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int ns plan plan returned by previous call to fft_3d_create_plan ------------------------------------------------------------------------- */ +#ifdef FFT_CUFFT +template +struct cufft_norm_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_out; + int norm; + + cufft_norm_functor(typename AT::t_FFT_SCALAR_1d &d_out_, int norm_): + d_out(d_out_) + { + norm = norm_; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + d_out(i) *= norm; + } +}; +#endif + #ifdef FFT_KISSFFT template struct kiss_fft_functor { @@ -131,14 +153,14 @@ public: }; template -struct norm_functor { +struct kiss_norm_functor { public: typedef DeviceType device_type; typedef ArrayTypes AT; typename AT::t_FFT_DATA_1d_um d_out; int norm; - norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): + kiss_norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): d_out(d_out_) { norm = norm_; @@ -297,9 +319,13 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F norm = plan->norm; num = plan->normnum; #if defined(FFT_CUFFT) - //scale(ptr, norm, num); ////////////////////// + typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = + typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + cufft_norm_functor f(d_norm_scalar,norm); + Kokkos::parallel_for(num,f); + DeviceType::fence(); #else - norm_functor f(d_out,norm); + kiss_norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); #endif @@ -757,9 +783,13 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesnorm; num = MIN(plan->normnum,nsize); #if defined(FFT_CUFFT) - //scale(ptr, norm, num); /////////////// + typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = + typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + cufft_norm_functor f(d_norm_scalar,norm); + Kokkos::parallel_for(num,f); + DeviceType::fence(); #else - norm_functor f(d_data,norm); + kiss_norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); #endif From 80846e3ed542f6449c4f0f707692078f43536378 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 13:52:34 -0600 Subject: [PATCH 05/48] WIP: add FFTW3 threaded support --- src/KOKKOS/fft3d_kokkos.cpp | 95 ++++++++++++++++++++++++++----------- src/KOKKOS/fft3d_kokkos.h | 3 +- src/KOKKOS/pppm_kokkos.cpp | 4 +- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 77b8da054b..e0ec74da9e 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -31,13 +31,14 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) : + int scaled, int permute, int *nbuf, int usecollective, + int nthreads) : Pointers(lmp) { plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective); + scaled,permute,nbuf,usecollective,nthreads); if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); } @@ -211,7 +212,10 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F dim = 0; #if defined(FFT_FFTW3) - fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + if (flag == -1) + fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else @@ -250,7 +254,10 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F dim = 1; #if defined(FFT_FFTW3) - fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + if (flag == -1) + fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) cufftExecZ2Z(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else @@ -286,7 +293,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F dim = 2; #if defined(FFT_FFTW3) - fftw_execute_dft(plan->plan_1D[dim][dir],d_data.data(),d_data.data()); + if (flag == -1) + fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + #elif defined(FFT_CUFFT) cufftExecZ2Z(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); // CUDA @@ -361,7 +372,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) + int scaled, int permute, int *nbuf, int usecollective, + int nthreads) { struct fft_plan_3d_kokkos *plan; int me,nprocs; @@ -372,13 +384,6 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; int list[50]; - - #ifdef FFT_FFTW - if (nthreads > 1) { ///////////// - std::cout << "fftw_init_threads: " << fftw_init_threads() << std::endl;; - fftw_plan_with_nthreads(nthreads); - } - #endif // query MPI info @@ -586,25 +591,49 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl plan->d_scratch = typename ArrayTypes::t_FFT_DATA_1d("fft3d:scratch",scratch_size); } - - kissfftKK = new KissFFTKokkos(); - // system specific pre-computation of 1d FFT coeffs // and scaling normalization #if defined(FFT_FFTW3) + if (nthreads > 1) + fftw_plan_with_nthreads(nthreads); - for (int dim = 0; dim < 3; dim++) { - for (int dir = 0; dir < 2; dir++) { - plan->plan_1D[dim][dir] = - fftw_plan_many_dft(1, &n[dim],plan->totals[dim]/plan->lengths[dim], - NULL,&n[dim],1,plan->lengths[dim], - NULL,&n[dim],1,plan->lengths[dim], - (dir == forward) ? FFTW_FORWARD : FFTW_BACKWARD, - FFTW_ESTIMATE); - } - } + plan->plan_1D_fast_forward = + fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + NULL,&nfast,1,plan->length1, + NULL,&nfast,1,plan->length1, + FFTW_FORWARD,FFTW_ESTIMATE); + plan->plan_1D_fast_backward = + fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + NULL,&nfast,1,plan->length1, + NULL,&nfast,1,plan->length1, + FFTW_BACKWARD,FFTW_ESTIMATE); + + plan->plan_1D_mid_forward = + fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + NULL,&nmid,1,plan->length2, + NULL,&nmid,1,plan->length2, + FFTW_FORWARD,FFTW_ESTIMATE); + + plan->plan_1D_mid_backward = + fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + NULL,&nmid,1,plan->length2, + NULL,&nmid,1,plan->length2, + FFTW_BACKWARD,FFTW_ESTIMATE); + + + plan->plan_1D_slow_forward = + fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + NULL,&slow,1,plan->length3, + NULL,&slow,1,plan->length3, + FFTW_FORWARD,FFTW_ESTIMATE); + + plan->plan_1D_slow_backward = + fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + NULL,&nslow,1,plan->length3, + NULL,&nslow,1,plan->length3, + FFTW_BACKWARD,FFTW_ESTIMATE); #elif defined(FFT_CUFFT) cufftPlanMany(&(plan->plan_fast), 1, &nfast, &nfast,1,plan->length1, @@ -620,7 +649,9 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl &nslow,1,plan->length3, &nslow,1,plan->length3, CUFFT_Z2Z,plan->total3/plan->length3); -#else +#else + kissfftKK = new KissFFTKokkos(); + plan->cfg_fast_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL); plan->cfg_fast_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL); @@ -741,7 +772,15 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesplan_1D[dim][dir],data,data); + if (flag == -1) { + fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } else { + fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } #elif defined(FFT_CUFFT) cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index a85fc61705..aa94be2c24 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -103,6 +103,7 @@ class FFT3dKokkos : protected Pointers { private: struct fft_plan_3d_kokkos *plan; RemapKokkos *remapKK; + #ifdef FFT_KISSFFT KissFFTKokkos *kissfftKK; #endif @@ -112,7 +113,7 @@ class FFT3dKokkos : protected Pointers { struct fft_plan_3d_kokkos *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int *, int); + int, int, int *, int, int); void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index f25ea2313f..e9aef86064 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -852,12 +852,12 @@ void PPPMKokkos::allocate() fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,lmp->kokkos->nthreads); fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,lmp->kokkos->nthreads); remap = new RemapKokkos(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, From 3c9098d2ee640836944bb87938f16a52b77c5188 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 14:42:23 -0600 Subject: [PATCH 06/48] FFTW3 for Kokkos --- src/KOKKOS/fft3d_kokkos.cpp | 33 +++++++++------- src/KOKKOS/fft3d_kokkos.h | 13 +++--- src/KOKKOS/fftdata_kokkos.h | 48 ----------------------- src/KOKKOS/kissfft_kokkos.h | 22 +++++++++-- src/KOKKOS/pppm_kokkos.cpp | 4 +- src/KOKKOS/remap_kokkos.h | 1 + src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi | 4 +- 7 files changed, 48 insertions(+), 77 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index e0ec74da9e..6bf224ac1a 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -19,6 +19,7 @@ #include "remap_kokkos.h" #include "kokkos_type.h" #include "error.h" +#include "kokkos.h" using namespace LAMMPS_NS; @@ -31,10 +32,10 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective, - int nthreads) : + int scaled, int permute, int *nbuf, int usecollective) : Pointers(lmp) { + int nthreads = lmp->kokkos->nthreads; plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, @@ -335,7 +336,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); - #else + #elif defined(FFT_KISFFT) kiss_norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); @@ -595,41 +596,41 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl // and scaling normalization #if defined(FFT_FFTW3) - if (nthreads > 1) - fftw_plan_with_nthreads(nthreads); +// if (nthreads > 1) +// fftw_plan_with_nthreads(nthreads); - plan->plan_1D_fast_forward = + plan->plan_fast_forward = fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_FORWARD,FFTW_ESTIMATE); - plan->plan_1D_fast_backward = + plan->plan_fast_backward = fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_BACKWARD,FFTW_ESTIMATE); - plan->plan_1D_mid_forward = + plan->plan_mid_forward = fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_FORWARD,FFTW_ESTIMATE); - plan->plan_1D_mid_backward = + plan->plan_mid_backward = fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_BACKWARD,FFTW_ESTIMATE); - plan->plan_1D_slow_forward = + plan->plan_slow_forward = fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, - NULL,&slow,1,plan->length3, - NULL,&slow,1,plan->length3, + NULL,&nslow,1,plan->length3, + NULL,&nslow,1,plan->length3, FFTW_FORWARD,FFTW_ESTIMATE); - plan->plan_1D_slow_backward = + plan->plan_slow_backward = fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, @@ -649,7 +650,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl &nslow,1,plan->length3, &nslow,1,plan->length3, CUFFT_Z2Z,plan->total3/plan->length3); -#else +#else kissfftKK = new KissFFTKokkos(); plan->cfg_fast_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL); @@ -705,7 +706,9 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk delete plan; delete remapKK; +#ifdef FFT_KISSFFT delete kissfftKK; +#endif } /* ---------------------------------------------------------------------- @@ -827,7 +830,7 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); - #else + #elif defined(FFT_KISFFT) kiss_norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); DeviceType::fence(); diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index aa94be2c24..bcf65ec0fd 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -17,21 +17,22 @@ #include "pointers.h" #include "kokkos_type.h" #include "remap_kokkos.h" -#include "kissfft_kokkos.h" #define FFT_PRECISION 2 - typedef double FFT_SCALAR; +// if user set FFTW, it means FFTW3 + +#ifdef FFT_FFTW +#define FFT_FFTW3 +#endif + #if defined(FFT_FFTW3) #include "fftw3.h" - #if defined(FFT_MKL) - #include "fftw/fftw3_mkl.h" - #endif typedef fftw_complex FFT_DATA; + #define FFTW_API(function) fftw_ ## function #elif defined(FFT_CUFFT) #include "cufft.h" - void scale(double * ptr, double value, int n); typedef cufftDoubleComplex FFT_DATA; #else #include "kissfft_kokkos.h" diff --git a/src/KOKKOS/fftdata_kokkos.h b/src/KOKKOS/fftdata_kokkos.h index 64254870ca..2c70c0ae55 100644 --- a/src/KOKKOS/fftdata_kokkos.h +++ b/src/KOKKOS/fftdata_kokkos.h @@ -19,8 +19,6 @@ #ifndef FFT_DATA_KOKKOS_H #define FFT_DATA_KOKKOS_H -#include "kokkos_type.h" - // User-settable FFT precision // FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) @@ -41,55 +39,9 @@ typedef double FFT_SCALAR; // Data types for single-precision complex #if FFT_PRECISION == 1 - -// use a stripped down version of kiss fft as default fft - -#ifndef FFT_KISSFFT -#define FFT_KISSFFT -#endif -#define kiss_fft_scalar_kokkos float -//typedef struct { -// kiss_fft_scalar re; -// kiss_fft_scalar im; -//} FFT_DATA; - -// ------------------------------------------------------------------------- - -// Data types for double-precision complex - #elif FFT_PRECISION == 2 - -// use a stripped down version of kiss fft as default fft - -#ifndef FFT_KISSFFT -#define FFT_KISSFFT -#endif -#define kiss_fft_scalar_kokkos double -//typedef struct { -// kiss_fft_scalar re; -// kiss_fft_scalar im; -//} FFT_DATA; - -// ------------------------------------------------------------------------- - #else #error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)" #endif -// ------------------------------------------------------------------------- - -#define MAXFACTORS 32 -/* e.g. an fft of length 128 has 4 factors - as far as kissfft is concerned: 4*4*4*2 */ -template -struct kiss_fft_state_kokkos { - typedef DeviceType device_type; - typedef ArrayTypes AT; - int nfft; - int inverse; - typename AT::t_int_64 d_factors; - typename AT::t_FFT_DATA_1d d_twiddles; - typename AT::t_FFT_DATA_1d d_scratch; -}; - #endif diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h index a85f0fc9d5..db9e880393 100644 --- a/src/KOKKOS/kissfft_kokkos.h +++ b/src/KOKKOS/kissfft_kokkos.h @@ -128,10 +128,10 @@ do{ (c)[0] *= (s);\ (c)[1] *= (s); }while(0) -#define C_ADD( res, a,b)\ - do { \ - (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ - }while(0) +//#define C_ADD( res, a,b)\ +// do { \ +// (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ +// }while(0) #define C_SUB( res, a,b)\ do { \ @@ -166,6 +166,20 @@ namespace LAMMPS_NS { +#define MAXFACTORS 32 +/* e.g. an fft of length 128 has 4 factors + as far as kissfft is concerned: 4*4*4*2 */ +template +struct kiss_fft_state_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + int nfft; + int inverse; + typename AT::t_int_64 d_factors; + typename AT::t_FFT_DATA_1d d_twiddles; + typename AT::t_FFT_DATA_1d d_scratch; +}; + template class KissFFTKokkos { public: diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index e9aef86064..f25ea2313f 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -852,12 +852,12 @@ void PPPMKokkos::allocate() fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag,lmp->kokkos->nthreads); + 0,0,&tmp,collective_flag); fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag,lmp->kokkos->nthreads); + 0,0,&tmp,collective_flag); remap = new RemapKokkos(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index fe8df20f16..f77ce9fb72 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -18,6 +18,7 @@ #include #include "fftdata_kokkos.h" #include "remap.h" +#include "kokkos_type.h" namespace LAMMPS_NS { diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi index d6c5c566aa..ba34bc25eb 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi +++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi @@ -55,9 +55,9 @@ MPI_LIB = # PATH = path for FFT library # LIB = name of FFT library -FFT_INC = +FFT_INC = -DFFT_CUFFT FFT_PATH = -FFT_LIB = +FFT_LIB = -lcufft # JPEG and/or PNG library # see discussion in Section 2.2 (step 7) of manual From 739097f34b67f0d62285f6e6cd38d5eed71de5a4 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 15:24:47 -0600 Subject: [PATCH 07/48] Error out if using FFTW on GPUs --- src/KOKKOS/fft3d_kokkos.cpp | 15 +++++++++++++-- src/KOKKOS/fft3d_kokkos.h | 5 +++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 6bf224ac1a..24e3be8943 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -11,6 +11,10 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + Contributing authors: Stan Moore (SNL), Sam Mish (U.C. Davis) +------------------------------------------------------------------------- */ + #include #include #include @@ -36,6 +40,13 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; + int ngpus = lmp->kokkos->ngpus; + +#if defined(FFT_FFTW3) + if (ngpus > 0) + lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA"); +#endif + plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, @@ -596,8 +607,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl // and scaling normalization #if defined(FFT_FFTW3) -// if (nthreads > 1) -// fftw_plan_with_nthreads(nthreads); + if (nthreads > 1) + fftw_plan_with_nthreads(nthreads); plan->plan_fast_forward = fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index bcf65ec0fd..be943aedb1 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -135,4 +135,9 @@ The FFT setup for the PPPM solver failed, typically due to lack of memory. This is an unusual error. Check the size of the FFT grid you are requesting. +E: Cannot use the FFTW library with Kokkos CUDA + +Kokkos CUDA doesn't support using the FFTW library to +calculate FFTs for PPPM on GPUs. + */ From fce9439c72a3d7b9426b726493bc500a56f599e4 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 17 Jun 2019 15:25:22 -0600 Subject: [PATCH 08/48] Update docs --- doc/src/kspace_style.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt index 98ec1e64e6..7824331f01 100644 --- a/doc/src/kspace_style.txt +++ b/doc/src/kspace_style.txt @@ -366,9 +366,9 @@ calculation can be performed concurrently on the GPU while other calculations for non-bonded and bonded force calculation are performed on the CPU. -The {pppm/kk} style also performs charge assignment and force -interpolation calculations on the GPU while the FFTs themselves are -calculated on the CPU in non-threaded mode. +The {pppm/kk} style performs charge assignment and force interpolation +calculations, along with the FFTs themselves, on the GPU or threaded +using OpenMP. These accelerated styles are part of the GPU, USER-INTEL, KOKKOS, USER-OMP, and OPT packages respectively. They are only enabled if From 811b28ac972e127a07d82f24607af4ed5012c41c Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Jun 2019 09:43:54 -0600 Subject: [PATCH 09/48] Code cleanup --- src/KOKKOS/fft3d_kokkos.cpp | 19 +- src/KOKKOS/fft3d_kokkos.h | 2 +- src/KOKKOS/fftdata_kokkos.h | 1 - src/KOKKOS/kissfft_kokkos.cpp | 517 ---------------------------------- src/KOKKOS/kissfft_kokkos.h | 87 ++---- src/KOKKOS/remap_kokkos.cpp | 324 ++++----------------- 6 files changed, 80 insertions(+), 870 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 24e3be8943..ba7d4bffaa 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -28,6 +28,9 @@ using namespace LAMMPS_NS; +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + /* ---------------------------------------------------------------------- */ template @@ -83,9 +86,6 @@ void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int ns fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan); } -#define MIN(A,B) ((A) < (B) ? (A) : (B)) -#define MAX(A,B) ((A) > (B) ? (A) : (B)) - /* ---------------------------------------------------------------------- Data layout for 3d FFTs: @@ -238,7 +238,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F else f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); Kokkos::parallel_for(total/length,f); - DeviceType::fence(); d_data = d_tmp; d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); #endif @@ -278,7 +277,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F else f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); Kokkos::parallel_for(total/length,f); - DeviceType::fence(); d_data = d_tmp; d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); #endif @@ -319,7 +317,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F else f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length); Kokkos::parallel_for(total/length,f); - DeviceType::fence(); d_data = d_tmp; #endif @@ -346,11 +343,9 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); - DeviceType::fence(); #elif defined(FFT_KISFFT) kiss_norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); - DeviceType::fence(); #endif } @@ -805,27 +800,21 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes(d_data,d_tmp,plan->cfg_fast_forward,length1); Kokkos::parallel_for(total1/length1,f); - DeviceType::fence(); f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length2); Kokkos::parallel_for(total2/length2,f); - DeviceType::fence(); f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length3); Kokkos::parallel_for(total3/length3,f); - DeviceType::fence(); } else { f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length1); Kokkos::parallel_for(total1/length1,f); - DeviceType::fence(); f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length2); Kokkos::parallel_for(total2/length2,f); - DeviceType::fence(); f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length3); Kokkos::parallel_for(total3/length3,f); - DeviceType::fence(); } #endif @@ -840,11 +829,9 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); - DeviceType::fence(); #elif defined(FFT_KISFFT) kiss_norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); - DeviceType::fence(); #endif } } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index be943aedb1..ba55e4986b 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -30,7 +30,7 @@ typedef double FFT_SCALAR; #if defined(FFT_FFTW3) #include "fftw3.h" typedef fftw_complex FFT_DATA; - #define FFTW_API(function) fftw_ ## function + #define FFTW_API(function) fftw_ ## function #elif defined(FFT_CUFFT) #include "cufft.h" typedef cufftDoubleComplex FFT_DATA; diff --git a/src/KOKKOS/fftdata_kokkos.h b/src/KOKKOS/fftdata_kokkos.h index 2c70c0ae55..262b963802 100644 --- a/src/KOKKOS/fftdata_kokkos.h +++ b/src/KOKKOS/fftdata_kokkos.h @@ -13,7 +13,6 @@ #define MAX(A,B) ((A) > (B) ? (A) : (B)) - // data types for 2d/3d FFTs #ifndef FFT_DATA_KOKKOS_H diff --git a/src/KOKKOS/kissfft_kokkos.cpp b/src/KOKKOS/kissfft_kokkos.cpp index 6b878596ef..a9e68fe723 100644 --- a/src/KOKKOS/kissfft_kokkos.cpp +++ b/src/KOKKOS/kissfft_kokkos.cpp @@ -11,523 +11,6 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ -/* - we use a stripped down KISS FFT as default FFT for LAMMPS - this code is adapted from kiss_fft_v1_2_9 - homepage: http://kissfft.sf.net/ - - changes 2008-2011 by Axel Kohlmeyer -*/ - -#include -#include -#include -#include "kissfft_kokkos.h" -#include "fftdata_kokkos.h" - -#ifndef M_PI -#define M_PI 3.141592653589793238462643383279502884197169399375105820974944 -#endif - -using namespace LAMMPS_NS; - -/* - Copyright (c) 2003-2010, Mark Borgerding - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the author nor the names of any contributors may be used - to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -/* - Explanation of macros dealing with complex math: - - C_MUL(m,a,b) : m = a*b - C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise - C_SUB( res, a,b) : res = a - b - C_SUBFROM( res , a) : res -= a - C_ADDTO( res , a) : res += a - C_EQ( res , a) : res = a - * */ - -#define S_MUL(a,b) ( (a)*(b) ) - -#define C_MUL(m,a,a_index,b,b_index) \ - do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ - (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) - -#define C_FIXDIV(c,div) /* NOOP */ - - - -#define C_MULBYSCALAR( c, s ) \ - do{ (c)[0] *= (s);\ - (c)[1] *= (s); }while(0) - -#define C_ADD( res, a,b)\ - do { \ - (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ - }while(0) - -#define C_SUB( res, a,b)\ - do { \ - (res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \ - }while(0) - -#define C_ADDTO( res , a)\ - do { \ - (res)[0] += (a)[0]; (res)[1] += (a)[1];\ - }while(0) - -#define C_SUBFROM( res , a)\ - do {\ - (res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \ - }while(0) - -#define C_EQ(res, a)\ - do {\ - (res)[0] = (a)[0]; (res)[1] = (a)[1]; \ - }while(0) - - -#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase) -#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase) -#define HALF_OF(x) ((x)*.5) - -#define kf_cexp(x,x_index,phase) \ - do{ \ - (x)(x_index,0) = KISS_FFT_COS(phase);\ - (x)(x_index,1) = KISS_FFT_SIN(phase);\ - }while(0) - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, - const kiss_fft_state_kokkos &st, int m, int Fout_count) -{ - typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; - FFT_SCALAR t[2]; - int Fout2_count; - int tw1_count = 0; - - Fout2_count = Fout_count + m; - do { - //C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2); - - C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count); - tw1_count += fstride; - //C_SUB(*Fout2,*Fout,t); - d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0]; - d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1]; - //C_ADDTO(d_Fout[Fout_count],t); - d_Fout(Fout_count,0) += t[0]; - d_Fout(Fout_count,1) += t[1]; - ++Fout2_count; - ++Fout_count; - } while(--m); -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, - const kiss_fft_state_kokkos &st, const size_t m, int Fout_count) -{ - typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; - FFT_SCALAR scratch[6][2]; - size_t k=m; - const size_t m2=2*m; - const size_t m3=3*m; - - int tw3_count,tw2_count,tw1_count; - tw3_count = tw2_count = tw1_count = 0; - - do { - //C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4); - - C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count); - C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count); - C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count); - - //C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] ); - scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0]; - scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1]; - //C_ADDTO(d_Fout[Fout_count], scratch[1]); - d_Fout(Fout_count,0) += scratch[1][0]; - d_Fout(Fout_count,1) += scratch[1][1]; - //C_ADD(scratch[3],scratch[0],scratch[2]); - scratch[3][0] = scratch[0][0] + scratch[2][0]; - scratch[3][1] = scratch[0][1] + scratch[2][1]; - //C_SUB( scratch[4] , scratch[0] , scratch[2] ); - scratch[4][0] = scratch[0][0] - scratch[2][0]; - scratch[4][1] = scratch[0][1] - scratch[2][1]; - //C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]); - d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0]; - d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1]; - - tw1_count += fstride; - tw2_count += fstride*2; - tw3_count += fstride*3; - //C_ADDTO(d_Fout[Fout_count],scratch[3]); - d_Fout(Fout_count,0) += scratch[3][0]; - d_Fout(Fout_count,1) += scratch[3][1]; - - if (st.inverse) { - d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1]; - d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0]; - d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1]; - d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0]; - } else{ - d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1]; - d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0]; - d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1]; - d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0]; - } - ++Fout_count; - } while(--k); -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, - const kiss_fft_state_kokkos &st, size_t m, int Fout_count) -{ - size_t k=m; - const size_t m2 = 2*m; - typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; - FFT_SCALAR scratch[5][2]; - FFT_SCALAR epi3[2]; - //C_EQ(epi3,d_twiddles[fstride*m]); - epi3[0] = d_twiddles(fstride*m,0); - epi3[1] = d_twiddles(fstride*m,1); - - int tw1_count,tw2_count; - tw1_count = tw2_count = 0; - - do { - //C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3); - - C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count); - C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count); - - //C_ADD(scratch[3],scratch[1],scratch[2]); - scratch[3][0] = scratch[1][0] + scratch[2][0]; - scratch[3][1] = scratch[1][1] + scratch[2][1]; - //C_SUB(scratch[0],scratch[1],scratch[2]); - scratch[0][0] = scratch[1][0] - scratch[2][0]; - scratch[0][1] = scratch[1][1] - scratch[2][1]; - tw1_count += fstride; - tw2_count += fstride*2; - - d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]); - d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]); - - //C_MULBYSCALAR(scratch[0],epi3[1]); - scratch[0][0] *= epi3[1]; - scratch[0][1] *= epi3[1]; - - //C_ADDTO(d_Fout[Fout_count],scratch[3]); - d_Fout(Fout_count,0) += scratch[3][0]; - d_Fout(Fout_count,1) += scratch[3][1]; - - d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1]; - d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0]; - - d_Fout(Fout_count + m,0) -= scratch[0][1]; - d_Fout(Fout_count + m,1) += scratch[0][0]; - - ++Fout_count; - } while(--k); -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, - const kiss_fft_state_kokkos &st, int m, int Fout_count) -{ - int u; - FFT_SCALAR scratch[13][2]; - typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; - FFT_SCALAR ya[2],yb[2]; - //C_EQ(ya,d_twiddles[fstride*m]); - ya[1] = d_twiddles(fstride*m,1); - ya[0] = d_twiddles(fstride*m,0); - //C_EQ(yb,d_twiddles[fstride*2*m]); - yb[1] = d_twiddles(fstride*2*m,1); - yb[0] = d_twiddles(fstride*2*m,0); - - int Fout0_count=Fout_count; - int Fout1_count=Fout0_count+m; - int Fout2_count=Fout0_count+2*m; - int Fout3_count=Fout0_count+3*m; - int Fout4_count=Fout0_count+4*m; - - for ( u=0; u -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, - const kiss_fft_state_kokkos &st, int m, int p, int Fout_count) -{ - int u,k,q1,q; - typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; - FFT_SCALAR t[2]; - int Norig = st.nfft; - - typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch; - for ( u=0; u=Norig) twidx-=Norig; - C_MUL(t,d_scratch,q,d_twiddles,twidx); - //C_ADDTO(d_Fout[k],t); - d_Fout(Fout_count + k,0) += t[0]; - d_Fout(Fout_count + k,1) += t[1]; - } - k += m; - } - } - //KISS_FFT_TMP_FREE(d_scratch); -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f, - const size_t fstride, int in_stride, - const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos &st, int Fout_count, int f_count, int factors_count) -{ - const int beg = Fout_count; - const int p = d_factors[factors_count++]; /* the radix */ - const int m = d_factors[factors_count++]; /* stage's fft length/p */ - const int end = Fout_count + p*m; - - if (m == 1) { - do { - //C_EQ(d_Fout[Fout_count],d_f[f_count]); - d_Fout(Fout_count,0) = d_f(f_count,0); - d_Fout(Fout_count,1) = d_f(f_count,1); - f_count += fstride*in_stride; - } while (++Fout_count != end); - } else { - do { - /* recursive call: - DFT of size m*p performed by doing - p instances of smaller DFTs of size m, - each one takes a decimated version of the input */ - kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count); - f_count += fstride*in_stride; - } while( (Fout_count += m) != end); - } - - Fout_count=beg; - - /* recombine the p smaller DFTs */ - switch (p) { - case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break; - case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break; - case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break; - case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break; - default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break; - } -} - -/* facbuf is populated by p1,m1,p2,m2, ... - where - p[i] * m[i] = m[i-1] - m0 = n */ - -template -int KissFFTKokkos::kf_factor(int n, HAT::t_int_64 h_facbuf) -{ - int p=4, nf=0; - double floor_sqrt; - floor_sqrt = floor( sqrt((double)n) ); - int facbuf_count = 0; - int p_max = 0; - - /* factor out the remaining powers of 4, powers of 2, - and then any other remaining primes */ - do { - if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */ - while (n % p) { - switch (p) { - case 4: p = 2; break; - case 2: p = 3; break; - default: p += 2; break; - } - if (p > floor_sqrt) - p = n; /* no more factors, skip to end */ - } - n /= p; - h_facbuf[facbuf_count++] = p; - h_facbuf[facbuf_count++] = n; - p_max = MAX(p,p_max); - ++nf; - } while (n > 1); - return p_max; -} - -/* - * User-callable function to allocate all necessary storage space for the fft. - * - * The return value is a contiguous block of memory, allocated with malloc. As such, - * It can be freed with free(), rather than a kiss_fft-specific function. - */ - -template -kiss_fft_state_kokkos KissFFTKokkos::kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem) -{ - kiss_fft_state_kokkos st; - int i; - st.nfft = nfft; - st.inverse = inverse_fft; - - typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64(); - typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d(); - - if (nfft > 0) { - k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2); - k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft); - - for (i=0;i(); - k_factors.template sync(); - st.d_factors = k_factors.template view(); - - k_twiddles.template modify(); - k_twiddles.template sync(); - st.d_twiddles = k_twiddles.template view(); - - return st; -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) -{ - //if (d_fin == d_fout) { - // // NOTE: this is not really an in-place FFT algorithm. - // // It just performs an out-of-place FFT into a temp buffer - // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",st.nfft); - // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - // memcpy(d_fout,d_tmpbuf,sizeof(FFT_DATA)*st.nfft); - //} else { - kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - //} -} - -template -KOKKOS_INLINE_FUNCTION -void KissFFTKokkos::kiss_fft_kokkos(const kiss_fft_state_kokkos &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset) -{ - kiss_fft_stride(cfg,d_fin,d_fout,1,offset); -} - namespace LAMMPS_NS { template class KissFFTKokkos; diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h index db9e880393..4f494e8436 100644 --- a/src/KOKKOS/kissfft_kokkos.h +++ b/src/KOKKOS/kissfft_kokkos.h @@ -17,14 +17,10 @@ homepage: http://kissfft.sf.net/ changes 2008-2011 by Axel Kohlmeyer + + KISS FFT ported to Kokkos by Stan Moore (SNL) */ -#ifndef LMP_KISSFFT_KOKKOS_H -#define LMP_KISSFFT_KOKKOS_H - -#include "fftdata_kokkos.h" -#include "kokkos_type.h" - /* Copyright (c) 2003-2010, Mark Borgerding @@ -59,50 +55,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + +#ifndef LMP_KISSFFT_KOKKOS_H +#define LMP_KISSFFT_KOKKOS_H + #include #include #include #include "kissfft_kokkos.h" #include "fftdata_kokkos.h" +#include "kokkos_type.h" #ifndef M_PI #define M_PI 3.141592653589793238462643383279502884197169399375105820974944 #endif -/* - Copyright (c) 2003-2010, Mark Borgerding - - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - - * Neither the author nor the names of any contributors may be used - to endorse or promote products derived from this software without - specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - /* Explanation of macros dealing with complex math: @@ -112,7 +79,7 @@ C_SUBFROM( res , a) : res -= a C_ADDTO( res , a) : res += a C_EQ( res , a) : res = a - * */ + #define S_MUL(a,b) ( (a)*(b) ) @@ -120,18 +87,16 @@ do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) -#define C_FIXDIV(c,div) /* NOOP */ - - +#define C_FIXDIV(c,div) // NOOP #define C_MULBYSCALAR( c, s ) \ do{ (c)[0] *= (s);\ (c)[1] *= (s); }while(0) -//#define C_ADD( res, a,b)\ -// do { \ -// (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ -// }while(0) +#define C_ADD( res, a,b)\ + do { \ + (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ + }while(0) #define C_SUB( res, a,b)\ do { \ @@ -152,7 +117,7 @@ do {\ (res)[0] = (a)[0]; (res)[1] = (a)[1]; \ }while(0) - +*/ #define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase) #define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase) @@ -164,6 +129,7 @@ (x)(x_index,1) = KISS_FFT_SIN(phase);\ }while(0) + namespace LAMMPS_NS { #define MAXFACTORS 32 @@ -439,7 +405,6 @@ class KissFFTKokkos { k += m; } } - //KISS_FFT_TMP_FREE(d_scratch); } KOKKOS_INLINE_FUNCTION @@ -561,18 +526,17 @@ class KissFFTKokkos { KOKKOS_INLINE_FUNCTION static void kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) { - //if (d_fin == d_fout) { - // // NOTE: this is not really an in-place FFT algorithm. - // // It just performs an out-of-place FFT into a temp buffer - // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",st.nfft); - // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - // memcpy(d_fout,d_tmpbuf,sizeof(FFT_DATA)*st.nfft); - //} else { - kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - //} + if (d_fin.data() == d_fout.data()) { + // NOTE: this is not really an in-place FFT algorithm. + // It just performs an out-of-place FFT into a temp buffer + typename AT::t_FFT_DATA_1d d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1)); + kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + Kokkos::deep_copy(d_fout,d_tmpbuf); + } else { + kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + } } - KOKKOS_INLINE_FUNCTION static void kiss_fft_kokkos(const kiss_fft_state_kokkos &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset) { @@ -582,5 +546,4 @@ class KissFFTKokkos { }; } - -#endif \ No newline at end of file +#endif diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 048d04ed28..23d99394fb 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -12,11 +12,17 @@ ------------------------------------------------------------------------- */ #include +#include +#include #include "remap_kokkos.h" +#include "pack_kokkos.h" #include "error.h" using namespace LAMMPS_NS; +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + /* ---------------------------------------------------------------------- */ template @@ -57,13 +63,6 @@ void RemapKokkos::perform(typename AT::t_FFT_SCALAR_1d d_in, typenam remap_3d_kokkos(d_in,d_out,d_buf,plan); } -#include -#include - -#include "pack_kokkos.h" - -#define MIN(A,B) ((A) < (B) ? (A) : (B)) -#define MAX(A,B) ((A) > (B) ? (A) : (B)) /* ---------------------------------------------------------------------- Data layout for 3d remaps: @@ -106,157 +105,65 @@ template void RemapKokkos::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf, struct remap_plan_3d_kokkos *plan) { + // collective flag not yet supported + // use point-to-point communication - //if (!plan->usecollective) { - int i,isend,irecv; - typename AT::t_FFT_SCALAR_1d d_scratch; + int i,isend,irecv; + typename AT::t_FFT_SCALAR_1d d_scratch; - if (plan->memory == 0) - d_scratch = d_buf; - else - d_scratch = plan->d_scratch; + if (plan->memory == 0) + d_scratch = d_buf; + else + d_scratch = plan->d_scratch; - // post all recvs into scratch space + // post all recvs into scratch space - for (irecv = 0; irecv < plan->nrecv; irecv++) { - double* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; - MPI_Irecv(scratch,plan->recv_size[irecv], - MPI_FFT_SCALAR,plan->recv_proc[irecv],0, - plan->comm,&plan->request[irecv]); - } + for (irecv = 0; irecv < plan->nrecv; irecv++) { + double* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; + MPI_Irecv(scratch,plan->recv_size[irecv], + MPI_FFT_SCALAR,plan->recv_proc[irecv],0, + plan->comm,&plan->request[irecv]); + } - // send all messages to other procs + // send all messages to other procs - for (isend = 0; isend < plan->nsend; isend++) { - int in_offset = plan->send_offset[isend]; - plan->pack(d_in,in_offset, - plan->d_sendbuf,0,&plan->packplan[isend]); - MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR, - plan->send_proc[isend],0,plan->comm); - } + for (isend = 0; isend < plan->nsend; isend++) { + int in_offset = plan->send_offset[isend]; + plan->pack(d_in,in_offset, + plan->d_sendbuf,0,&plan->packplan[isend]); + MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } - // copy in -> scratch -> out for self data + // copy in -> scratch -> out for self data - if (plan->self) { - isend = plan->nsend; - irecv = plan->nrecv; + if (plan->self) { + isend = plan->nsend; + irecv = plan->nrecv; - int in_offset = plan->send_offset[isend]; - int scratch_offset = plan->recv_bufloc[irecv]; - int out_offset = plan->recv_offset[irecv]; + int in_offset = plan->send_offset[isend]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; - plan->pack(d_in,in_offset, - d_scratch,scratch_offset, - &plan->packplan[isend]); - plan->unpack(d_scratch,scratch_offset, - d_out,out_offset,&plan->unpackplan[irecv]); - } + plan->pack(d_in,in_offset, + d_scratch,scratch_offset, + &plan->packplan[isend]); + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } - // unpack all messages from scratch -> out + // unpack all messages from scratch -> out - for (i = 0; i < plan->nrecv; i++) { - MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); + for (i = 0; i < plan->nrecv; i++) { + MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); - int scratch_offset = plan->recv_bufloc[irecv]; - int out_offset = plan->recv_offset[irecv]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; - plan->unpack(d_scratch,scratch_offset, - d_out,out_offset,&plan->unpackplan[irecv]); - } - - // use All2Allv collective for remap communication - - //} else { - // if (plan->commringlen > 0) { - // int isend,irecv; - // - // // create send and recv buffers for alltoallv collective - // - // int sendBufferSize = 0; - // int recvBufferSize = 0; - // for (int i=0;insend;i++) - // sendBufferSize += plan->send_size[i]; - // for (int i=0;inrecv;i++) - // recvBufferSize += plan->recv_size[i]; - // - // typename AT::t_FFT_SCALAR_1d packedSendBuffer - // = (typename AT::t_FFT_SCALAR_1d ) malloc(sizeof(FFT_SCALAR) * sendBufferSize); - // typename AT::t_FFT_SCALAR_1d packedRecvBuffer - // = (typename AT::t_FFT_SCALAR_1d ) malloc(sizeof(FFT_SCALAR) * recvBufferSize); - // - // int *sendcnts = (int *) malloc(sizeof(int) * plan->commringlen); - // int *rcvcnts = (int *) malloc(sizeof(int) * plan->commringlen); - // int *sdispls = (int *) malloc(sizeof(int) * plan->commringlen); - // int *rdispls = (int *) malloc(sizeof(int) * plan->commringlen); - // int *nrecvmap = (int *) malloc(sizeof(int) * plan->commringlen); - // - // // create and populate send data, count and displacement buffers - // - // int currentSendBufferOffset = 0; - // for (isend = 0; isend < plan->commringlen; isend++) { - // sendcnts[isend] = 0; - // sdispls[isend] = 0; - // int foundentry = 0; - // for (int i=0;(insend && !foundentry); i++) { - // if (plan->send_proc[i] == plan->commringlist[isend]) { - // foundentry = 1; - // sendcnts[isend] = plan->send_size[i]; - // sdispls[isend] = currentSendBufferOffset; - // plan->pack(&d_in[plan->send_offset[i]], - // &packedSendBuffer[currentSendBufferOffset], - // &plan->packplan[i]); - // currentSendBufferOffset += plan->send_size[i]; - // } - // } - // } - // - // // create and populate recv count and displacement buffers - // - // int currentRecvBufferOffset = 0; - // for (irecv = 0; irecv < plan->commringlen; irecv++) { - // rcvcnts[irecv] = 0; - // rdispls[irecv] = 0; - // nrecvmap[irecv] = -1; - // int foundentry = 0; - // for (int i=0;(inrecv && !foundentry); i++) { - // if (plan->recv_proc[i] == plan->commringlist[irecv]) { - // foundentry = 1; - // rcvcnts[irecv] = plan->recv_size[i]; - // rdispls[irecv] = currentRecvBufferOffset; - // currentRecvBufferOffset += plan->recv_size[i]; - // nrecvmap[irecv] = i; - // } - // } - // } - // - // MPI_Alltoallv(packedSendBuffer, sendcnts, sdispls, - // MPI_FFT_SCALAR, packedRecvBuffer, rcvcnts, - // rdispls, MPI_FFT_SCALAR, plan->comm); - // - // // unpack the data from the recv buffer into out - // - // currentRecvBufferOffset = 0; - // for (irecv = 0; irecv < plan->commringlen; irecv++) { - // if (nrecvmap[irecv] > -1) { - // plan->unpack(&packedRecvBuffer[currentRecvBufferOffset], - // &d_out[plan->recv_offset[nrecvmap[irecv]]], - // &plan->unpackplan[nrecvmap[irecv]]); - // currentRecvBufferOffset += plan->recv_size[nrecvmap[irecv]]; - // } - // } - // - // // free temporary data structures - // - // free(sendcnts); - // free(rcvcnts); - // free(sdispls); - // free(rdispls); - // free(nrecvmap); - // free(packedSendBuffer); - // free(packedRecvBuffer); - // } - //} + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } } /* ---------------------------------------------------------------------- @@ -508,109 +415,6 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } - // create sub-comm rank list - - //if (plan->usecollective) { - // plan->commringlist = NULL; - // - // // merge recv and send rank lists - // // ask Steve Plimpton about method to more accurately determine - // // maximum number of procs contributing to pencil - // - // int maxcommsize = nprocs; - // int *commringlist = (int *) malloc(maxcommsize*sizeof(int)); - // int commringlen = 0; - // - // for (int i = 0; i < nrecv; i++) { - // commringlist[i] = plan->recv_proc[i]; - // commringlen++; - // } - // - // for (int i = 0; i < nsend; i++) { - // int foundentry = 0; - // for (int j=0;jsend_proc[i]) foundentry = 1; - // if (!foundentry) { - // commringlist[commringlen] = plan->send_proc[i]; - // commringlen++; - // } - // } - // - // // sort initial commringlist - // - // int swap = 0; - // for (int c = 0 ; c < (commringlen - 1); c++) { - // for (int d = 0 ; d < commringlen - c - 1; d++) { - // if (commringlist[d] > commringlist[d+1]) { - // swap = commringlist[d]; - // commringlist[d] = commringlist[d+1]; - // commringlist[d+1] = swap; - // } - // } - // } - // - // // collide all inarray extents for the comm ring with all output - // // extents and all outarray extents for the comm ring with all input - // // extents - if there is a collison add the rank to the comm ring, - // // keep iterating until nothing is added to commring - // - // int commringappend = 1; - // while (commringappend) { - // int newcommringlen = commringlen; - // commringappend = 0; - // for (int i=0;i commringlist[d+1]) { - // swap = commringlist[d]; - // commringlist[d] = commringlist[d+1]; - // commringlist[d+1] = swap; - // } - // } - // } - // - // // resize commringlist to final size - // - // commringlist = (int *) realloc(commringlist, commringlen*sizeof(int)); - // - // // set the plan->commringlist - // - // plan->commringlen = commringlen; - // plan->commringlist = commringlist; - //} - // plan->nrecv = # of recvs not including self // for collectives include self in the nsend list @@ -653,28 +457,9 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat } } - // if using collective and the commringlist is NOT empty create a - // communicator for the plan based off an MPI_Group created with - // ranks from the commringlist - - //if ((plan->usecollective && (plan->commringlen > 0))) { - // MPI_Group orig_group, new_group; - // MPI_Comm_group(comm, &orig_group); - // MPI_Group_incl(orig_group, plan->commringlen, - // plan->commringlist, &new_group); - // MPI_Comm_create(comm, new_group, &plan->comm); - //} - - // if using collective and the comm ring list is empty create - // a communicator for the plan with an empty group - - //else if ((plan->usecollective) && (plan->commringlen == 0)) { - // MPI_Comm_create(comm, MPI_GROUP_EMPTY, &plan->comm); - //} - // not using collective - dup comm - /*else*/ MPI_Comm_dup(comm,&plan->comm); + MPI_Comm_dup(comm,&plan->comm); // return pointer to plan @@ -695,11 +480,6 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ if (!((plan->usecollective) && (plan->commringlen == 0))) MPI_Comm_free(&plan->comm); - //if (plan->usecollective) { - // if (plan->commringlist != NULL) - // free(plan->commringlist); - //} - // free internal arrays if (plan->nsend || plan->self) { @@ -707,7 +487,6 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->send_size); free(plan->send_proc); free(plan->packplan); - //if (plan->d_sendbuf) free(plan->d_sendbuf); } if (plan->nrecv || plan->self) { @@ -717,7 +496,6 @@ void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_ free(plan->recv_bufloc); free(plan->request); free(plan->unpackplan); - //if (plan->scratch) free(plan->scratch); } // free plan itself From f34c899a19d6a7f16fe3c320786d76c31c037890 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Jun 2019 11:09:51 -0600 Subject: [PATCH 10/48] Reduce compiler warnings and fix a few issues --- src/KOKKOS/fft3d_kokkos.cpp | 55 ++++++++++++++----------------------- src/KOKKOS/kissfft_kokkos.h | 19 +++++++------ 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index ba7d4bffaa..9d85da26d1 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -43,9 +43,9 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; - int ngpus = lmp->kokkos->ngpus; #if defined(FFT_FFTW3) + int ngpus = lmp->kokkos->ngpus; if (ngpus > 0) lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA"); #endif @@ -187,19 +187,13 @@ public: }; #endif -const int forward = 0; -const int backward = 1; - template void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_in, typename ArrayTypes::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) { - int i,total,length,offset,num,dim; - FFT_SCALAR norm; + int total,length; typename ArrayTypes::t_FFT_DATA_1d d_data,d_copy; typename ArrayTypes::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; - int dir = (flag == -1) ? forward : backward; - // pre-remap to prepare for 1st FFTs if needed // copy = loc for remap result @@ -221,7 +215,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total1; length = plan->length1; - dim = 0; #if defined(FFT_FFTW3) if (flag == -1) @@ -262,7 +255,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total2; length = plan->length2; - dim = 1; #if defined(FFT_FFTW3) if (flag == -1) @@ -300,7 +292,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F total = plan->total3; length = plan->length3; - dim = 2; #if defined(FFT_FFTW3) if (flag == -1) @@ -336,14 +327,14 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F // scaling if required if (flag == 1 && plan->scaled) { - norm = plan->norm; - num = plan->normnum; + int norm = plan->norm; + FFT_SCALAR num = plan->normnum; #if defined(FFT_CUFFT) typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); - #elif defined(FFT_KISFFT) + #elif defined(FFT_KISSFFT) kiss_norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); #endif @@ -384,13 +375,12 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl { struct fft_plan_3d_kokkos *plan; int me,nprocs; - int i,num,flag,remapflag,fftflag; + int flag,remapflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; int out_size,first_size,second_size,third_size,copy_size,scratch_size; int np1,np2,ip1,ip2; - int list[50]; // query MPI info @@ -753,8 +743,6 @@ template void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_data, int nsize, int flag, struct fft_plan_3d_kokkos *plan) { - int i,total,length,offset,num; - // total = size of data needed in each dim // length = length of 1d FFT in each dim // total/length = # of 1d FFTs in each dim @@ -777,23 +765,20 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesplan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - } else { - fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - } + if (flag == -1) { + fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } else { + fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); #else kiss_fft_functor f; typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); @@ -823,13 +808,13 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesscaled) { FFT_SCALAR norm = plan->norm; - num = MIN(plan->normnum,nsize); + int num = MIN(plan->normnum,nsize); #if defined(FFT_CUFFT) typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); - #elif defined(FFT_KISFFT) + #elif defined(FFT_KISSFFT) kiss_norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); #endif diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h index 4f494e8436..ca11c6e6a5 100644 --- a/src/KOKKOS/kissfft_kokkos.h +++ b/src/KOKKOS/kissfft_kokkos.h @@ -79,7 +79,7 @@ C_SUBFROM( res , a) : res -= a C_ADDTO( res , a) : res += a C_EQ( res , a) : res = a - +*/ #define S_MUL(a,b) ( (a)*(b) ) @@ -87,6 +87,7 @@ do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) +/* #define C_FIXDIV(c,div) // NOOP #define C_MULBYSCALAR( c, s ) \ @@ -526,15 +527,15 @@ class KissFFTKokkos { KOKKOS_INLINE_FUNCTION static void kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) { - if (d_fin.data() == d_fout.data()) { - // NOTE: this is not really an in-place FFT algorithm. - // It just performs an out-of-place FFT into a temp buffer - typename AT::t_FFT_DATA_1d d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1)); - kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - Kokkos::deep_copy(d_fout,d_tmpbuf); - } else { + //if (d_fin.data() == d_fout.data()) { + // // NOTE: this is not really an in-place FFT algorithm. + // // It just performs an out-of-place FFT into a temp buffer + // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1)); + // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + // Kokkos::deep_copy(d_fout,d_tmpbuf); + //} else { kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); - } + //} } KOKKOS_INLINE_FUNCTION From 9f40899ad79ae5cd4723e58d6fd75001c0e9aa5e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Jun 2019 12:10:41 -0600 Subject: [PATCH 11/48] Error out if using cuFFT with Kokkos CUDA on host CPUs --- src/KOKKOS/fft3d_kokkos.cpp | 11 +++++++---- src/KOKKOS/fft3d_kokkos.h | 11 ++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 9d85da26d1..1c8aa7a3be 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -43,11 +43,15 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; + int ngpus = lmp->kokkos->ngpus; + ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; #if defined(FFT_FFTW3) - int ngpus = lmp->kokkos->ngpus; - if (ngpus > 0) - lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA"); + if (ngpus > 0 && execution_space == Device) + lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs"); +#elif defined(FFT_CUFFT) + if (ngpus > 0 && execution_space == Host) + lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs"); #endif plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, @@ -339,7 +343,6 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F Kokkos::parallel_for(num,f); #endif } - } /* ---------------------------------------------------------------------- diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index ba55e4986b..798f5b51bd 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -135,9 +135,14 @@ The FFT setup for the PPPM solver failed, typically due to lack of memory. This is an unusual error. Check the size of the FFT grid you are requesting. -E: Cannot use the FFTW library with Kokkos CUDA +E: Cannot use the FFTW library with Kokkos CUDA on GPUs -Kokkos CUDA doesn't support using the FFTW library to -calculate FFTs for PPPM on GPUs. +Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for +PPPM on GPUs. + +E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs + +Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs +for PPPM on the host CPUs, use KISS FFT instead. */ From fafd824b74e487e6279ace0eac2951d6d85730a9 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Jun 2019 13:55:02 -0600 Subject: [PATCH 12/48] Small tweaks --- src/KOKKOS/fft3d_kokkos.cpp | 53 +++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 1c8aa7a3be..c90b35deb8 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -192,11 +192,11 @@ public: #endif template -void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_in, typename ArrayTypes::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) +void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) { int total,length; - typename ArrayTypes::t_FFT_DATA_1d d_data,d_copy; - typename ArrayTypes::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; + typename AT::t_FFT_DATA_1d d_data,d_copy; + typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; // pre-remap to prepare for 1st FFTs if needed // copy = loc for remap result @@ -205,9 +205,9 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F if (plan->pre_target == 0) d_copy = d_out; else d_copy = plan->d_copy; - d_in_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_in.data(),d_in.size()); - d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); - d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size()); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar, d_scratch_scalar, plan->pre_plan); @@ -228,7 +228,8 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F #elif defined(FFT_CUFFT) cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else - typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + typename AT::t_FFT_DATA_1d d_tmp = + typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); kiss_fft_functor f; if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); @@ -236,7 +237,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); Kokkos::parallel_for(total/length,f); d_data = d_tmp; - d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); #endif @@ -246,9 +247,9 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F if (plan->mid1_target == 0) d_copy = d_out; else d_copy = plan->d_copy; - d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2); - d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2); - d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2); + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2); remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, d_scratch_scalar, plan->mid1_plan); @@ -274,7 +275,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); Kokkos::parallel_for(total/length,f); d_data = d_tmp; - d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); #endif // 2nd mid-remap to prepare for 3rd FFTs @@ -283,9 +284,9 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F if (plan->mid2_target == 0) d_copy = d_out; else d_copy = plan->d_copy; - d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); - d_copy_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); - d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, d_scratch_scalar, plan->mid2_plan); @@ -320,9 +321,9 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F // destination is always out if (plan->post_plan) { - d_data_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); - d_out_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(d_out.data(),d_out.size()); - d_scratch_scalar = typename ArrayTypes::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar, d_scratch_scalar, plan->post_plan); @@ -334,8 +335,8 @@ void FFT3dKokkos::fft_3d_kokkos(typename ArrayTypes::t_F int norm = plan->norm; FFT_SCALAR num = plan->normnum; #if defined(FFT_CUFFT) - typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = - typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + typename AT::t_FFT_SCALAR_1d d_norm_scalar = + typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); #elif defined(FFT_KISSFFT) @@ -584,11 +585,11 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl *nbuf = copy_size + scratch_size; if (copy_size) { - plan->d_copy = typename ArrayTypes::t_FFT_DATA_1d("fft3d:copy",copy_size); + plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size); } if (scratch_size) { - plan->d_scratch = typename ArrayTypes::t_FFT_DATA_1d("fft3d:scratch",scratch_size); + plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size); } // system specific pre-computation of 1d FFT coeffs @@ -743,7 +744,7 @@ void FFT3dKokkos::bifactor(int n, int *factor1, int *factor2) ------------------------------------------------------------------------- */ template -void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypes::t_FFT_DATA_1d d_data, int nsize, int flag, +void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag, struct fft_plan_3d_kokkos *plan) { // total = size of data needed in each dim @@ -784,7 +785,7 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesplan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); #else kiss_fft_functor f; - typename ArrayTypes::t_FFT_DATA_1d d_tmp = typename ArrayTypes::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); + typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); if (flag == -1) { f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length1); Kokkos::parallel_for(total1/length1,f); @@ -813,8 +814,8 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename ArrayTypesnorm; int num = MIN(plan->normnum,nsize); #if defined(FFT_CUFFT) - typename ArrayTypes::t_FFT_SCALAR_1d d_norm_scalar = - typename ArrayTypes::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + typename AT::t_FFT_SCALAR_1d d_norm_scalar = + typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); cufft_norm_functor f(d_norm_scalar,norm); Kokkos::parallel_for(num,f); #elif defined(FFT_KISSFFT) From c515b7dc985083a4718325a054d9c7c0c32ad245 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 18 Jun 2019 14:02:22 -0600 Subject: [PATCH 13/48] Turn of View initialization for d_tmp --- src/KOKKOS/fft3d_kokkos.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index c90b35deb8..83015490e0 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -229,7 +229,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else typename AT::t_FFT_DATA_1d d_tmp = - typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); kiss_fft_functor f; if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); @@ -237,7 +237,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); Kokkos::parallel_for(total/length,f); d_data = d_tmp; - d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); #endif @@ -275,7 +275,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); Kokkos::parallel_for(total/length,f); d_data = d_tmp; - d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_in.dimension_0()); + d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); #endif // 2nd mid-remap to prepare for 3rd FFTs From 5aae0956774b522bbe6b7ea71d07476e2c326d46 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 1 Jul 2019 17:58:25 -0600 Subject: [PATCH 14/48] Add support for single precision --- src/KOKKOS/fft3d_kokkos.cpp | 56 ++++++++++++++++++------------------- src/KOKKOS/fft3d_kokkos.h | 23 +++++++++++---- src/KOKKOS/remap_kokkos.cpp | 2 +- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 83015490e0..6c1ed616da 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -222,11 +222,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ #if defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); @@ -263,11 +263,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ #if defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); @@ -300,13 +300,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ #if defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); - + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); - // CUDA + cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); @@ -600,38 +598,38 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl fftw_plan_with_nthreads(nthreads); plan->plan_fast_forward = - fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_fast_backward = - fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_mid_forward = - fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_mid_backward = - fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_slow_forward = - fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_slow_backward = - fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_BACKWARD,FFTW_ESTIMATE); @@ -639,17 +637,17 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl cufftPlanMany(&(plan->plan_fast), 1, &nfast, &nfast,1,plan->length1, &nfast,1,plan->length1, - CUFFT_Z2Z,plan->total1/plan->length1); + CUFFT_TYPE,plan->total1/plan->length1); cufftPlanMany(&(plan->plan_mid), 1, &nmid, &nmid,1,plan->length2, &nmid,1,plan->length2, - CUFFT_Z2Z,plan->total2/plan->length2); + CUFFT_TYPE,plan->total2/plan->length2); cufftPlanMany(&(plan->plan_slow), 1, &nslow, &nslow,1,plan->length3, &nslow,1,plan->length3, - CUFFT_Z2Z,plan->total3/plan->length3); + CUFFT_TYPE,plan->total3/plan->length3); #else kissfftKK = new KissFFTKokkos(); @@ -771,18 +769,18 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d #if defined(FFT_FFTW3) if (flag == -1) { - fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); } else { - fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); } #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); #else kiss_fft_functor f; typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index 798f5b51bd..cf5e95b604 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -17,9 +17,7 @@ #include "pointers.h" #include "kokkos_type.h" #include "remap_kokkos.h" - -#define FFT_PRECISION 2 -typedef double FFT_SCALAR; +#include "fftdata_kokkos.h" // if user set FFTW, it means FFTW3 @@ -29,11 +27,24 @@ typedef double FFT_SCALAR; #if defined(FFT_FFTW3) #include "fftw3.h" - typedef fftw_complex FFT_DATA; - #define FFTW_API(function) fftw_ ## function + #if defined(FFT_SINGLE) + typedef fftwf_complex FFT_DATA; + #define FFTW_API(function) fftwf_ ## function + #else + typedef fftw_complex FFT_DATA; + #define FFTW_API(function) fftw_ ## function + #endif #elif defined(FFT_CUFFT) #include "cufft.h" - typedef cufftDoubleComplex FFT_DATA; + #if defined(FFT_SINGLE) + #define cufftExec cufftExecC2C + #define CUFFT_TYPE CUFFT_C2C + typedef cufftComplex FFT_DATA; + #else + #define cufftExec cufftExecZ2Z + #define CUFFT_TYPE CUFFT_Z2Z + typedef cufftDoubleComplex FFT_DATA; + #endif #else #include "kissfft_kokkos.h" #ifndef FFT_KISSFFT diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index 23d99394fb..4d4300daf0 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -120,7 +120,7 @@ void RemapKokkos::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, // post all recvs into scratch space for (irecv = 0; irecv < plan->nrecv; irecv++) { - double* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; + FFT_SCALAR* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; MPI_Irecv(scratch,plan->recv_size[irecv], MPI_FFT_SCALAR,plan->recv_proc[irecv],0, plan->comm,&plan->request[irecv]); From dcf9351514f5701200e26550645b667442a9371d Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 2 Jul 2019 08:27:35 -0600 Subject: [PATCH 15/48] Removed commented out code --- src/KOKKOS/kokkos_type.h | 42 ---------------------------------------- 1 file changed, 42 deletions(-) diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 4c1acf47e1..2a19413e0a 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -174,20 +174,6 @@ t_scalar3 operator * return t_scalar3(a.x*b,a.y*b,a.z*b); } -//#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__) -// struct double2 { -// double x, y; -// }; -// struct float2 { -// float x, y; -// }; -// struct float4 { -// float x, y, z, w; -// }; -// struct double4 { -// double x, y, z, w; -// }; -//#endif // set LMPHostype and LMPDeviceType from Kokkos Default Types typedef Kokkos::DefaultExecutionSpace LMPDeviceType; typedef Kokkos::HostSpace::execution_space LMPHostType; @@ -310,14 +296,8 @@ public: #endif #if PRECISION==1 typedef float LMP_FLOAT; -//typedef float2 LMP_FLOAT2; -//typedef lmp_float3 LMP_FLOAT3; -//typedef float4 LMP_FLOAT4; #else typedef double LMP_FLOAT; -//typedef double2 LMP_FLOAT2; -//typedef lmp_double3 LMP_FLOAT3; -//typedef double4 LMP_FLOAT4; #endif #ifndef PREC_FORCE @@ -326,14 +306,8 @@ typedef double LMP_FLOAT; #if PREC_FORCE==1 typedef float F_FLOAT; -//typedef float2 F_FLOAT2; -//typedef lmp_float3 F_FLOAT3; -//typedef float4 F_FLOAT4; #else typedef double F_FLOAT; -//typedef double2 F_FLOAT2; -//typedef lmp_double3 F_FLOAT3; -//typedef double4 F_FLOAT4; #endif #ifndef PREC_ENERGY @@ -342,12 +316,8 @@ typedef double F_FLOAT; #if PREC_ENERGY==1 typedef float E_FLOAT; -//typedef float2 E_FLOAT2; -//typedef float4 E_FLOAT4; #else typedef double E_FLOAT; -//typedef double2 E_FLOAT2; -//typedef double4 E_FLOAT4; #endif struct s_EV_FLOAT { @@ -500,12 +470,8 @@ typedef struct s_FEV_FLOAT FEV_FLOAT; #if PREC_POS==1 typedef float X_FLOAT; -//typedef float2 X_FLOAT2; -//typedef float4 X_FLOAT4; #else typedef double X_FLOAT; -//typedef double2 X_FLOAT2; -//typedef double4 X_FLOAT4; #endif #ifndef PREC_VELOCITIES @@ -514,22 +480,14 @@ typedef double X_FLOAT; #if PREC_VELOCITIES==1 typedef float V_FLOAT; -//typedef float2 V_FLOAT2; -//typedef float4 V_FLOAT4; #else typedef double V_FLOAT; -//typedef double2 V_FLOAT2; -//typedef double4 V_FLOAT4; #endif #if PREC_KSPACE==1 typedef float K_FLOAT; -//typedef float2 K_FLOAT2; -//typedef float4 K_FLOAT4; #else typedef double K_FLOAT; -//typedef double2 K_FLOAT2; -//typedef double4 K_FLOAT4; #endif typedef int T_INT; From b1781e51291950bf26a1b0301dfc3a8beb2b0008 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 2 Jul 2019 08:27:49 -0600 Subject: [PATCH 16/48] Small tweaks --- src/KOKKOS/fft3d_kokkos.cpp | 11 +++++++++-- src/KOKKOS/fft3d_kokkos.h | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 6c1ed616da..7d2bc506e6 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -594,8 +594,13 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl // and scaling normalization #if defined(FFT_FFTW3) - if (nthreads > 1) + + #if defined(FFT_FFTW_THREADS) + if (nthreads > 1) { + fftw_init_threads(); fftw_plan_with_nthreads(nthreads); + } + #endif plan->plan_fast_forward = FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, @@ -704,7 +709,9 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk delete plan; delete remapKK; -#ifdef FFT_KISSFFT +#if defined (FFT_FFTW_THREADS) + fftw_cleanup_threads(); +#elif defined (FFT_KISSFFT) delete kissfftKK; #endif } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index cf5e95b604..b878135855 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -25,6 +25,10 @@ #define FFT_FFTW3 #endif +#ifdef FFT_FFTW_THREADS +#define FFT_FFTW3 +#endif + #if defined(FFT_FFTW3) #include "fftw3.h" #if defined(FFT_SINGLE) From 2ce7da80d18753806cb01b879694e97f5afea940 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 2 Jul 2019 08:28:18 -0600 Subject: [PATCH 17/48] Add Kokkkos FFT build options --- doc/src/Build_settings.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/src/Build_settings.txt b/doc/src/Build_settings.txt index 287cd39ff6..578903774d 100644 --- a/doc/src/Build_settings.txt +++ b/doc/src/Build_settings.txt @@ -33,7 +33,7 @@ LAMMPS can use them if they are available on your system. [CMake variables]: --D FFT=value # FFTW3 or MKL or KISS, default is FFTW3 if found, else KISS +-D FFT=value # FFTW3 or MKL or KISS or CUFFT, default is FFTW3 if found, else KISS -D FFT_SINGLE=value # yes or no (default), no = double precision -D FFT_PACK=value # array (default) or pointer or memcpy :pre @@ -51,7 +51,7 @@ the FFT library, you can set these variables: [Makefile.machine settings]: -FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS +FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, -DFFT_FFTW_THREADS, -DFFT_CUFFT, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY :pre @@ -61,8 +61,10 @@ FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib FFT_LIB = -lfftw3 # FFTW3 double precision FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision +FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision, OpenMP threaded using the KOKKOS package FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler -FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier :pre +FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compiler +FFT_LIB = -lcufft # cuFFT (for GPUs using the KOKKOS package) :pre As with CMake, you do not need to set paths in FFT_INC or FFT_PATH, if make can find the FFT header and library files. You must specify From 94cb1a8a6a80887dd3bd9bb2ffed0c799ef094de Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 2 Jul 2019 11:40:23 -0600 Subject: [PATCH 18/48] Add CUFFT to KSPACE.cmake --- cmake/Modules/Packages/KSPACE.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 6938a93a36..dfa141de58 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -11,7 +11,7 @@ if(PKG_KSPACE) else() set(FFT "KISS" CACHE STRING "FFT library for KSPACE package") endif() - set(FFT_VALUES KISS ${FFTW} MKL) + set(FFT_VALUES KISS ${FFTW} MKL CUFFT) set_property(CACHE FFT PROPERTY STRINGS ${FFT_VALUES}) validate_option(FFT FFT_VALUES) string(TOUPPER ${FFT} FFT) From cf11804cebd8ec7a55fda07d1e8866e4c7a59f9e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 14:44:24 -0400 Subject: [PATCH 19/48] refactor FFT handling in CMake module for KSPACE. This also improves the FFT info output in the PPPM styles # Conflicts: # cmake/Modules/Packages/KSPACE.cmake --- cmake/CMakeLists.txt | 7 ++++++- cmake/Modules/Packages/KSPACE.cmake | 28 ++++++++++++++++------------ src/KOKKOS/pppm_kokkos.cpp | 10 ++-------- src/KSPACE/pppm.cpp | 10 ++-------- src/KSPACE/pppm.h | 11 +++++++++++ src/KSPACE/pppm_disp.cpp | 10 ++-------- src/KSPACE/pppm_disp.h | 10 ++++++++++ 7 files changed, 49 insertions(+), 37 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8f7ac9a6e7..f589e796db 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -655,5 +655,10 @@ if(PKG_KOKKOS) message(STATUS "Kokkos Arch: ${KOKKOS_ARCH}") endif() if(PKG_KSPACE) - message(STATUS "Using ${FFT} as FFT") + message(STATUS "Using ${FFT} as primary FFT library") + if(FFT_SINGLE) + message(STATUS "Using single precision FFTs") + else() + message(STATUS "Using double precision FFTs") + endif() endif() diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index dfa141de58..63c42baf2d 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -1,5 +1,5 @@ if(PKG_KSPACE) - option(FFT_SINGLE "Use single precision FFT instead of double" OFF) + option(FFT_SINGLE "Use single precision FFTs instead of double precision FFTs" OFF) set(FFTW "FFTW3") if(FFT_SINGLE) set(FFTW "FFTW3F") @@ -7,26 +7,30 @@ if(PKG_KSPACE) endif() find_package(${FFTW} QUIET) if(${FFTW}_FOUND) - set(FFT "${FFTW}" CACHE STRING "FFT library for KSPACE package") + set(FFT "FFTW3" CACHE STRING "FFT library for KSPACE package") else() set(FFT "KISS" CACHE STRING "FFT library for KSPACE package") endif() - set(FFT_VALUES KISS ${FFTW} MKL CUFFT) + set(FFT_VALUES KISS FFTW3 MKL) set_property(CACHE FFT PROPERTY STRINGS ${FFT_VALUES}) validate_option(FFT FFT_VALUES) string(TOUPPER ${FFT} FFT) - if(NOT FFT STREQUAL "KISS") - find_package(${FFT} REQUIRED) - if(NOT FFT STREQUAL "FFTW3F") - add_definitions(-DFFT_FFTW) - else() - add_definitions(-DFFT_${FFT}) - endif() - include_directories(${${FFT}_INCLUDE_DIRS}) - list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES}) + + if(FFT STREQUAL "FFTW3") + find_package(${FFTW} REQUIRED) + add_definitions(-DFFT_FFTW3) + include_directories(${${FFTW}_INCLUDE_DIRS}) + list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES}) + elseif(FFT STREQUAL "MKL") + find_package(MKL REQUIRED) + add_definitions(-DFFT_MKL) + include_directories(${MKL_INCLUDE_DIRS}) + list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES}) else() + # last option is KISSFFT add_definitions(-DFFT_KISS) endif() + set(FFT_PACK "array" CACHE STRING "Optimization for FFT") set(FFT_PACK_VALUES array pointer memcpy) set_property(CACHE FFT_PACK PROPERTY STRINGS ${FFT_PACK_VALUES}) diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index f25ea2313f..0d45f866b4 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -306,12 +306,6 @@ void PPPMKokkos::init() if (me == 0) { -#ifdef FFT_SINGLE - const char fft_prec[] = "single"; -#else - const char fft_prec[] = "double"; -#endif - if (screen) { fprintf(screen," G vector (1/distance) = %g\n",g_ewald); fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); @@ -320,7 +314,7 @@ void PPPMKokkos::init() estimated_accuracy); fprintf(screen," estimated relative force accuracy = %g\n", estimated_accuracy/two_charge_force); - fprintf(screen," using %s precision FFTs\n",fft_prec); + fprintf(screen," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(screen," 3d grid and FFT values/proc = %d %d\n", ngrid_max,nfft_both_max); } @@ -332,7 +326,7 @@ void PPPMKokkos::init() estimated_accuracy); fprintf(logfile," estimated relative force accuracy = %g\n", estimated_accuracy/two_charge_force); - fprintf(logfile," using %s precision FFTs\n",fft_prec); + fprintf(logfile," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(logfile," 3d grid and FFT values/proc = %d %d\n", ngrid_max,nfft_both_max); } diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp index 53c18804a5..1abb19e947 100644 --- a/src/KSPACE/pppm.cpp +++ b/src/KSPACE/pppm.cpp @@ -350,12 +350,6 @@ void PPPM::init() if (me == 0) { -#ifdef FFT_SINGLE - const char fft_prec[] = "single"; -#else - const char fft_prec[] = "double"; -#endif - if (screen) { fprintf(screen," G vector (1/distance) = %g\n",g_ewald); fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); @@ -364,7 +358,7 @@ void PPPM::init() estimated_accuracy); fprintf(screen," estimated relative force accuracy = %g\n", estimated_accuracy/two_charge_force); - fprintf(screen," using %s precision FFTs\n",fft_prec); + fprintf(screen," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(screen," 3d grid and FFT values/proc = %d %d\n", ngrid_max,nfft_both_max); } @@ -376,7 +370,7 @@ void PPPM::init() estimated_accuracy); fprintf(logfile," estimated relative force accuracy = %g\n", estimated_accuracy/two_charge_force); - fprintf(logfile," using %s precision FFTs\n",fft_prec); + fprintf(logfile," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(logfile," 3d grid and FFT values/proc = %d %d\n", ngrid_max,nfft_both_max); } diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h index 1ce1a0d666..6426b9c22d 100644 --- a/src/KSPACE/pppm.h +++ b/src/KSPACE/pppm.h @@ -23,11 +23,22 @@ KSpaceStyle(pppm,PPPM) #include "lmptype.h" #include +#if defined(FFT_FFTW3) +#define LMP_FFT_LIB "FFTW3" +#elif defined(FFT_MKL) +#define LMP_FFT_LIB "MKL FFT" +#else +#define LMP_FFT_LIB "KISS FFT" +#endif + #ifdef FFT_SINGLE typedef float FFT_SCALAR; +#define LMP_FFT_PREC "single" #define MPI_FFT_SCALAR MPI_FLOAT #else + typedef double FFT_SCALAR; +#define LMP_FFT_PREC "double" #define MPI_FFT_SCALAR MPI_DOUBLE #endif diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp index 45dce0895b..ba0e85bbba 100644 --- a/src/KSPACE/pppm_disp.cpp +++ b/src/KSPACE/pppm_disp.cpp @@ -474,12 +474,6 @@ void PPPMDisp::init() MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); if (me == 0) { - #ifdef FFT_SINGLE - const char fft_prec[] = "single"; - #else - const char fft_prec[] = "double"; - #endif - if (screen) { fprintf(screen," Coulomb G vector (1/distance)= %g\n",g_ewald); fprintf(screen," Coulomb grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); @@ -488,7 +482,7 @@ void PPPMDisp::init() acc); fprintf(screen," Coulomb estimated relative force accuracy = %g\n", acc/two_charge_force); - fprintf(screen," using %s precision FFTs\n",fft_prec); + fprintf(screen," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(screen," 3d grid and FFT values/proc = %d %d\n", ngrid_max, nfft_both_max); } @@ -501,7 +495,7 @@ void PPPMDisp::init() acc); fprintf(logfile," Coulomb estimated relative force accuracy = %g\n", acc/two_charge_force); - fprintf(logfile," using %s precision FFTs\n",fft_prec); + fprintf(logfile," using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"); fprintf(logfile," 3d grid and FFT values/proc = %d %d\n", ngrid_max, nfft_both_max); } diff --git a/src/KSPACE/pppm_disp.h b/src/KSPACE/pppm_disp.h index ccbeb60f3c..8674669274 100644 --- a/src/KSPACE/pppm_disp.h +++ b/src/KSPACE/pppm_disp.h @@ -23,11 +23,21 @@ KSpaceStyle(pppm/disp,PPPMDisp) #include "lmptype.h" #include +#if defined(FFT_FFTW3) +#define LMP_FFT_LIB "FFTW3" +#elif defined(FFT_MKL) +#define LMP_FFT_LIB "MKL FFT" +#else +#define LMP_FFT_LIB "KISS FFT" +#endif + #ifdef FFT_SINGLE typedef float FFT_SCALAR; +#define LMP_FFT_PREC "single" #define MPI_FFT_SCALAR MPI_FLOAT #else typedef double FFT_SCALAR; +#define LMP_FFT_PREC "double" #define MPI_FFT_SCALAR MPI_DOUBLE #endif From bf06dca11caf0859ac935e6c7979934be3b24d44 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 16:46:37 -0400 Subject: [PATCH 20/48] make support for threaded FFTW3 consistent and incorporate it into CMake This now also includes threaded FFTW3 for regular FFTs, not only KOKKOS. --- cmake/CMakeLists.txt | 5 +++++ cmake/Modules/FindFFTW3.cmake | 15 +++++++++------ cmake/Modules/FindFFTW3F.cmake | 11 ++++++----- cmake/Modules/Packages/KOKKOS.cmake | 4 +++- cmake/Modules/Packages/KSPACE.cmake | 22 ++++++++++++++++++++++ doc/src/Build_settings.txt | 11 ++++++++--- src/KOKKOS/fft3d_kokkos.h | 4 +++- src/KSPACE/fft3d.cpp | 23 ++++++++++++++++++++++- 8 files changed, 78 insertions(+), 17 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index f589e796db..bb5f527ed1 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -661,4 +661,9 @@ if(PKG_KSPACE) else() message(STATUS "Using double precision FFTs") endif() + if(FFT_THREADS) + message(STATUS "Using threaded FFTs") + else() + message(STATUS "Using non-threaded FFTs") + endif() endif() diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake index 552bcc4257..221e4beaad 100644 --- a/cmake/Modules/FindFFTW3.cmake +++ b/cmake/Modules/FindFFTW3.cmake @@ -1,20 +1,23 @@ # - Find fftw3 # Find the native FFTW3 headers and libraries. # -# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. -# FFTW3_LIBRARIES - List of libraries when using fftw3. -# FFTW3_FOUND - True if fftw3 found. +# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. +# FFTW3_LIBRARIES - List of libraries when using fftw3. +# FFTW3_OMP_LIBRARIES - List of libraries when using fftw3. +# FFTW3_FOUND - True if fftw3 found. +# FFTW3_OMP_FOUND - True if fftw3_omp found. # find_package(PkgConfig) pkg_check_modules(PC_FFTW3 fftw3) find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS}) - find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS}) +find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS}) -set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) +set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) +set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE @@ -22,4 +25,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR) -mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY ) +mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY) diff --git a/cmake/Modules/FindFFTW3F.cmake b/cmake/Modules/FindFFTW3F.cmake index 92d1e85e79..c67aa5faf1 100644 --- a/cmake/Modules/FindFFTW3F.cmake +++ b/cmake/Modules/FindFFTW3F.cmake @@ -1,8 +1,8 @@ -# - Find fftw3f -# Find the native FFTW3F headers and libraries. +# Find the native single precision FFTW3 headers and libraries. # # FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc. # FFTW3F_LIBRARIES - List of libraries when using fftw3f. +# FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3. # FFTW3F_FOUND - True if fftw3f found. # @@ -10,11 +10,12 @@ find_package(PkgConfig) pkg_check_modules(PC_FFTW3F fftw3f) find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS}) - find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS}) +find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS}) -set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR}) +set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) +set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE @@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR) -mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY ) +mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 2f7d401802..cd4dd21cd7 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -27,7 +27,9 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp) if(PKG_KSPACE) - list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp) + list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) endif() set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 63c42baf2d..8451eef91c 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -21,14 +21,36 @@ if(PKG_KSPACE) add_definitions(-DFFT_FFTW3) include_directories(${${FFTW}_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES}) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + option(FFT_THREADS "Use threaded FFT library" ON) + else() + option(FFT_THREADS "Use threaded FFT library" OFF) + endif() + + if(FFT_THREADS) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + add_definitions(-DFFT_FFTW_THREADS) + list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES}) + else() + message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS") + endif() + endif() elseif(FFT STREQUAL "MKL") find_package(MKL REQUIRED) + option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_MKL) include_directories(${MKL_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES}) + if (FFT_THREADS) + message(FATAL_ERROR "FFT_THREADS not supported with FFT from MKL") + endif() else() # last option is KISSFFT + option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_KISS) + if (FFT_THREADS) + message(FATAL_ERROR "FFT_THREADS not supported with KISSFFT") + endif() endif() set(FFT_PACK "array" CACHE STRING "Optimization for FFT") diff --git a/doc/src/Build_settings.txt b/doc/src/Build_settings.txt index 578903774d..48a67a189b 100644 --- a/doc/src/Build_settings.txt +++ b/doc/src/Build_settings.txt @@ -33,8 +33,9 @@ LAMMPS can use them if they are available on your system. [CMake variables]: --D FFT=value # FFTW3 or MKL or KISS or CUFFT, default is FFTW3 if found, else KISS +-D FFT=value # FFTW3 or MKL or KISS, default is FFTW3 if found, else KISS -D FFT_SINGLE=value # yes or no (default), no = double precision +-D FFT_THREADS=value # yes or no, default is yes if FFT=FFTW3 and OpenMP FFT lib found, else no -D FFT_PACK=value # array (default) or pointer or memcpy :pre NOTE: The values for the FFT variable must be in upper-case. This is @@ -51,9 +52,10 @@ the FFT library, you can set these variables: [Makefile.machine settings]: -FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, -DFFT_FFTW_THREADS, -DFFT_CUFFT, or -DFFT_KISS +FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision +FFT_INC = -DFFT_FFTW_THREADS # use threaded FFTW3. requires -DFFT_FFTW3 (or -DFFT_FFTW) FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY :pre # default is FFT_PACK_ARRAY if not specified @@ -61,7 +63,7 @@ FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib FFT_LIB = -lfftw3 # FFTW3 double precision FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision -FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision, OpenMP threaded using the KOKKOS package +FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision, OpenMP threaded FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compiler FFT_LIB = -lcufft # cuFFT (for GPUs using the KOKKOS package) :pre @@ -119,6 +121,9 @@ library libfftw3f.a make clean ./configure --enable-single; make; make install :pre +When using -DFFT_FFTW_THREADS you need to also have the FFTW3 +OpenMP thread interface libraries available. + Performing 3d FFTs requires communication to transpose the 3d FFT grid. The data packing/unpacking for this can be done in one of 3 modes (ARRAY, POINTER, MEMCPY) as set by the FFT_PACK syntax above. diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index b878135855..951d4adaba 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -26,7 +26,9 @@ #endif #ifdef FFT_FFTW_THREADS -#define FFT_FFTW3 +#if !defined(FFT_FFTW3) +#error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" +#endif #endif #if defined(FFT_FFTW3) diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 7d3c8c83f2..a0abd50c01 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -25,6 +25,9 @@ #include #include "fft3d.h" #include "remap.h" +#if defined(_OPENMP) +#include +#endif #ifdef FFT_KISS /* include kissfft implementation */ @@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan( int scaled, int permute, int *nbuf, int usecollective) { struct fft_plan_3d *plan; - int me,nprocs; + int me,nprocs,nthreads; int flag,remapflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; @@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan( MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); +#if defined(_OPENMP) + // query OpenMP info. + // should have been initialized systemwide in Comm class constructor + nthreads = omp_get_max_threads(); +#else + nthreads = 1; +#endif + // compute division of procs in 2 dimensions not on-processor bifactor(nprocs,&np1,&np2); @@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan( */ #elif defined(FFT_FFTW3) +#if defined(FFT_FFTW_THREADS) + if (nthreads > 1) { + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); + } +#endif + plan->plan_fast_forward = FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, @@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan) FFTW_API(destroy_plan)(plan->plan_mid_backward); FFTW_API(destroy_plan)(plan->plan_fast_forward); FFTW_API(destroy_plan)(plan->plan_fast_backward); +#if defined(FFT_FFTW_THREADS) + FFTW_API(cleanup_threads)(); +#endif #else if (plan->cfg_slow_forward != plan->cfg_fast_forward && plan->cfg_slow_forward != plan->cfg_mid_forward) { From 147999d6bd13c79aecf384c8a7a1f63e55c92735 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 17:00:06 -0400 Subject: [PATCH 21/48] fix bug that would break single precision FFTW support with threads using KOKKOS --- src/KOKKOS/fft3d_kokkos.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 7d2bc506e6..27234c3c04 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -597,8 +597,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl #if defined(FFT_FFTW_THREADS) if (nthreads > 1) { - fftw_init_threads(); - fftw_plan_with_nthreads(nthreads); + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); } #endif @@ -710,7 +710,7 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk delete remapKK; #if defined (FFT_FFTW_THREADS) - fftw_cleanup_threads(); + FFTW_API(cleanup_threads)(); #elif defined (FFT_KISSFFT) delete kissfftKK; #endif From 73f3019ef3ad1dffab2f9e1f91024d6a1e9975fd Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 17:27:10 -0400 Subject: [PATCH 22/48] update comments --- cmake/Modules/FindFFTW3.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake index 221e4beaad..63752f85df 100644 --- a/cmake/Modules/FindFFTW3.cmake +++ b/cmake/Modules/FindFFTW3.cmake @@ -1,11 +1,10 @@ # - Find fftw3 -# Find the native FFTW3 headers and libraries. +# Find the native double precision FFTW3 headers and libraries. # # FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. # FFTW3_LIBRARIES - List of libraries when using fftw3. # FFTW3_OMP_LIBRARIES - List of libraries when using fftw3. # FFTW3_FOUND - True if fftw3 found. -# FFTW3_OMP_FOUND - True if fftw3_omp found. # find_package(PkgConfig) From 2ec743abb7bfb8251acce0ce8ed2567f816bcc7e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 29 Jul 2019 14:02:38 -0600 Subject: [PATCH 23/48] Increase default stack size on GPUs for Kokkos KISS FFT --- src/KOKKOS/fft3d_kokkos.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 7d2bc506e6..2eed975591 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -52,6 +52,17 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int #elif defined(FFT_CUFFT) if (ngpus > 0 && execution_space == Host) lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs"); +#elif defined(FFT_KISSFFT) + // The compiler can't statically determine the stack size needed for + // recursive function calls in KISS FFT and the default per-thread + // stack size on GPUs needs to be increased to prevent stack overflows + // for reasonably sized FFTs + #if defined (KOKKOS_ENABLE_CUDA) + size_t stack_size; + cudaDeviceGetLimit(&stack_size,cudaLimitStackSize); + if (stack_size < 2048) + cudaDeviceSetLimit(cudaLimitStackSize,2048); + #endif #endif plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, From f96609a04674ba31599adc0d58e49ea38cc59206 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 29 Jul 2019 14:56:21 -0600 Subject: [PATCH 24/48] Add MKL support --- src/KOKKOS/fft3d_kokkos.cpp | 232 +++++++++++++++++++++--------------- src/KOKKOS/fft3d_kokkos.h | 17 ++- 2 files changed, 154 insertions(+), 95 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 2eed975591..55db0dd6be 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -46,7 +46,10 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int ngpus = lmp->kokkos->ngpus; ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; -#if defined(FFT_FFTW3) +#if defined(FFT_MKL) + if (ngpus > 0 && execution_space == Device) + lmp->error->all(FLERR,"Cannot use the MKL library with Kokkos CUDA on GPUs"); +#elif defined(FFT_FFTW3) if (ngpus > 0 && execution_space == Device) lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs"); #elif defined(FFT_CUFFT) @@ -131,27 +134,31 @@ void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int ns plan plan returned by previous call to fft_3d_create_plan ------------------------------------------------------------------------- */ -#ifdef FFT_CUFFT template -struct cufft_norm_functor { +struct norm_functor { public: typedef DeviceType device_type; typedef ArrayTypes AT; - typename AT::t_FFT_SCALAR_1d_um d_out; + typename AT::t_FFT_DATA_1d_um d_out; int norm; - cufft_norm_functor(typename AT::t_FFT_SCALAR_1d &d_out_, int norm_): - d_out(d_out_) - { - norm = norm_; - } + norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): + d_out(d_out_),norm(norm_) {} KOKKOS_INLINE_FUNCTION void operator() (const int &i) const { - d_out(i) *= norm; +#if defined(FFT_FFTW3) || defined(FFT_CUFFT) + FFT_SCALAR* out_ptr = (FFT_SCALAR *)d_out(i); + *(out_ptr++) *= norm; + *(out_ptr++) *= norm; +#elif defined(FFT_MKL) + d_out(i) *= norm; +#else + d_out(i,0) *= norm; + d_out(i,1) *= norm; +#endif } }; -#endif #ifdef FFT_KISSFFT template @@ -179,27 +186,6 @@ public: KissFFTKokkos::kiss_fft_kokkos(st,d_data,d_tmp,offset); } }; - -template -struct kiss_norm_functor { -public: - typedef DeviceType device_type; - typedef ArrayTypes AT; - typename AT::t_FFT_DATA_1d_um d_out; - int norm; - - kiss_norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): - d_out(d_out_) - { - norm = norm_; - } - - KOKKOS_INLINE_FUNCTION - void operator() (const int &i) const { - d_out(i,0) *= norm; - d_out(i,1) *= norm; - } -}; #endif template @@ -231,13 +217,18 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ total = plan->total1; length = plan->length1; - #if defined(FFT_FFTW3) + #if defined(FFT_MKL) if (flag == -1) - FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data()); else - FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); @@ -272,13 +263,18 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ total = plan->total2; length = plan->length2; - #if defined(FFT_FFTW3) + #if defined(FFT_MKL) if (flag == -1) - FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data()); else - FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExecZ2Z(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); @@ -309,13 +305,18 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ total = plan->total3; length = plan->length3; - #if defined(FFT_FFTW3) + #if defined(FFT_MKL) if (flag == -1) - FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data()); else - FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExecZ2Z(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); @@ -341,17 +342,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ // scaling if required if (flag == 1 && plan->scaled) { - int norm = plan->norm; - FFT_SCALAR num = plan->normnum; - #if defined(FFT_CUFFT) - typename AT::t_FFT_SCALAR_1d d_norm_scalar = - typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); - cufft_norm_functor f(d_norm_scalar,norm); + FFT_SCALAR norm = plan->norm; + int num = plan->normnum; + + norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); - #elif defined(FFT_KISSFFT) - kiss_norm_functor f(d_out,norm); - Kokkos::parallel_for(num,f); - #endif } } @@ -604,48 +599,83 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl // system specific pre-computation of 1d FFT coeffs // and scaling normalization -#if defined(FFT_FFTW3) +#if defined(FFT_MKL) + DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nfast); + DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total1/nfast); + DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast); + DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast); + DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_fast); - #if defined(FFT_FFTW_THREADS) - if (nthreads > 1) { - fftw_init_threads(); - fftw_plan_with_nthreads(nthreads); + DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nmid); + DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total2/nmid); + DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid); + DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid); + DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_mid); + + DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nslow); + DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total3/nslow); + DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow); + DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow); + DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_slow); + + if (scaled == 0) + plan->scaled = 0; + else { + plan->scaled = 1; + plan->norm = 1.0/(nfast*nmid*nslow); + plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); } - #endif + +#elif defined(FFT_FFTW3) + if (nthreads > 1) + fftw_plan_with_nthreads(nthreads); plan->plan_fast_forward = - FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, + fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_fast_backward = - FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, + fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_mid_forward = - FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, + fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_mid_backward = - FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, + fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_slow_forward = - FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, + fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_slow_backward = - FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, + fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_BACKWARD,FFTW_ESTIMATE); @@ -653,17 +683,17 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl cufftPlanMany(&(plan->plan_fast), 1, &nfast, &nfast,1,plan->length1, &nfast,1,plan->length1, - CUFFT_TYPE,plan->total1/plan->length1); + CUFFT_Z2Z,plan->total1/plan->length1); cufftPlanMany(&(plan->plan_mid), 1, &nmid, &nmid,1,plan->length2, &nmid,1,plan->length2, - CUFFT_TYPE,plan->total2/plan->length2); + CUFFT_Z2Z,plan->total2/plan->length2); cufftPlanMany(&(plan->plan_slow), 1, &nslow, &nslow,1,plan->length3, &nslow,1,plan->length3, - CUFFT_TYPE,plan->total3/plan->length3); + CUFFT_Z2Z,plan->total3/plan->length3); #else kissfftKK = new KissFFTKokkos(); @@ -717,14 +747,23 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan); if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan); - delete plan; - delete remapKK; - -#if defined (FFT_FFTW_THREADS) - fftw_cleanup_threads(); +#if defined(FFT_MKL) + DftiFreeDescriptor(&(plan->handle_fast)); + DftiFreeDescriptor(&(plan->handle_mid)); + DftiFreeDescriptor(&(plan->handle_slow)); +#elif defined(FFT_FFTW3) + FFTW_API(destroy_plan)(plan->plan_slow_forward); + FFTW_API(destroy_plan)(plan->plan_slow_backward); + FFTW_API(destroy_plan)(plan->plan_mid_forward); + FFTW_API(destroy_plan)(plan->plan_mid_backward); + FFTW_API(destroy_plan)(plan->plan_fast_forward); + FFTW_API(destroy_plan)(plan->plan_fast_backward); #elif defined (FFT_KISSFFT) delete kissfftKK; #endif + + delete plan; + delete remapKK; } /* ---------------------------------------------------------------------- @@ -777,7 +816,10 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d // fftw3 and Dfti in MKL encode the number of transforms // into the plan, so we cannot operate on a smaller data set - +#if defined(FFT_MKL) || defined(FFT_FFTW3) + if ((total1 > nsize) || (total2 > nsize) || (total3 > nsize)) + return; +#endif if (total1 > nsize) total1 = (nsize/length1) * length1; if (total2 > nsize) total2 = (nsize/length2) * length2; if (total3 > nsize) total3 = (nsize/length3) * length3; @@ -785,20 +827,30 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d // perform 1d FFTs in each of 3 dimensions // data is just an array of 0.0 -#if defined(FFT_FFTW3) +#if defined(FFT_MKL) if (flag == -1) { - FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + DftiComputeForward(plan->handle_fast,data); + DftiComputeForward(plan->handle_mid,data); + DftiComputeForward(plan->handle_slow,data); } else { - FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + DftiComputeBackward(plan->handle_fast,data); + DftiComputeBackward(plan->handle_mid,data); + DftiComputeBackward(plan->handle_slow,data); + } +#elif defined(FFT_FFTW3) + if (flag == -1) { + fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } else { + fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); } #elif defined(FFT_CUFFT) - cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); #else kiss_fft_functor f; typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); @@ -829,15 +881,9 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d if (flag == 1 && plan->scaled) { FFT_SCALAR norm = plan->norm; int num = MIN(plan->normnum,nsize); - #if defined(FFT_CUFFT) - typename AT::t_FFT_SCALAR_1d d_norm_scalar = - typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); - cufft_norm_functor f(d_norm_scalar,norm); + + norm_functor f(d_out,norm); Kokkos::parallel_for(num,f); - #elif defined(FFT_KISSFFT) - kiss_norm_functor f(d_data,norm); - Kokkos::parallel_for(num,f); - #endif } } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index b878135855..c68d78db56 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -29,7 +29,16 @@ #define FFT_FFTW3 #endif -#if defined(FFT_FFTW3) +#if defined(FFT_MKL) + #include "mkl_dfti.h" + #if defined(FFT_SINGLE) + typedef float _Complex FFT_DATA; + #define FFT_MKL_PREC DFTI_SINGLE + #else + typedef double _Complex FFT_DATA; + #define FFT_MKL_PREC DFTI_DOUBLE + #endif +#elif defined(FFT_FFTW3) #include "fftw3.h" #if defined(FFT_SINGLE) typedef fftwf_complex FFT_DATA; @@ -82,7 +91,11 @@ struct fft_plan_3d_kokkos { double norm; // normalization factor for rescaling // system specific 1d FFT info -#if defined(FFT_FFTW3) +#if defined(FFT_MKL) + DFTI_DESCRIPTOR *handle_fast; + DFTI_DESCRIPTOR *handle_mid; + DFTI_DESCRIPTOR *handle_slow; +#elif defined(FFT_FFTW3) FFTW_API(plan) plan_fast_forward; FFTW_API(plan) plan_fast_backward; FFTW_API(plan) plan_mid_forward; From 9a43a6824c194f8b21b4285b30ffa2983c280123 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 30 Jul 2019 09:25:24 -0600 Subject: [PATCH 25/48] Fix issues --- cmake/Modules/Packages/KSPACE.cmake | 2 +- src/KOKKOS/fft3d_kokkos.cpp | 67 ++++++++++++++++------------- 2 files changed, 38 insertions(+), 31 deletions(-) diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index adca8b91a9..737270d1c0 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -11,7 +11,7 @@ if(PKG_KSPACE) else() set(FFT "KISS" CACHE STRING "FFT library for KSPACE package") endif() - set(FFT_VALUES KISS FFTW MKL CUFFT) + set(FFT_VALUES KISS FFTW3 MKL CUFFT) set_property(CACHE FFT PROPERTY STRINGS ${FFT_VALUES}) validate_option(FFT FFT_VALUES) string(TOUPPER ${FFT} FFT) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 55db0dd6be..9b001c2356 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -224,11 +224,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data()); #elif defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); @@ -270,11 +270,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data()); #elif defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); @@ -312,11 +312,11 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data()); #elif defined(FFT_FFTW3) if (flag == -1) - fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); else - fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else if (flag == -1) f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); @@ -640,42 +640,44 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl } #elif defined(FFT_FFTW3) - if (nthreads > 1) - fftw_plan_with_nthreads(nthreads); + if (nthreads > 1) { + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); + } plan->plan_fast_forward = - fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_fast_backward = - fftw_plan_many_dft(1, &nfast,plan->total1/plan->length1, + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, NULL,&nfast,1,plan->length1, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_mid_forward = - fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_mid_backward = - fftw_plan_many_dft(1, &nmid,plan->total2/plan->length2, + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, NULL,&nmid,1,plan->length2, NULL,&nmid,1,plan->length2, FFTW_BACKWARD,FFTW_ESTIMATE); plan->plan_slow_forward = - fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_FORWARD,FFTW_ESTIMATE); plan->plan_slow_backward = - fftw_plan_many_dft(1, &nslow,plan->total3/plan->length3, + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, NULL,&nslow,1,plan->length3, NULL,&nslow,1,plan->length3, FFTW_BACKWARD,FFTW_ESTIMATE); @@ -683,17 +685,17 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl cufftPlanMany(&(plan->plan_fast), 1, &nfast, &nfast,1,plan->length1, &nfast,1,plan->length1, - CUFFT_Z2Z,plan->total1/plan->length1); + CUFFT_TYPE,plan->total1/plan->length1); cufftPlanMany(&(plan->plan_mid), 1, &nmid, &nmid,1,plan->length2, &nmid,1,plan->length2, - CUFFT_Z2Z,plan->total2/plan->length2); + CUFFT_TYPE,plan->total2/plan->length2); cufftPlanMany(&(plan->plan_slow), 1, &nslow, &nslow,1,plan->length3, &nslow,1,plan->length3, - CUFFT_Z2Z,plan->total3/plan->length3); + CUFFT_TYPE,plan->total3/plan->length3); #else kissfftKK = new KissFFTKokkos(); @@ -758,6 +760,11 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk FFTW_API(destroy_plan)(plan->plan_mid_backward); FFTW_API(destroy_plan)(plan->plan_fast_forward); FFTW_API(destroy_plan)(plan->plan_fast_backward); + + + + + FFTW_API(cleanup_threads)(); #elif defined (FFT_KISSFFT) delete kissfftKK; #endif @@ -839,18 +846,18 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d } #elif defined(FFT_FFTW3) if (flag == -1) { - fftw_execute_dft(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); } else { - fftw_execute_dft(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); - fftw_execute_dft(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); } #elif defined(FFT_CUFFT) - cufftExecZ2Z(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); - cufftExecZ2Z(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); #else kiss_fft_functor f; typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); @@ -882,7 +889,7 @@ void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d FFT_SCALAR norm = plan->norm; int num = MIN(plan->normnum,nsize); - norm_functor f(d_out,norm); + norm_functor f(d_data,norm); Kokkos::parallel_for(num,f); } } From 8b134cb0c9a8c2e0a5327162b989a3244eac6433 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 14:44:24 -0400 Subject: [PATCH 26/48] refactor FFT handling in CMake module for KSPACE. This also improves the FFT info output in the PPPM styles # Conflicts: # cmake/Modules/Packages/KSPACE.cmake --- cmake/Modules/Packages/KSPACE.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 737270d1c0..63c42baf2d 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -11,7 +11,7 @@ if(PKG_KSPACE) else() set(FFT "KISS" CACHE STRING "FFT library for KSPACE package") endif() - set(FFT_VALUES KISS FFTW3 MKL CUFFT) + set(FFT_VALUES KISS FFTW3 MKL) set_property(CACHE FFT PROPERTY STRINGS ${FFT_VALUES}) validate_option(FFT FFT_VALUES) string(TOUPPER ${FFT} FFT) From 6a65deeb65ba6abe8237c0af0af3aa57a8e75ff4 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 16:46:37 -0400 Subject: [PATCH 27/48] make support for threaded FFTW3 consistent and incorporate it into CMake This now also includes threaded FFTW3 for regular FFTs, not only KOKKOS. --- cmake/CMakeLists.txt | 5 +++++ cmake/Modules/FindFFTW3.cmake | 15 +++++++++------ cmake/Modules/FindFFTW3F.cmake | 11 ++++++----- cmake/Modules/Packages/KOKKOS.cmake | 4 +++- cmake/Modules/Packages/KSPACE.cmake | 22 ++++++++++++++++++++++ doc/src/Build_settings.txt | 11 ++++++++--- src/KOKKOS/fft3d_kokkos.h | 4 +++- src/KSPACE/fft3d.cpp | 23 ++++++++++++++++++++++- 8 files changed, 78 insertions(+), 17 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 0f99cc6274..0687b54fcb 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -679,4 +679,9 @@ if(PKG_KSPACE) else() message(STATUS "Using double precision FFTs") endif() + if(FFT_THREADS) + message(STATUS "Using threaded FFTs") + else() + message(STATUS "Using non-threaded FFTs") + endif() endif() diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake index 552bcc4257..221e4beaad 100644 --- a/cmake/Modules/FindFFTW3.cmake +++ b/cmake/Modules/FindFFTW3.cmake @@ -1,20 +1,23 @@ # - Find fftw3 # Find the native FFTW3 headers and libraries. # -# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. -# FFTW3_LIBRARIES - List of libraries when using fftw3. -# FFTW3_FOUND - True if fftw3 found. +# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. +# FFTW3_LIBRARIES - List of libraries when using fftw3. +# FFTW3_OMP_LIBRARIES - List of libraries when using fftw3. +# FFTW3_FOUND - True if fftw3 found. +# FFTW3_OMP_FOUND - True if fftw3_omp found. # find_package(PkgConfig) pkg_check_modules(PC_FFTW3 fftw3) find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS}) - find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS}) +find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS}) -set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) +set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) +set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE @@ -22,4 +25,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR) -mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY ) +mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY) diff --git a/cmake/Modules/FindFFTW3F.cmake b/cmake/Modules/FindFFTW3F.cmake index 92d1e85e79..c67aa5faf1 100644 --- a/cmake/Modules/FindFFTW3F.cmake +++ b/cmake/Modules/FindFFTW3F.cmake @@ -1,8 +1,8 @@ -# - Find fftw3f -# Find the native FFTW3F headers and libraries. +# Find the native single precision FFTW3 headers and libraries. # # FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc. # FFTW3F_LIBRARIES - List of libraries when using fftw3f. +# FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3. # FFTW3F_FOUND - True if fftw3f found. # @@ -10,11 +10,12 @@ find_package(PkgConfig) pkg_check_modules(PC_FFTW3F fftw3f) find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS}) - find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS}) +find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS}) -set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR}) +set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) +set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE @@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR) -mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY ) +mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 2f7d401802..cd4dd21cd7 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -27,7 +27,9 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp) if(PKG_KSPACE) - list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp) + list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) endif() set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 63c42baf2d..8451eef91c 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -21,14 +21,36 @@ if(PKG_KSPACE) add_definitions(-DFFT_FFTW3) include_directories(${${FFTW}_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES}) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + option(FFT_THREADS "Use threaded FFT library" ON) + else() + option(FFT_THREADS "Use threaded FFT library" OFF) + endif() + + if(FFT_THREADS) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + add_definitions(-DFFT_FFTW_THREADS) + list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES}) + else() + message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS") + endif() + endif() elseif(FFT STREQUAL "MKL") find_package(MKL REQUIRED) + option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_MKL) include_directories(${MKL_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES}) + if (FFT_THREADS) + message(FATAL_ERROR "FFT_THREADS not supported with FFT from MKL") + endif() else() # last option is KISSFFT + option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_KISS) + if (FFT_THREADS) + message(FATAL_ERROR "FFT_THREADS not supported with KISSFFT") + endif() endif() set(FFT_PACK "array" CACHE STRING "Optimization for FFT") diff --git a/doc/src/Build_settings.txt b/doc/src/Build_settings.txt index 578903774d..48a67a189b 100644 --- a/doc/src/Build_settings.txt +++ b/doc/src/Build_settings.txt @@ -33,8 +33,9 @@ LAMMPS can use them if they are available on your system. [CMake variables]: --D FFT=value # FFTW3 or MKL or KISS or CUFFT, default is FFTW3 if found, else KISS +-D FFT=value # FFTW3 or MKL or KISS, default is FFTW3 if found, else KISS -D FFT_SINGLE=value # yes or no (default), no = double precision +-D FFT_THREADS=value # yes or no, default is yes if FFT=FFTW3 and OpenMP FFT lib found, else no -D FFT_PACK=value # array (default) or pointer or memcpy :pre NOTE: The values for the FFT variable must be in upper-case. This is @@ -51,9 +52,10 @@ the FFT library, you can set these variables: [Makefile.machine settings]: -FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, -DFFT_FFTW_THREADS, -DFFT_CUFFT, or -DFFT_KISS +FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision +FFT_INC = -DFFT_FFTW_THREADS # use threaded FFTW3. requires -DFFT_FFTW3 (or -DFFT_FFTW) FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY :pre # default is FFT_PACK_ARRAY if not specified @@ -61,7 +63,7 @@ FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib FFT_LIB = -lfftw3 # FFTW3 double precision FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision -FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision, OpenMP threaded using the KOKKOS package +FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision, OpenMP threaded FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compiler FFT_LIB = -lcufft # cuFFT (for GPUs using the KOKKOS package) :pre @@ -119,6 +121,9 @@ library libfftw3f.a make clean ./configure --enable-single; make; make install :pre +When using -DFFT_FFTW_THREADS you need to also have the FFTW3 +OpenMP thread interface libraries available. + Performing 3d FFTs requires communication to transpose the 3d FFT grid. The data packing/unpacking for this can be done in one of 3 modes (ARRAY, POINTER, MEMCPY) as set by the FFT_PACK syntax above. diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index c68d78db56..8d12a3f952 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -26,7 +26,9 @@ #endif #ifdef FFT_FFTW_THREADS -#define FFT_FFTW3 +#if !defined(FFT_FFTW3) +#error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" +#endif #endif #if defined(FFT_MKL) diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index 7d3c8c83f2..a0abd50c01 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -25,6 +25,9 @@ #include #include "fft3d.h" #include "remap.h" +#if defined(_OPENMP) +#include +#endif #ifdef FFT_KISS /* include kissfft implementation */ @@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan( int scaled, int permute, int *nbuf, int usecollective) { struct fft_plan_3d *plan; - int me,nprocs; + int me,nprocs,nthreads; int flag,remapflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; @@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan( MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); +#if defined(_OPENMP) + // query OpenMP info. + // should have been initialized systemwide in Comm class constructor + nthreads = omp_get_max_threads(); +#else + nthreads = 1; +#endif + // compute division of procs in 2 dimensions not on-processor bifactor(nprocs,&np1,&np2); @@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan( */ #elif defined(FFT_FFTW3) +#if defined(FFT_FFTW_THREADS) + if (nthreads > 1) { + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); + } +#endif + plan->plan_fast_forward = FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, @@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan) FFTW_API(destroy_plan)(plan->plan_mid_backward); FFTW_API(destroy_plan)(plan->plan_fast_forward); FFTW_API(destroy_plan)(plan->plan_fast_backward); +#if defined(FFT_FFTW_THREADS) + FFTW_API(cleanup_threads)(); +#endif #else if (plan->cfg_slow_forward != plan->cfg_fast_forward && plan->cfg_slow_forward != plan->cfg_mid_forward) { From 3d4825e238d2555e19217a493d1d3763e2acfd43 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 17:00:06 -0400 Subject: [PATCH 28/48] fix bug that would break single precision FFTW support with threads using KOKKOS --- src/KOKKOS/fft3d_kokkos.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 9b001c2356..6a0203325b 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -761,10 +761,10 @@ void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokk FFTW_API(destroy_plan)(plan->plan_fast_forward); FFTW_API(destroy_plan)(plan->plan_fast_backward); - - - +#if defined (FFT_FFTW_THREADS) FFTW_API(cleanup_threads)(); +#endif + #elif defined (FFT_KISSFFT) delete kissfftKK; #endif From 5c25355b9b8398f32771108660b0bfc49c452d26 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 19 Jul 2019 17:27:10 -0400 Subject: [PATCH 29/48] update comments --- cmake/Modules/FindFFTW3.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake index 221e4beaad..63752f85df 100644 --- a/cmake/Modules/FindFFTW3.cmake +++ b/cmake/Modules/FindFFTW3.cmake @@ -1,11 +1,10 @@ # - Find fftw3 -# Find the native FFTW3 headers and libraries. +# Find the native double precision FFTW3 headers and libraries. # # FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. # FFTW3_LIBRARIES - List of libraries when using fftw3. # FFTW3_OMP_LIBRARIES - List of libraries when using fftw3. # FFTW3_FOUND - True if fftw3 found. -# FFTW3_OMP_FOUND - True if fftw3_omp found. # find_package(PkgConfig) From 11cb54221105a64ee9519e7d2c07e4d5310e0164 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 12 Sep 2019 07:33:22 -0400 Subject: [PATCH 30/48] add omitted cmake entries for recent KOKKOS package changes --- cmake/Modules/Packages/KOKKOS.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index bc3ef59e73..acf48f16b8 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -17,6 +17,8 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/min_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/min_linesearch_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp From 4c9d355fc7d5d278fafcfa8e7bed693c2e1fa9fb Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 16 Dec 2019 13:30:52 -0700 Subject: [PATCH 31/48] Fix compile error in fft3d_kokkos.cpp --- src/KOKKOS/fft3d_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 6a0203325b..795f246076 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -148,7 +148,7 @@ public: KOKKOS_INLINE_FUNCTION void operator() (const int &i) const { #if defined(FFT_FFTW3) || defined(FFT_CUFFT) - FFT_SCALAR* out_ptr = (FFT_SCALAR *)d_out(i); + FFT_SCALAR* out_ptr = (FFT_SCALAR *)(d_out.data()+i); *(out_ptr++) *= norm; *(out_ptr++) *= norm; #elif defined(FFT_MKL) From 3784df23f080c8ba19c43ca0aa6b7ff9302cafbf Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 12 Jan 2020 21:32:07 -0500 Subject: [PATCH 32/48] use FFT_FFTW_THREADS consistently, add CUFFT to CMake, minor improvements --- cmake/CMakeLists.txt | 5 ++++- cmake/Modules/Packages/KOKKOS.cmake | 4 ++++ cmake/Modules/Packages/KSPACE.cmake | 14 +++--------- doc/src/Build_settings.rst | 34 +++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index c7b3edaca3..26d1cc6af4 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -738,9 +738,12 @@ if(PKG_KSPACE) else() message(STATUS "Using double precision FFTs") endif() - if(FFT_THREADS) + if(FFT_FFTW_THREADS) message(STATUS "Using threaded FFTs") else() message(STATUS "Using non-threaded FFTs") endif() + if(PKG_KOKKOS AND FFT_CUFFT) + message(STATUS "Kokkos FFT: cuFFT") + endif() endif() diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 23ff6cd1ff..13a751f18d 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -42,6 +42,10 @@ if(PKG_KOKKOS) list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) + if(KOKKOS_ENABLE_CUDA) + add_definitions(-DFFT_CUFFT) + list(APPEND LAMMPS_LINK_LIBS -lcufft) + endif() endif() set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 8451eef91c..5786d7cb8a 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -22,12 +22,12 @@ if(PKG_KSPACE) include_directories(${${FFTW}_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES}) if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) - option(FFT_THREADS "Use threaded FFT library" ON) + option(FFT_FFTW_THREADS "Use threaded FFT library" ON) else() - option(FFT_THREADS "Use threaded FFT library" OFF) + option(FFT_FFTW_THREADS "Use threaded FFT library" OFF) endif() - if(FFT_THREADS) + if(FFT_FFTW_THREADS) if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) add_definitions(-DFFT_FFTW_THREADS) list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES}) @@ -37,20 +37,12 @@ if(PKG_KSPACE) endif() elseif(FFT STREQUAL "MKL") find_package(MKL REQUIRED) - option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_MKL) include_directories(${MKL_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES}) - if (FFT_THREADS) - message(FATAL_ERROR "FFT_THREADS not supported with FFT from MKL") - endif() else() # last option is KISSFFT - option(FFT_THREADS "Use threaded FFT library" OFF) add_definitions(-DFFT_KISS) - if (FFT_THREADS) - message(FATAL_ERROR "FFT_THREADS not supported with KISSFFT") - endif() endif() set(FFT_PACK "array" CACHE STRING "Optimization for FFT") diff --git a/doc/src/Build_settings.rst b/doc/src/Build_settings.rst index 07d427db49..b76e902033 100644 --- a/doc/src/Build_settings.rst +++ b/doc/src/Build_settings.rst @@ -49,13 +49,19 @@ through the CMAKE\_CXX\_FLAGS variable. Example for CentOS 7: -D CMAKE_CXX_FLAGS="-O3 -g -fopenmp -DNDEBUG -std=c++11" -**Makefile.machine setting**\ : +**Makefile.machine setting**\ to bypass the C++11 test and compile in C++98 mode: .. parsed-literal:: LMP_INC = -DLAMMPS_CXX98 +**Makefile.machine setting**\ to enable the C++11 with older (but not too old) GNU c++ (e.g. on CentOS 7): + + +.. parsed-literal:: + + CCFLAGS = -g -O3 -std=c++11 ---------- @@ -86,14 +92,19 @@ LAMMPS can use them if they are available on your system. an exception to the rule that all CMake variables can be specified with lower-case values. -Usually these settings are all that is needed. If CMake cannot find -the FFT library, you can set these variables: +Usually these settings are all that is needed. If FFTW3 is selected, +then CMake will try to detect, if threaded FFTW libraries are available +and enable them by default. This setting is independent of whether +OpenMP threads are enabled and a packages like KOKKOS or USER-OMP is +used. If CMake cannot detect the FFT library, you can set these variables +to assist: .. parsed-literal:: -D FFTW3_INCLUDE_DIRS=path # path to FFTW3 include files -D FFTW3_LIBRARIES=path # path to FFTW3 libraries + -D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries -D MKL_INCLUDE_DIRS=path # ditto for Intel MKL library -D MKL_LIBRARIES=path @@ -105,6 +116,7 @@ the FFT library, you can set these variables: FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision + FFT_INC = -DFFT_FFTW_THREADS # enable using threaded FFTW3 libraries FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY # default is FFT\_PACK\_ARRAY if not specified @@ -115,6 +127,7 @@ the FFT library, you can set these variables: FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib FFT_LIB = -lfftw3 # FFTW3 double precision + FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS) FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier @@ -126,16 +139,19 @@ FFT\_LIB with the appropriate FFT libraries to include in the link. **CMake and make info**\ : The `KISS FFT library `_ is included in the LAMMPS -distribution. It is portable across all platforms. Depending on the -size of the FFTs and the number of processors used, the other -libraries listed here can be faster. +distribution. It is portable across all platforms. Depending on the size +of the FFTs and the number of processors used, the other libraries listed +here can be faster. However, note that long-range Coulombics are only a portion of the per-timestep CPU cost, FFTs are only a portion of long-range Coulombics, and 1d FFTs are only a portion of the FFT cost (parallel communication can be costly). A breakdown of these timings is printed -to the screen at the end of a run using the :doc:`kspace\_style pppm ` command. The :doc:`Run output ` -doc page gives more details. +to the screen at the end of a run when using the +:doc:`kspace_style pppm ` command. The :doc:`Run output ` +doc page gives more details. A more detailed (and time consuming) +report of the FFT performance is generated with the +:doc:`kspace_modify fftbench yes ` command. FFTW is a fast, portable FFT library that should also work on any platform and can be faster than the KISS FFT library. You can @@ -166,7 +182,7 @@ When using -DFFT\_SINGLE with FFTW3 you may need to build the FFTW library a second time with support for single-precision. For FFTW3, do the following, which should produce the additional -library libfftw3f.a +library libfftw3f.a or libfftw3f.so. .. parsed-literal:: From 15603e0ff7c808e2be89d1e5ab60f04d484dac15 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 12 Jan 2020 21:46:02 -0500 Subject: [PATCH 33/48] add documentation for including cufft with KOKKOS+CUDA (for conventional build, CMake is automatic) --- cmake/CMakeLists.txt | 2 +- cmake/Modules/Packages/KOKKOS.cmake | 2 +- doc/src/Build_extras.rst | 8 +++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 26d1cc6af4..b004f29068 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -743,7 +743,7 @@ if(PKG_KSPACE) else() message(STATUS "Using non-threaded FFTs") endif() - if(PKG_KOKKOS AND FFT_CUFFT) + if(PKG_KOKKOS AND KOKKOS_ENABLE_CUDA) message(STATUS "Kokkos FFT: cuFFT") endif() endif() diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 13a751f18d..fd352fd47e 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -44,7 +44,7 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) if(KOKKOS_ENABLE_CUDA) add_definitions(-DFFT_CUFFT) - list(APPEND LAMMPS_LINK_LIBS -lcufft) + list(APPEND LAMMPS_LINK_LIBS cufft) endif() endif() diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 2cba59bde0..20d846fb7e 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -350,10 +350,12 @@ For NVIDIA GPUs using CUDA: KOKKOS_DEVICES = Cuda KOKKOS_ARCH = archCPU,archGPU # archCPU = CPU from list above that is hosting the GPU # archGPU = GPU from list above + FFT_INC = -DFFT_CUFFT # enable use of cuFFT (optional) + FFT_LIB = -lcufft # link to cuFFT library -For GPUs, you also need these 2 lines in your Makefile.machine before -the CC line is defined, in this case for use with OpenMPI mpicxx. The -2 lines define a nvcc wrapper compiler, which will use nvcc for +For GPUs, you also need the following 2 lines in your Makefile.machine +before the CC line is defined, in this case for use with OpenMPI mpicxx. +The 2 lines define a nvcc wrapper compiler, which will use nvcc for compiling CUDA files and use a C++ compiler for non-Kokkos, non-CUDA files. From 88448ef950ccbe74b5a22dfe2099b68931d88d0e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 13 Jan 2020 10:57:36 -0500 Subject: [PATCH 34/48] update rst docs since .txt file is gone --- doc/src/kspace_style.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/src/kspace_style.rst b/doc/src/kspace_style.rst index 32179f8be0..cc1ad58ece 100644 --- a/doc/src/kspace_style.rst +++ b/doc/src/kspace_style.rst @@ -404,9 +404,9 @@ calculation can be performed concurrently on the GPU while other calculations for non-bonded and bonded force calculation are performed on the CPU. -The *pppm/kk* style also performs charge assignment and force -interpolation calculations on the GPU while the FFTs themselves are -calculated on the CPU in non-threaded mode. +The *pppm/kk* style performs charge assignment and force interpolation +calculations, along with the FFTs themselves, on the GPU or threaded +on the CPU when using OpenMP. These accelerated styles are part of the GPU, USER-INTEL, KOKKOS, USER-OMP, and OPT packages respectively. They are only enabled if From a7720124db8463f32740424c282fdc972855cc71 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 13 Jan 2020 11:09:37 -0500 Subject: [PATCH 35/48] check and adjust FFT selection for KOKKOS depending on whether CUDA is enabled or not --- src/KOKKOS/fft3d_kokkos.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index 8d12a3f952..b2569942f9 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -19,12 +19,36 @@ #include "remap_kokkos.h" #include "fftdata_kokkos.h" +// with KOKKOS in CUDA mode we can only have +// CUFFT or KISSFFT, thus undefine all other +// FFTs here, since they may be valid in fft3d.cpp + +#if defined(KOKKOS_ENABLE_CUDA) +# if defined(FFT_FFTW) +# undef FFT_FFTW +# endif +# if defined(FFT_FFTW3) +# undef FFT_FFTW3 +# endif +# if defined(FFT_MKL) +# undef FFT_MKL +# endif +# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT) +# define FFT_KISSFFT +# endif +#else +# if defined(FFT_CUFFT) +# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT" +# endif +#endif + // if user set FFTW, it means FFTW3 #ifdef FFT_FFTW #define FFT_FFTW3 #endif + #ifdef FFT_FFTW_THREADS #if !defined(FFT_FFTW3) #error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" From 5198058691ae6159c8b8f18e88bf416204c09727 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 13 Jan 2020 11:21:44 -0500 Subject: [PATCH 36/48] fix whitespace issues in fft3d_kokkos files --- src/KOKKOS/fft3d_kokkos.cpp | 14 +++++++------- src/KOKKOS/fft3d_kokkos.h | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 795f246076..ca0f6a9b13 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -39,7 +39,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) : + int scaled, int permute, int *nbuf, int usecollective) : Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; @@ -206,7 +206,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); - remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar, + remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar, d_scratch_scalar, plan->pre_plan); d_data = d_copy; @@ -230,7 +230,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ #elif defined(FFT_CUFFT) cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); #else - typename AT::t_FFT_DATA_1d d_tmp = + typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); kiss_fft_functor f; if (flag == -1) @@ -253,7 +253,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2); d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2); - remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, d_scratch_scalar, plan->mid1_plan); d_data = d_copy; @@ -295,7 +295,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); - remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, d_scratch_scalar, plan->mid2_plan); d_data = d_copy; @@ -335,7 +335,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size()); d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); - remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar, + remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar, d_scratch_scalar, plan->post_plan); } @@ -534,7 +534,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl // configure plan memory pointers and allocate work space // out_size = amount of memory given to FFT by user - // first/second/third_size = amount of memory needed after + // first/second/third_size = amount of memory needed after // pre,mid1,mid2 remaps // copy_size = amount needed internally for extra copy of data // scratch_size = amount needed internally for remap scratch space diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index b2569942f9..81bad20da7 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -189,14 +189,14 @@ The FFT setup for the PPPM solver failed, typically due to lack of memory. This is an unusual error. Check the size of the FFT grid you are requesting. -E: Cannot use the FFTW library with Kokkos CUDA on GPUs +E: Cannot use the FFTW library with Kokkos CUDA on GPUs -Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for -PPPM on GPUs. +Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for +PPPM on GPUs. E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs -Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs -for PPPM on the host CPUs, use KISS FFT instead. +Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs +for PPPM on the host CPUs, use KISS FFT instead. */ From 206aa77532a836cf4c4774abc128d34d2832a782 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 13 Jan 2020 11:22:42 -0500 Subject: [PATCH 37/48] fix one more whitespace issue --- src/KOKKOS/pack_kokkos.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/pack_kokkos.h b/src/KOKKOS/pack_kokkos.h index 5283e0d3fc..e0a11ff4e0 100644 --- a/src/KOKKOS/pack_kokkos.h +++ b/src/KOKKOS/pack_kokkos.h @@ -6,7 +6,7 @@ Copyright (2014) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under + certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level SPARTA directory. From 2d28e9fa2e16ebb6a0d5d0d06f59c5104851081c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Mon, 13 Jan 2020 12:08:46 -0500 Subject: [PATCH 38/48] check for FFT_FFTW_THREADS may only run when CUDA is not enabled --- src/KOKKOS/fft3d_kokkos.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index 81bad20da7..2de3b641ea 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -40,19 +40,15 @@ # if defined(FFT_CUFFT) # error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT" # endif -#endif - // if user set FFTW, it means FFTW3 - -#ifdef FFT_FFTW -#define FFT_FFTW3 -#endif - - -#ifdef FFT_FFTW_THREADS -#if !defined(FFT_FFTW3) -#error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" -#endif +# ifdef FFT_FFTW +# define FFT_FFTW3 +# endif +# ifdef FFT_FFTW_THREADS +# if !defined(FFT_FFTW3) +# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" +# endif +# endif #endif #if defined(FFT_MKL) From dfda8b5ce65422dbe7eb75771c65665907c93f5d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 22 Jan 2020 15:18:59 -0500 Subject: [PATCH 39/48] fix up LMP_FFT_LIB for reporting FFT lib in use for KOKKOS with CUDA logic --- src/KOKKOS/pppm_kokkos.h | 16 ++++++++++++++++ src/KSPACE/pppm.h | 10 ++-------- src/KSPACE/pppm_disp.h | 8 -------- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 5dd99af5fb..1f7a5c0e36 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -22,6 +22,22 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) #ifndef LMP_PPPM_KOKKOS_H #define LMP_PPPM_KOKKOS_H +// fix up FFT defines for KOKKOS with CUDA + +#if defined(KOKKOS_ENABLE_CUDA) +# if defined(FFT_FFTW) +# undef FFT_FFTW +# endif +# if defined(FFT_FFTW3) +# undef FFT_FFTW3 +# endif +# if defined(FFT_MKL) +# undef FFT_MKL +# endif +# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT) +# define FFT_KISSFFT +# endif + #include "pppm.h" #include "gridcomm_kokkos.h" #include "remap_kokkos.h" diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h index de9c1cc5e4..f73fb49ad3 100644 --- a/src/KSPACE/pppm.h +++ b/src/KSPACE/pppm.h @@ -26,14 +26,8 @@ KSpaceStyle(pppm,PPPM) #define LMP_FFT_LIB "FFTW3" #elif defined(FFT_MKL) #define LMP_FFT_LIB "MKL FFT" -#else -#define LMP_FFT_LIB "KISS FFT" -#endif - -#if defined(FFT_FFTW3) -#define LMP_FFT_LIB "FFTW3" -#elif defined(FFT_MKL) -#define LMP_FFT_LIB "MKL FFT" +#elif defined(FFT_CUFFT) +#define LMP_FFT_LIB "cuFFT" #else #define LMP_FFT_LIB "KISS FFT" #endif diff --git a/src/KSPACE/pppm_disp.h b/src/KSPACE/pppm_disp.h index 9b34200547..130671fa28 100644 --- a/src/KSPACE/pppm_disp.h +++ b/src/KSPACE/pppm_disp.h @@ -30,14 +30,6 @@ KSpaceStyle(pppm/disp,PPPMDisp) #define LMP_FFT_LIB "KISS FFT" #endif -#if defined(FFT_FFTW3) -#define LMP_FFT_LIB "FFTW3" -#elif defined(FFT_MKL) -#define LMP_FFT_LIB "MKL FFT" -#else -#define LMP_FFT_LIB "KISS FFT" -#endif - #ifdef FFT_SINGLE typedef float FFT_SCALAR; #define LMP_FFT_PREC "single" From 70b9a060dc9d719b51e52b6b794e206d32811b7e Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 22 Jan 2020 15:26:34 -0500 Subject: [PATCH 40/48] add missing #endif --- src/KOKKOS/pppm_kokkos.h | 1 + src/kspace.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 1f7a5c0e36..40784944c4 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -37,6 +37,7 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) # if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT) # define FFT_KISSFFT # endif +#endif #include "pppm.h" #include "gridcomm_kokkos.h" diff --git a/src/kspace.cpp b/src/kspace.cpp index b79e1077bf..2399c10e2b 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -39,7 +39,7 @@ KSpace::KSpace(LAMMPS *lmp) : Pointers(lmp) virial[0] = virial[1] = virial[2] = virial[3] = virial[4] = virial[5] = 0.0; triclinic_support = 1; - ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = + ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = spinflag = 0; compute_flag = 1; group_group_enable = 0; From 70731468df2a6f4345a9e2ff8dfc290a3a89b6b8 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 22 Jan 2020 16:32:32 -0500 Subject: [PATCH 41/48] include "pppm.h" last so that necessary KOKKOS config define is available --- src/KOKKOS/pppm_kokkos.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 40784944c4..c855b1bda8 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -22,6 +22,12 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) #ifndef LMP_PPPM_KOKKOS_H #define LMP_PPPM_KOKKOS_H +#include "gridcomm_kokkos.h" +#include "remap_kokkos.h" +#include "fft3d_kokkos.h" +#include "kokkos_base.h" +#include "kokkos_type.h" + // fix up FFT defines for KOKKOS with CUDA #if defined(KOKKOS_ENABLE_CUDA) @@ -40,11 +46,6 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) #endif #include "pppm.h" -#include "gridcomm_kokkos.h" -#include "remap_kokkos.h" -#include "fft3d_kokkos.h" -#include "kokkos_base.h" -#include "kokkos_type.h" namespace LAMMPS_NS { From 53554fdb8458cd8489b4668b2e30b5185f07093d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 22 Jan 2020 16:56:12 -0500 Subject: [PATCH 42/48] add -DFFT_CUFFT for KOKKOS only when KISSFFT is not selected --- cmake/Modules/Packages/KOKKOS.cmake | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index fd352fd47e..29beaca957 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -10,7 +10,7 @@ if(PKG_KOKKOS) set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos) add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR}) - + set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src ${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src ${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src @@ -43,8 +43,10 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) if(KOKKOS_ENABLE_CUDA) - add_definitions(-DFFT_CUFFT) - list(APPEND LAMMPS_LINK_LIBS cufft) + if(NOT ${FFT} STREQUAL "KISS") + add_definitions(-DFFT_CUFFT) + list(APPEND LAMMPS_LINK_LIBS cufft) + endif() endif() endif() From c126df4f790d677739964b3ff681688f1fab271a Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 22 Jan 2020 17:08:58 -0500 Subject: [PATCH 43/48] update KOKKOS FFT status message in CMake summary --- cmake/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index b1d84db3a1..665dd291ba 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -745,6 +745,12 @@ if(PKG_KSPACE) message(STATUS "Using non-threaded FFTs") endif() if(PKG_KOKKOS AND KOKKOS_ENABLE_CUDA) - message(STATUS "Kokkos FFT: cuFFT") + if (${FFT} STREQUAL "KISS") + message(STATUS "Kokkos FFT: KISS") + else() + message(STATUS "Kokkos FFT: cuFFT") + endif() + else() + message(STATUS "Kokkos FFT: ${FFT}") endif() endif() From f14b200b6ae429e734e3171d4618d579ad16a7c0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 23 Jan 2020 00:27:00 -0500 Subject: [PATCH 44/48] correct KOKKOS FFT output for CMake summary one more time --- cmake/CMakeLists.txt | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 665dd291ba..0d8cc61a7b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -744,13 +744,15 @@ if(PKG_KSPACE) else() message(STATUS "Using non-threaded FFTs") endif() - if(PKG_KOKKOS AND KOKKOS_ENABLE_CUDA) - if (${FFT} STREQUAL "KISS") - message(STATUS "Kokkos FFT: KISS") + if(PKG_KOKKOS) + if(KOKKOS_ENABLE_CUDA) + if (${FFT} STREQUAL "KISS") + message(STATUS "Kokkos FFT: KISS") + else() + message(STATUS "Kokkos FFT: cuFFT") + endif() else() - message(STATUS "Kokkos FFT: cuFFT") + message(STATUS "Kokkos FFT: ${FFT}") endif() - else() - message(STATUS "Kokkos FFT: ${FFT}") endif() endif() From cb5138a3358f01fbb180893240ae5fa1275cc494 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Thu, 23 Jan 2020 12:00:33 -0500 Subject: [PATCH 45/48] update version string for the next patch release --- doc/lammps.1 | 2 +- src/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/lammps.1 b/doc/lammps.1 index 4355e64961..5ffd888f05 100644 --- a/doc/lammps.1 +++ b/doc/lammps.1 @@ -1,4 +1,4 @@ -.TH LAMMPS "9 January 2020" "2020-01-09" +.TH LAMMPS "24 January 2020" "2020-01-24" .SH NAME .B LAMMPS \- Molecular Dynamics Simulator. diff --git a/src/version.h b/src/version.h index 1c5c4c6206..a47eeef1c1 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "09 Jan 2020" +#define LAMMPS_VERSION "24 Jan 2020" From e9d4823d71c55e363b15e74b87b01f4171590c00 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 24 Jan 2020 13:56:00 -0700 Subject: [PATCH 46/48] Add cuFFT to Makefile.lassen_kokkos and generalize MPI path --- src/MAKE/MACHINES/Makefile.lassen_kokkos | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/MAKE/MACHINES/Makefile.lassen_kokkos b/src/MAKE/MACHINES/Makefile.lassen_kokkos index 23697bfea2..7484f25a4b 100644 --- a/src/MAKE/MACHINES/Makefile.lassen_kokkos +++ b/src/MAKE/MACHINES/Makefile.lassen_kokkos @@ -1,4 +1,4 @@ -# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1 +# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc SHELL = /bin/sh @@ -44,9 +44,12 @@ LMP_INC = -DLAMMPS_GZIP # PATH = path for MPI library # LIB = name of MPI library -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include +MY_MPI_EXE = $(shell which mpicxx) +MY_MPI_PATH = $(dir ${MY_MPI_EXE}) + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I${MY_MPI_PATH}../include MPI_PATH = -MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm +MPI_LIB = -L${MY_MPI_PATH}../lib -lmpi_ibm # FFT library # see discussion in Section 2.2 (step 6) of manaul @@ -55,9 +58,9 @@ MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3. # PATH = path for FFT library # LIB = name of FFT library -FFT_INC = +FFT_INC = -DFFT_CUFFT FFT_PATH = -FFT_LIB = +FFT_LIB = -lcufft # JPEG and/or PNG library # see discussion in Section 2.2 (step 7) of manual From 76aaf16c666534d60020f179c059ea3225b7d715 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 24 Jan 2020 14:30:11 -0700 Subject: [PATCH 47/48] Add check for FFT_FFTW_THREADS and turn off MKL threads for now --- src/KOKKOS/fft3d_kokkos.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index 795f246076..b08419d7ff 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -607,7 +607,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE); DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast); DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast); - DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads); + //DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads); DftiCommitDescriptor(plan->handle_fast); DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1, @@ -617,7 +617,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE); DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid); DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid); - DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads); + //DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads); DftiCommitDescriptor(plan->handle_mid); DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1, @@ -627,7 +627,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE); DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow); DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow); - DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads); + //DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads); DftiCommitDescriptor(plan->handle_slow); if (scaled == 0) @@ -640,10 +640,13 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl } #elif defined(FFT_FFTW3) + +#if defined (FFT_FFTW_THREADS) if (nthreads > 1) { FFTW_API(init_threads)(); FFTW_API(plan_with_nthreads)(nthreads); } +#endif plan->plan_fast_forward = FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, From 9d333c351a01c43f898296bef87203e58a68ef18 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Fri, 24 Jan 2020 14:42:40 -0700 Subject: [PATCH 48/48] Small clarification to kspace_style.rst --- doc/src/kspace_style.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/src/kspace_style.rst b/doc/src/kspace_style.rst index cc1ad58ece..4dfcf2fe73 100644 --- a/doc/src/kspace_style.rst +++ b/doc/src/kspace_style.rst @@ -405,8 +405,8 @@ calculations for non-bonded and bonded force calculation are performed on the CPU. The *pppm/kk* style performs charge assignment and force interpolation -calculations, along with the FFTs themselves, on the GPU or threaded -on the CPU when using OpenMP. +calculations, along with the FFTs themselves, on the GPU or (optionally) threaded +on the CPU when using OpenMP and FFTW3. These accelerated styles are part of the GPU, USER-INTEL, KOKKOS, USER-OMP, and OPT packages respectively. They are only enabled if