diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 92293f294b..0d8cc61a7b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -739,4 +739,20 @@ if(PKG_KSPACE) else() message(STATUS "Using double precision FFTs") endif() + if(FFT_FFTW_THREADS) + message(STATUS "Using threaded FFTs") + else() + message(STATUS "Using non-threaded FFTs") + endif() + if(PKG_KOKKOS) + if(KOKKOS_ENABLE_CUDA) + if (${FFT} STREQUAL "KISS") + message(STATUS "Kokkos FFT: KISS") + else() + message(STATUS "Kokkos FFT: cuFFT") + endif() + else() + message(STATUS "Kokkos FFT: ${FFT}") + endif() + endif() endif() diff --git a/cmake/Modules/FindFFTW3.cmake b/cmake/Modules/FindFFTW3.cmake index 552bcc4257..63752f85df 100644 --- a/cmake/Modules/FindFFTW3.cmake +++ b/cmake/Modules/FindFFTW3.cmake @@ -1,20 +1,22 @@ # - Find fftw3 -# Find the native FFTW3 headers and libraries. +# Find the native double precision FFTW3 headers and libraries. # -# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. -# FFTW3_LIBRARIES - List of libraries when using fftw3. -# FFTW3_FOUND - True if fftw3 found. +# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc. +# FFTW3_LIBRARIES - List of libraries when using fftw3. +# FFTW3_OMP_LIBRARIES - List of libraries when using fftw3. +# FFTW3_FOUND - True if fftw3 found. # find_package(PkgConfig) pkg_check_modules(PC_FFTW3 fftw3) find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS}) - find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS}) +find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS}) -set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR}) +set(FFTW3_LIBRARIES ${FFTW3_LIBRARY}) +set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE @@ -22,4 +24,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR) -mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY ) +mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY) diff --git a/cmake/Modules/FindFFTW3F.cmake b/cmake/Modules/FindFFTW3F.cmake index 92d1e85e79..c67aa5faf1 100644 --- a/cmake/Modules/FindFFTW3F.cmake +++ b/cmake/Modules/FindFFTW3F.cmake @@ -1,8 +1,8 @@ -# - Find fftw3f -# Find the native FFTW3F headers and libraries. +# Find the native single precision FFTW3 headers and libraries. # # FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc. # FFTW3F_LIBRARIES - List of libraries when using fftw3f. +# FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3. # FFTW3F_FOUND - True if fftw3f found. # @@ -10,11 +10,12 @@ find_package(PkgConfig) pkg_check_modules(PC_FFTW3F fftw3f) find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS}) - find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS}) +find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS}) -set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR}) +set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY}) +set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY}) include(FindPackageHandleStandardArgs) # handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE @@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs) find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR) -mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY ) +mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY) diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 428588ec9d..29beaca957 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -10,7 +10,7 @@ if(PKG_KOKKOS) set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos) add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR}) - + set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src ${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src ${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src @@ -39,7 +39,15 @@ if(PKG_KOKKOS) ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp) if(PKG_KSPACE) - list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp) + list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp + ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp) + if(KOKKOS_ENABLE_CUDA) + if(NOT ${FFT} STREQUAL "KISS") + add_definitions(-DFFT_CUFFT) + list(APPEND LAMMPS_LINK_LIBS cufft) + endif() + endif() endif() set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}") diff --git a/cmake/Modules/Packages/KSPACE.cmake b/cmake/Modules/Packages/KSPACE.cmake index 63c42baf2d..5786d7cb8a 100644 --- a/cmake/Modules/Packages/KSPACE.cmake +++ b/cmake/Modules/Packages/KSPACE.cmake @@ -21,6 +21,20 @@ if(PKG_KSPACE) add_definitions(-DFFT_FFTW3) include_directories(${${FFTW}_INCLUDE_DIRS}) list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES}) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + option(FFT_FFTW_THREADS "Use threaded FFT library" ON) + else() + option(FFT_FFTW_THREADS "Use threaded FFT library" OFF) + endif() + + if(FFT_FFTW_THREADS) + if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY) + add_definitions(-DFFT_FFTW_THREADS) + list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES}) + else() + message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS") + endif() + endif() elseif(FFT STREQUAL "MKL") find_package(MKL REQUIRED) add_definitions(-DFFT_MKL) diff --git a/doc/lammps.1 b/doc/lammps.1 index 4355e64961..5ffd888f05 100644 --- a/doc/lammps.1 +++ b/doc/lammps.1 @@ -1,4 +1,4 @@ -.TH LAMMPS "9 January 2020" "2020-01-09" +.TH LAMMPS "24 January 2020" "2020-01-24" .SH NAME .B LAMMPS \- Molecular Dynamics Simulator. diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 59b8c45b66..951e094c12 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -350,10 +350,12 @@ For NVIDIA GPUs using CUDA: KOKKOS_DEVICES = Cuda KOKKOS_ARCH = archCPU,archGPU # archCPU = CPU from list above that is hosting the GPU # archGPU = GPU from list above + FFT_INC = -DFFT_CUFFT # enable use of cuFFT (optional) + FFT_LIB = -lcufft # link to cuFFT library -For GPUs, you also need these 2 lines in your Makefile.machine before -the CC line is defined, in this case for use with OpenMPI mpicxx. The -2 lines define a nvcc wrapper compiler, which will use nvcc for +For GPUs, you also need the following 2 lines in your Makefile.machine +before the CC line is defined, in this case for use with OpenMPI mpicxx. +The 2 lines define a nvcc wrapper compiler, which will use nvcc for compiling CUDA files and use a C++ compiler for non-Kokkos, non-CUDA files. diff --git a/doc/src/Build_settings.rst b/doc/src/Build_settings.rst index d626a3e8fd..cdb8016f57 100644 --- a/doc/src/Build_settings.rst +++ b/doc/src/Build_settings.rst @@ -49,13 +49,19 @@ through the CMAKE\_CXX\_FLAGS variable. Example for CentOS 7: -D CMAKE_CXX_FLAGS="-O3 -g -fopenmp -DNDEBUG -std=c++11" -**Makefile.machine setting**\ : +**Makefile.machine setting**\ to bypass the C++11 test and compile in C++98 mode: .. parsed-literal:: LMP_INC = -DLAMMPS_CXX98 +**Makefile.machine setting**\ to enable the C++11 with older (but not too old) GNU c++ (e.g. on CentOS 7): + + +.. parsed-literal:: + + CCFLAGS = -g -O3 -std=c++11 ---------- @@ -86,14 +92,19 @@ LAMMPS can use them if they are available on your system. an exception to the rule that all CMake variables can be specified with lower-case values. -Usually these settings are all that is needed. If CMake cannot find -the FFT library, you can set these variables: +Usually these settings are all that is needed. If FFTW3 is selected, +then CMake will try to detect, if threaded FFTW libraries are available +and enable them by default. This setting is independent of whether +OpenMP threads are enabled and a packages like KOKKOS or USER-OMP is +used. If CMake cannot detect the FFT library, you can set these variables +to assist: .. parsed-literal:: -D FFTW3_INCLUDE_DIRS=path # path to FFTW3 include files -D FFTW3_LIBRARIES=path # path to FFTW3 libraries + -D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries -D MKL_INCLUDE_DIRS=path # ditto for Intel MKL library -D MKL_LIBRARIES=path @@ -105,6 +116,7 @@ the FFT library, you can set these variables: FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS # default is KISS if not specified FFT_INC = -DFFT_SINGLE # do not specify for double precision + FFT_INC = -DFFT_FFTW_THREADS # enable using threaded FFTW3 libraries FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY # default is FFT\_PACK\_ARRAY if not specified @@ -115,6 +127,7 @@ the FFT library, you can set these variables: FFT_INC = -I/usr/local/include FFT_PATH = -L/usr/local/lib FFT_LIB = -lfftw3 # FFTW3 double precision + FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS) FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier @@ -126,16 +139,19 @@ FFT\_LIB with the appropriate FFT libraries to include in the link. **CMake and make info**\ : The `KISS FFT library `_ is included in the LAMMPS -distribution. It is portable across all platforms. Depending on the -size of the FFTs and the number of processors used, the other -libraries listed here can be faster. +distribution. It is portable across all platforms. Depending on the size +of the FFTs and the number of processors used, the other libraries listed +here can be faster. However, note that long-range Coulombics are only a portion of the per-timestep CPU cost, FFTs are only a portion of long-range Coulombics, and 1d FFTs are only a portion of the FFT cost (parallel communication can be costly). A breakdown of these timings is printed -to the screen at the end of a run using the :doc:`kspace_style pppm ` command. The :doc:`Run output ` -doc page gives more details. +to the screen at the end of a run when using the +:doc:`kspace_style pppm ` command. The :doc:`Run output ` +doc page gives more details. A more detailed (and time consuming) +report of the FFT performance is generated with the +:doc:`kspace_modify fftbench yes ` command. FFTW is a fast, portable FFT library that should also work on any platform and can be faster than the KISS FFT library. You can @@ -166,7 +182,7 @@ When using -DFFT\_SINGLE with FFTW3 you may need to build the FFTW library a second time with support for single-precision. For FFTW3, do the following, which should produce the additional -library libfftw3f.a +library libfftw3f.a or libfftw3f.so. .. parsed-literal:: diff --git a/doc/src/kspace_style.rst b/doc/src/kspace_style.rst index 013c39ef61..5142734f19 100644 --- a/doc/src/kspace_style.rst +++ b/doc/src/kspace_style.rst @@ -404,9 +404,9 @@ calculation can be performed concurrently on the GPU while other calculations for non-bonded and bonded force calculation are performed on the CPU. -The *pppm/kk* style also performs charge assignment and force -interpolation calculations on the GPU while the FFTs themselves are -calculated on the CPU in non-threaded mode. +The *pppm/kk* style performs charge assignment and force interpolation +calculations, along with the FFTs themselves, on the GPU or (optionally) threaded +on the CPU when using OpenMP and FFTW3. These accelerated styles are part of the GPU, USER-INTEL, KOKKOS, USER-OMP, and OPT packages respectively. They are only enabled if diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 8f3744bbf4..3ae04384fd 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -97,6 +97,9 @@ action dihedral_opls_kokkos.cpp dihedral_opls.cpp action dihedral_opls_kokkos.h dihedral_opls.h action domain_kokkos.cpp action domain_kokkos.h +action fftdata_kokkos.h fft3d.h +action fft3d_kokkos.cpp fft3d.cpp +action fft3d_kokkos.h fft3d.h action fix_deform_kokkos.cpp action fix_deform_kokkos.h action fix_enforce2d_kokkos.cpp @@ -153,6 +156,7 @@ action improper_class2_kokkos.cpp improper_class2.cpp action improper_class2_kokkos.h improper_class2.h action improper_harmonic_kokkos.cpp improper_harmonic.cpp action improper_harmonic_kokkos.h improper_harmonic.h +action kissfft_kokkos.h kissfft.h action kokkos.cpp action kokkos.h action kokkos_base.h @@ -189,6 +193,7 @@ action min_kokkos.cpp action min_kokkos.h action min_linesearch_kokkos.cpp action min_linesearch_kokkos.h +action pack_kokkos.h pack.h action pair_buck_coul_cut_kokkos.cpp action pair_buck_coul_cut_kokkos.h action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp @@ -285,6 +290,8 @@ action rand_pool_wrap_kokkos.cpp action rand_pool_wrap_kokkos.h action region_block_kokkos.cpp action region_block_kokkos.h +action remap_kokkos.cpp remap.cpp +action remap_kokkos.h remap.h action sna_kokkos.h sna.h action sna_kokkos_impl.h sna.cpp action verlet_kokkos.cpp diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp new file mode 100644 index 0000000000..9a67ca81f0 --- /dev/null +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -0,0 +1,905 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Stan Moore (SNL), Sam Mish (U.C. Davis) +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "fft3d_kokkos.h" +#include "remap_kokkos.h" +#include "kokkos_type.h" +#include "error.h" +#include "kokkos.h" + + +using namespace LAMMPS_NS; + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/* ---------------------------------------------------------------------- */ + +template +FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf, int usecollective) : + Pointers(lmp) +{ + int nthreads = lmp->kokkos->nthreads; + int ngpus = lmp->kokkos->ngpus; + ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + +#if defined(FFT_MKL) + if (ngpus > 0 && execution_space == Device) + lmp->error->all(FLERR,"Cannot use the MKL library with Kokkos CUDA on GPUs"); +#elif defined(FFT_FFTW3) + if (ngpus > 0 && execution_space == Device) + lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs"); +#elif defined(FFT_CUFFT) + if (ngpus > 0 && execution_space == Host) + lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs"); +#elif defined(FFT_KISSFFT) + // The compiler can't statically determine the stack size needed for + // recursive function calls in KISS FFT and the default per-thread + // stack size on GPUs needs to be increased to prevent stack overflows + // for reasonably sized FFTs + #if defined (KOKKOS_ENABLE_CUDA) + size_t stack_size; + cudaDeviceGetLimit(&stack_size,cudaLimitStackSize); + if (stack_size < 2048) + cudaDeviceSetLimit(cudaLimitStackSize,2048); + #endif +#endif + + plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + scaled,permute,nbuf,usecollective,nthreads); + if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); +} + +/* ---------------------------------------------------------------------- */ + +template +FFT3dKokkos::~FFT3dKokkos() +{ + fft_3d_destroy_plan_kokkos(plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void FFT3dKokkos::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag) +{ + typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2); + typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2); + + fft_3d_kokkos(d_in_data,d_out_data,flag,plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void FFT3dKokkos::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag) +{ + typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()); + + fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan); +} + +/* ---------------------------------------------------------------------- + Data layout for 3d FFTs: + + data set of Nfast x Nmid x Nslow elements is owned by P procs + on input, each proc owns a subsection of the elements + on output, each proc will own a (possibly different) subsection + my subsection must not overlap with any other proc's subsection, + i.e. the union of all proc's input (or output) subsections must + exactly tile the global Nfast x Nmid x Nslow data set + when called from C, all subsection indices are + C-style from 0 to N-1 where N = Nfast or Nmid or Nslow + when called from F77, all subsection indices are + F77-style from 1 to N where N = Nfast or Nmid or Nslow + a proc can own 0 elements on input or output + by specifying hi index < lo index + on both input and output, data is stored contiguously on a processor + with a fast-varying, mid-varying, and slow-varying index +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Perform 3d FFT + + Arguments: + in starting address of input data on this proc + out starting address of where output data for this proc + will be placed (can be same as in) + flag 1 for forward FFT, -1 for inverse FFT + plan plan returned by previous call to fft_3d_create_plan +------------------------------------------------------------------------- */ + +template +struct norm_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_DATA_1d_um d_out; + int norm; + + norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_): + d_out(d_out_),norm(norm_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { +#if defined(FFT_FFTW3) || defined(FFT_CUFFT) + FFT_SCALAR* out_ptr = (FFT_SCALAR *)(d_out.data()+i); + *(out_ptr++) *= norm; + *(out_ptr++) *= norm; +#elif defined(FFT_MKL) + d_out(i) *= norm; +#else + d_out(i,0) *= norm; + d_out(i,1) *= norm; +#endif + } +}; + +#ifdef FFT_KISSFFT +template +struct kiss_fft_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_DATA_1d_um d_data,d_tmp; + kiss_fft_state_kokkos st; + int length; + + kiss_fft_functor() {} + + kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos &st_, int length_): + d_data(d_data_), + d_tmp(d_tmp_), + st(st_) + { + length = length_; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &i) const { + const int offset = i*length; + KissFFTKokkos::kiss_fft_kokkos(st,d_data,d_tmp,offset); + } +}; +#endif + +template +void FFT3dKokkos::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos *plan) +{ + int total,length; + typename AT::t_FFT_DATA_1d d_data,d_copy; + typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar; + + // pre-remap to prepare for 1st FFTs if needed + // copy = loc for remap result + + if (plan->pre_plan) { + if (plan->pre_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size()); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar, + d_scratch_scalar, plan->pre_plan); + + d_data = d_copy; + } else d_data = d_in; + + // 1d FFTs along fast axis + + total = plan->total1; + length = plan->length1; + + #if defined(FFT_MKL) + if (flag == -1) + DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data()); + else + DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + #elif defined(FFT_CUFFT) + cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + #else + typename AT::t_FFT_DATA_1d d_tmp = + typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); + kiss_fft_functor f; + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length); + Kokkos::parallel_for(total/length,f); + d_data = d_tmp; + d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); + #endif + + + // 1st mid-remap to prepare for 2nd FFTs + // copy = loc for remap result + + if (plan->mid1_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2); + + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + d_scratch_scalar, plan->mid1_plan); + + d_data = d_copy; + + // 1d FFTs along mid axis + + total = plan->total2; + length = plan->length2; + + #if defined(FFT_MKL) + if (flag == -1) + DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data()); + else + DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + #elif defined(FFT_CUFFT) + cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + #else + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length); + Kokkos::parallel_for(total/length,f); + d_data = d_tmp; + d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0()); + #endif + + // 2nd mid-remap to prepare for 3rd FFTs + // copy = loc for remap result + + if (plan->mid2_target == 0) d_copy = d_out; + else d_copy = plan->d_copy; + + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar, + d_scratch_scalar, plan->mid2_plan); + + d_data = d_copy; + + // 1d FFTs along slow axis + + total = plan->total3; + length = plan->length3; + + #if defined(FFT_MKL) + if (flag == -1) + DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data()); + else + DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data()); + #elif defined(FFT_FFTW3) + if (flag == -1) + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + else + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data()); + #elif defined(FFT_CUFFT) + cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag); + #else + if (flag == -1) + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length); + else + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length); + Kokkos::parallel_for(total/length,f); + d_data = d_tmp; + #endif + + + // post-remap to put data in output format if needed + // destination is always out + + if (plan->post_plan) { + d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()); + d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size()); + d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()); + + remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar, + d_scratch_scalar, plan->post_plan); + } + + // scaling if required + + if (flag == 1 && plan->scaled) { + FFT_SCALAR norm = plan->norm; + int num = plan->normnum; + + norm_functor f(d_out,norm); + Kokkos::parallel_for(num,f); + } +} + +/* ---------------------------------------------------------------------- + Create plan for performing a 3d FFT + + Arguments: + comm MPI communicator for the P procs which own the d_data + nfast,nmid,nslow size of global 3d matrix + in_ilo,in_ihi input bounds of data I own in fast index + in_jlo,in_jhi input bounds of data I own in mid index + in_klo,in_khi input bounds of data I own in slow index + out_ilo,out_ihi output bounds of data I own in fast index + out_jlo,out_jhi output bounds of data I own in mid index + out_klo,out_khi output bounds of data I own in slow index + scaled 0 = no scaling of result, 1 = scaling + permute permutation in storage order of indices on output + 0 = no permutation + 1 = permute once = mid->fast, slow->mid, fast->slow + 2 = permute twice = slow->fast, fast->mid, mid->slow + nbuf returns size of internal storage buffers used by FFT + usecollective use collective MPI operations for remapping data +------------------------------------------------------------------------- */ + +template +struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_plan_kokkos ( + MPI_Comm comm, int nfast, int nmid, int nslow, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int scaled, int permute, int *nbuf, int usecollective, + int nthreads) +{ + struct fft_plan_3d_kokkos *plan; + int me,nprocs; + int flag,remapflag; + int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; + int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; + int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi; + int out_size,first_size,second_size,third_size,copy_size,scratch_size; + int np1,np2,ip1,ip2; + + // query MPI info + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&nprocs); + + // compute division of procs in 2 dimensions not on-processor + + bifactor(nprocs,&np1,&np2); + ip1 = me % np1; + ip2 = me/np1; + + // allocate memory for plan data struct + + plan = new struct fft_plan_3d_kokkos; + remapKK = new RemapKokkos(lmp); + if (plan == NULL) return NULL; + + // remap from initial distribution to layout needed for 1st set of 1d FFTs + // not needed if all procs own entire fast axis initially + // first indices = distribution after 1st set of FFTs + + + if (in_ilo == 0 && in_ihi == nfast-1) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + first_ilo = in_ilo; + first_ihi = in_ihi; + first_jlo = in_jlo; + first_jhi = in_jhi; + first_klo = in_klo; + first_khi = in_khi; + plan->pre_plan = NULL; + } + else { + first_ilo = 0; + first_ihi = nfast - 1; + first_jlo = ip1*nmid/np1; + first_jhi = (ip1+1)*nmid/np1 - 1; + first_klo = ip2*nslow/np2; + first_khi = (ip2+1)*nslow/np2 - 1; + plan->pre_plan = + remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi,2,0,0,FFT_PRECISION,0); + if (plan->pre_plan == NULL) return NULL; + } + + // 1d FFTs along fast axis + + plan->length1 = nfast; + plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1); + + // remap from 1st to 2nd FFT + // choose which axis is split over np1 vs np2 to minimize communication + // second indices = distribution after 2nd set of FFTs + + second_ilo = ip1*nfast/np1; + second_ihi = (ip1+1)*nfast/np1 - 1; + second_jlo = 0; + second_jhi = nmid - 1; + second_klo = ip2*nslow/np2; + second_khi = (ip2+1)*nslow/np2 - 1; + plan->mid1_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + first_ilo,first_ihi,first_jlo,first_jhi, + first_klo,first_khi, + second_ilo,second_ihi,second_jlo,second_jhi, + second_klo,second_khi,2,1,0,FFT_PRECISION, + usecollective); + if (plan->mid1_plan == NULL) return NULL; + + // 1d FFTs along mid axis + + plan->length2 = nmid; + plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1); + + // remap from 2nd to 3rd FFT + // if final distribution is permute=2 with all procs owning entire slow axis + // then this remapping goes directly to final distribution + // third indices = distribution after 3rd set of FFTs + + if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) { + third_ilo = out_ilo; + third_ihi = out_ihi; + third_jlo = out_jlo; + third_jhi = out_jhi; + third_klo = out_klo; + third_khi = out_khi; + } + else { + third_ilo = ip1*nfast/np1; + third_ihi = (ip1+1)*nfast/np1 - 1; + third_jlo = ip2*nmid/np2; + third_jhi = (ip2+1)*nmid/np2 - 1; + third_klo = 0; + third_khi = nslow - 1; + } + + plan->mid2_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + second_jlo,second_jhi,second_klo,second_khi, + second_ilo,second_ihi, + third_jlo,third_jhi,third_klo,third_khi, + third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); + if (plan->mid2_plan == NULL) return NULL; + + // 1d FFTs along slow axis + + plan->length3 = nslow; + plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow; + + // remap from 3rd FFT to final distribution + // not needed if permute = 2 and third indices = out indices on all procs + + if (permute == 2 && + out_ilo == third_ilo && out_ihi == third_ihi && + out_jlo == third_jlo && out_jhi == third_jhi && + out_klo == third_klo && out_khi == third_khi) flag = 0; + else flag = 1; + + MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm); + + if (remapflag == 0) + plan->post_plan = NULL; + else { + plan->post_plan = + remapKK->remap_3d_create_plan_kokkos(comm, + third_klo,third_khi,third_ilo,third_ihi, + third_jlo,third_jhi, + out_klo,out_khi,out_ilo,out_ihi, + out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); + if (plan->post_plan == NULL) return NULL; + } + + // configure plan memory pointers and allocate work space + // out_size = amount of memory given to FFT by user + // first/second/third_size = amount of memory needed after + // pre,mid1,mid2 remaps + // copy_size = amount needed internally for extra copy of data + // scratch_size = amount needed internally for remap scratch space + // for each remap: + // out space used for result if big enough, else require copy buffer + // accumulate largest required remap scratch space + + out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1); + first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) * + (first_khi-first_klo+1); + second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) * + (second_khi-second_klo+1); + third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * + (third_khi-third_klo+1); + + copy_size = 0; + scratch_size = 0; + + if (plan->pre_plan) { + if (first_size <= out_size) + plan->pre_target = 0; + else { + plan->pre_target = 1; + copy_size = MAX(copy_size,first_size); + } + scratch_size = MAX(scratch_size,first_size); + } + + if (plan->mid1_plan) { + if (second_size <= out_size) + plan->mid1_target = 0; + else { + plan->mid1_target = 1; + copy_size = MAX(copy_size,second_size); + } + scratch_size = MAX(scratch_size,second_size); + } + + if (plan->mid2_plan) { + if (third_size <= out_size) + plan->mid2_target = 0; + else { + plan->mid2_target = 1; + copy_size = MAX(copy_size,third_size); + } + scratch_size = MAX(scratch_size,third_size); + } + + if (plan->post_plan) + scratch_size = MAX(scratch_size,out_size); + + *nbuf = copy_size + scratch_size; + + if (copy_size) { + plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size); + } + + if (scratch_size) { + plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size); + } + + // system specific pre-computation of 1d FFT coeffs + // and scaling normalization + +#if defined(FFT_MKL) + DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nfast); + DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total1/nfast); + DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast); + DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast); + //DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_fast); + + DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nmid); + DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total2/nmid); + DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid); + DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid); + //DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_mid); + + DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1, + (MKL_LONG)nslow); + DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS, + (MKL_LONG)plan->total3/nslow); + DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE); + DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow); + DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow); + //DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads); + DftiCommitDescriptor(plan->handle_slow); + + if (scaled == 0) + plan->scaled = 0; + else { + plan->scaled = 1; + plan->norm = 1.0/(nfast*nmid*nslow); + plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); + } + +#elif defined(FFT_FFTW3) + +#if defined (FFT_FFTW_THREADS) + if (nthreads > 1) { + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); + } +#endif + + plan->plan_fast_forward = + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, + NULL,&nfast,1,plan->length1, + NULL,&nfast,1,plan->length1, + FFTW_FORWARD,FFTW_ESTIMATE); + + plan->plan_fast_backward = + FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, + NULL,&nfast,1,plan->length1, + NULL,&nfast,1,plan->length1, + FFTW_BACKWARD,FFTW_ESTIMATE); + + plan->plan_mid_forward = + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, + NULL,&nmid,1,plan->length2, + NULL,&nmid,1,plan->length2, + FFTW_FORWARD,FFTW_ESTIMATE); + + plan->plan_mid_backward = + FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2, + NULL,&nmid,1,plan->length2, + NULL,&nmid,1,plan->length2, + FFTW_BACKWARD,FFTW_ESTIMATE); + + + plan->plan_slow_forward = + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, + NULL,&nslow,1,plan->length3, + NULL,&nslow,1,plan->length3, + FFTW_FORWARD,FFTW_ESTIMATE); + + plan->plan_slow_backward = + FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3, + NULL,&nslow,1,plan->length3, + NULL,&nslow,1,plan->length3, + FFTW_BACKWARD,FFTW_ESTIMATE); +#elif defined(FFT_CUFFT) + cufftPlanMany(&(plan->plan_fast), 1, &nfast, + &nfast,1,plan->length1, + &nfast,1,plan->length1, + CUFFT_TYPE,plan->total1/plan->length1); + + cufftPlanMany(&(plan->plan_mid), 1, &nmid, + &nmid,1,plan->length2, + &nmid,1,plan->length2, + CUFFT_TYPE,plan->total2/plan->length2); + + cufftPlanMany(&(plan->plan_slow), 1, &nslow, + &nslow,1,plan->length3, + &nslow,1,plan->length3, + CUFFT_TYPE,plan->total3/plan->length3); +#else + kissfftKK = new KissFFTKokkos(); + + plan->cfg_fast_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL); + plan->cfg_fast_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL); + + if (nmid == nfast) { + plan->cfg_mid_forward = plan->cfg_fast_forward; + plan->cfg_mid_backward = plan->cfg_fast_backward; + } + else { + plan->cfg_mid_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nmid,0,NULL,NULL); + plan->cfg_mid_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nmid,1,NULL,NULL); + } + + if (nslow == nfast) { + plan->cfg_slow_forward = plan->cfg_fast_forward; + plan->cfg_slow_backward = plan->cfg_fast_backward; + } + else if (nslow == nmid) { + plan->cfg_slow_forward = plan->cfg_mid_forward; + plan->cfg_slow_backward = plan->cfg_mid_backward; + } + else { + plan->cfg_slow_forward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL); + plan->cfg_slow_backward = KissFFTKokkos::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL); + } +#endif + + if (scaled == 0) + plan->scaled = 0; + else { + plan->scaled = 1; + plan->norm = 1.0/(nfast*nmid*nslow); + plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1); + } + + return plan; +} + +/* ---------------------------------------------------------------------- + Destroy a 3d fft plan +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *plan) +{ + if (plan->pre_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->pre_plan); + if (plan->mid1_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid1_plan); + if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan); + if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan); + +#if defined(FFT_MKL) + DftiFreeDescriptor(&(plan->handle_fast)); + DftiFreeDescriptor(&(plan->handle_mid)); + DftiFreeDescriptor(&(plan->handle_slow)); +#elif defined(FFT_FFTW3) + FFTW_API(destroy_plan)(plan->plan_slow_forward); + FFTW_API(destroy_plan)(plan->plan_slow_backward); + FFTW_API(destroy_plan)(plan->plan_mid_forward); + FFTW_API(destroy_plan)(plan->plan_mid_backward); + FFTW_API(destroy_plan)(plan->plan_fast_forward); + FFTW_API(destroy_plan)(plan->plan_fast_backward); + +#if defined (FFT_FFTW_THREADS) + FFTW_API(cleanup_threads)(); +#endif + +#elif defined (FFT_KISSFFT) + delete kissfftKK; +#endif + + delete plan; + delete remapKK; +} + +/* ---------------------------------------------------------------------- + divide n into 2 factors of as equal size as possible +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::bifactor(int n, int *factor1, int *factor2) +{ + int n1,n2,facmax; + + facmax = static_cast (sqrt((double) n)); + + for (n1 = facmax; n1 > 0; n1--) { + n2 = n/n1; + if (n1*n2 == n) { + *factor1 = n1; + *factor2 = n2; + return; + } + } +} + +/* ---------------------------------------------------------------------- + perform just the 1d FFTs needed by a 3d FFT, no data movement + used for timing purposes + + Arguments: + in starting address of input data on this proc, all set to 0.0 + nsize size of in + flag 1 for forward FFT, -1 for inverse FFT + plan plan returned by previous call to fft_3d_create_plan +------------------------------------------------------------------------- */ + +template +void FFT3dKokkos::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag, + struct fft_plan_3d_kokkos *plan) +{ + // total = size of data needed in each dim + // length = length of 1d FFT in each dim + // total/length = # of 1d FFTs in each dim + // if total > nsize, limit # of 1d FFTs to available size of data + + int total1 = plan->total1; + int length1 = plan->length1; + int total2 = plan->total2; + int length2 = plan->length2; + int total3 = plan->total3; + int length3 = plan->length3; + + // fftw3 and Dfti in MKL encode the number of transforms + // into the plan, so we cannot operate on a smaller data set +#if defined(FFT_MKL) || defined(FFT_FFTW3) + if ((total1 > nsize) || (total2 > nsize) || (total3 > nsize)) + return; +#endif + if (total1 > nsize) total1 = (nsize/length1) * length1; + if (total2 > nsize) total2 = (nsize/length2) * length2; + if (total3 > nsize) total3 = (nsize/length3) * length3; + + // perform 1d FFTs in each of 3 dimensions + // data is just an array of 0.0 + +#if defined(FFT_MKL) + if (flag == -1) { + DftiComputeForward(plan->handle_fast,data); + DftiComputeForward(plan->handle_mid,data); + DftiComputeForward(plan->handle_slow,data); + } else { + DftiComputeBackward(plan->handle_fast,data); + DftiComputeBackward(plan->handle_mid,data); + DftiComputeBackward(plan->handle_slow,data); + } +#elif defined(FFT_FFTW3) + if (flag == -1) { + FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } else { + FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data()); + } +#elif defined(FFT_CUFFT) + cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); + cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag); +#else + kiss_fft_functor f; + typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0()); + if (flag == -1) { + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_forward,length1); + Kokkos::parallel_for(total1/length1,f); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_forward,length2); + Kokkos::parallel_for(total2/length2,f); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_forward,length3); + Kokkos::parallel_for(total3/length3,f); + } else { + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_fast_backward,length1); + Kokkos::parallel_for(total1/length1,f); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_mid_backward,length2); + Kokkos::parallel_for(total2/length2,f); + + f = kiss_fft_functor(d_data,d_tmp,plan->cfg_slow_backward,length3); + Kokkos::parallel_for(total3/length3,f); + } +#endif + + // scaling if required + // limit num to size of data + + if (flag == 1 && plan->scaled) { + FFT_SCALAR norm = plan->norm; + int num = MIN(plan->normnum,nsize); + + norm_functor f(d_data,norm); + Kokkos::parallel_for(num,f); + } +} + +namespace LAMMPS_NS { +template class FFT3dKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class FFT3dKokkos; +#endif +} diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h new file mode 100644 index 0000000000..2de3b641ea --- /dev/null +++ b/src/KOKKOS/fft3d_kokkos.h @@ -0,0 +1,198 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_FFT3D_KOKKOS_H +#define LMP_FFT3D_KOKKOS_H + +#include "pointers.h" +#include "kokkos_type.h" +#include "remap_kokkos.h" +#include "fftdata_kokkos.h" + +// with KOKKOS in CUDA mode we can only have +// CUFFT or KISSFFT, thus undefine all other +// FFTs here, since they may be valid in fft3d.cpp + +#if defined(KOKKOS_ENABLE_CUDA) +# if defined(FFT_FFTW) +# undef FFT_FFTW +# endif +# if defined(FFT_FFTW3) +# undef FFT_FFTW3 +# endif +# if defined(FFT_MKL) +# undef FFT_MKL +# endif +# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT) +# define FFT_KISSFFT +# endif +#else +# if defined(FFT_CUFFT) +# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT" +# endif +// if user set FFTW, it means FFTW3 +# ifdef FFT_FFTW +# define FFT_FFTW3 +# endif +# ifdef FFT_FFTW_THREADS +# if !defined(FFT_FFTW3) +# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS" +# endif +# endif +#endif + +#if defined(FFT_MKL) + #include "mkl_dfti.h" + #if defined(FFT_SINGLE) + typedef float _Complex FFT_DATA; + #define FFT_MKL_PREC DFTI_SINGLE + #else + typedef double _Complex FFT_DATA; + #define FFT_MKL_PREC DFTI_DOUBLE + #endif +#elif defined(FFT_FFTW3) + #include "fftw3.h" + #if defined(FFT_SINGLE) + typedef fftwf_complex FFT_DATA; + #define FFTW_API(function) fftwf_ ## function + #else + typedef fftw_complex FFT_DATA; + #define FFTW_API(function) fftw_ ## function + #endif +#elif defined(FFT_CUFFT) + #include "cufft.h" + #if defined(FFT_SINGLE) + #define cufftExec cufftExecC2C + #define CUFFT_TYPE CUFFT_C2C + typedef cufftComplex FFT_DATA; + #else + #define cufftExec cufftExecZ2Z + #define CUFFT_TYPE CUFFT_Z2Z + typedef cufftDoubleComplex FFT_DATA; + #endif +#else + #include "kissfft_kokkos.h" + #ifndef FFT_KISSFFT + #define FFT_KISSFFT + #endif +#endif + +namespace LAMMPS_NS { + +// ------------------------------------------------------------------------- + +// plan for how to perform a 3d FFT + +template +struct fft_plan_3d_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + + struct remap_plan_3d_kokkos *pre_plan; // remap from input -> 1st FFTs + struct remap_plan_3d_kokkos *mid1_plan; // remap from 1st -> 2nd FFTs + struct remap_plan_3d_kokkos *mid2_plan; // remap from 2nd -> 3rd FFTs + struct remap_plan_3d_kokkos *post_plan; // remap from 3rd FFTs -> output + typename AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed) + typename AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps + int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length) + int length1,length2,length3; // length of 1st,2nd,3rd FFTs + int pre_target; // where to put remap results + int mid1_target,mid2_target; + int scaled; // whether to scale FFT results + int normnum; // # of values to rescale + double norm; // normalization factor for rescaling + + // system specific 1d FFT info +#if defined(FFT_MKL) + DFTI_DESCRIPTOR *handle_fast; + DFTI_DESCRIPTOR *handle_mid; + DFTI_DESCRIPTOR *handle_slow; +#elif defined(FFT_FFTW3) + FFTW_API(plan) plan_fast_forward; + FFTW_API(plan) plan_fast_backward; + FFTW_API(plan) plan_mid_forward; + FFTW_API(plan) plan_mid_backward; + FFTW_API(plan) plan_slow_forward; + FFTW_API(plan) plan_slow_backward; +#elif defined(FFT_CUFFT) + cufftHandle plan_fast; + cufftHandle plan_mid; + cufftHandle plan_slow; +#else + kiss_fft_state_kokkos cfg_fast_forward; + kiss_fft_state_kokkos cfg_fast_backward; + kiss_fft_state_kokkos cfg_mid_forward; + kiss_fft_state_kokkos cfg_mid_backward; + kiss_fft_state_kokkos cfg_slow_forward; + kiss_fft_state_kokkos cfg_slow_backward; +#endif +}; + +template +class FFT3dKokkos : protected Pointers { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + FFT3dKokkos(class LAMMPS *, MPI_Comm, + int,int,int,int,int,int,int,int,int,int,int,int,int,int,int, + int,int,int *,int); + ~FFT3dKokkos(); + void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int); + void timing1d(typename AT::t_FFT_SCALAR_1d, int, int); + + private: + struct fft_plan_3d_kokkos *plan; + RemapKokkos *remapKK; + +#ifdef FFT_KISSFFT + KissFFTKokkos *kissfftKK; +#endif + + void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos *); + + struct fft_plan_3d_kokkos *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, + int, int, int, int, int, + int, int, int, int, int, int, int, + int, int, int *, int, int); + + void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); + + void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos *); + + void bifactor(int, int *, int *); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Could not create 3d FFT plan + +The FFT setup for the PPPM solver failed, typically due +to lack of memory. This is an unusual error. Check the +size of the FFT grid you are requesting. + +E: Cannot use the FFTW library with Kokkos CUDA on GPUs + +Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for +PPPM on GPUs. + +E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs + +Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs +for PPPM on the host CPUs, use KISS FFT instead. + +*/ diff --git a/src/KOKKOS/fftdata_kokkos.h b/src/KOKKOS/fftdata_kokkos.h new file mode 100644 index 0000000000..262b963802 --- /dev/null +++ b/src/KOKKOS/fftdata_kokkos.h @@ -0,0 +1,46 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +// data types for 2d/3d FFTs + +#ifndef FFT_DATA_KOKKOS_H +#define FFT_DATA_KOKKOS_H + +// User-settable FFT precision + +// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag) +// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag) + +#ifdef FFT_SINGLE +#define FFT_PRECISION 1 +#define MPI_FFT_SCALAR MPI_FLOAT +typedef float FFT_SCALAR; +#else +#define FFT_PRECISION 2 +#define MPI_FFT_SCALAR MPI_DOUBLE +typedef double FFT_SCALAR; +#endif + +// ------------------------------------------------------------------------- + +// Data types for single-precision complex + +#if FFT_PRECISION == 1 +#elif FFT_PRECISION == 2 +#else +#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)" +#endif + +#endif diff --git a/src/KOKKOS/kissfft_kokkos.cpp b/src/KOKKOS/kissfft_kokkos.cpp new file mode 100644 index 0000000000..a9e68fe723 --- /dev/null +++ b/src/KOKKOS/kissfft_kokkos.cpp @@ -0,0 +1,20 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + + +namespace LAMMPS_NS { +template class KissFFTKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class KissFFTKokkos; +#endif +} diff --git a/src/KOKKOS/kissfft_kokkos.h b/src/KOKKOS/kissfft_kokkos.h new file mode 100644 index 0000000000..ca11c6e6a5 --- /dev/null +++ b/src/KOKKOS/kissfft_kokkos.h @@ -0,0 +1,550 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* + we use a stripped down KISS FFT as default FFT for LAMMPS + this code is adapted from kiss_fft_v1_2_9 + homepage: http://kissfft.sf.net/ + + changes 2008-2011 by Axel Kohlmeyer + + KISS FFT ported to Kokkos by Stan Moore (SNL) +*/ + +/* + Copyright (c) 2003-2010, Mark Borgerding + + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the author nor the names of any contributors may be used + to endorse or promote products derived from this software without + specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#ifndef LMP_KISSFFT_KOKKOS_H +#define LMP_KISSFFT_KOKKOS_H + +#include +#include +#include +#include "kissfft_kokkos.h" +#include "fftdata_kokkos.h" +#include "kokkos_type.h" + +#ifndef M_PI +#define M_PI 3.141592653589793238462643383279502884197169399375105820974944 +#endif + +/* + Explanation of macros dealing with complex math: + + C_MUL(m,a,b) : m = a*b + C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise + C_SUB( res, a,b) : res = a - b + C_SUBFROM( res , a) : res -= a + C_ADDTO( res , a) : res += a + C_EQ( res , a) : res = a +*/ + +#define S_MUL(a,b) ( (a)*(b) ) + +#define C_MUL(m,a,a_index,b,b_index) \ + do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\ + (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0) + +/* +#define C_FIXDIV(c,div) // NOOP + +#define C_MULBYSCALAR( c, s ) \ + do{ (c)[0] *= (s);\ + (c)[1] *= (s); }while(0) + +#define C_ADD( res, a,b)\ + do { \ + (res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \ + }while(0) + +#define C_SUB( res, a,b)\ + do { \ + (res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \ + }while(0) + +#define C_ADDTO( res , a)\ + do { \ + (res)[0] += (a)[0]; (res)[1] += (a)[1];\ + }while(0) + +#define C_SUBFROM( res , a)\ + do {\ + (res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \ + }while(0) + +#define C_EQ(res, a)\ + do {\ + (res)[0] = (a)[0]; (res)[1] = (a)[1]; \ + }while(0) +*/ + +#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase) +#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase) +#define HALF_OF(x) ((x)*.5) + +#define kf_cexp(x,x_index,phase) \ + do{ \ + (x)(x_index,0) = KISS_FFT_COS(phase);\ + (x)(x_index,1) = KISS_FFT_SIN(phase);\ + }while(0) + + +namespace LAMMPS_NS { + +#define MAXFACTORS 32 +/* e.g. an fft of length 128 has 4 factors + as far as kissfft is concerned: 4*4*4*2 */ +template +struct kiss_fft_state_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + int nfft; + int inverse; + typename AT::t_int_64 d_factors; + typename AT::t_FFT_DATA_1d d_twiddles; + typename AT::t_FFT_DATA_1d d_scratch; +}; + +template +class KissFFTKokkos { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + + KOKKOS_INLINE_FUNCTION + static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) + { + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Fout2_count; + int tw1_count = 0; + + Fout2_count = Fout_count + m; + do { + //C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2); + + C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count); + tw1_count += fstride; + //C_SUB(*Fout2,*Fout,t); + d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0]; + d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1]; + //C_ADDTO(d_Fout[Fout_count],t); + d_Fout(Fout_count,0) += t[0]; + d_Fout(Fout_count,1) += t[1]; + ++Fout2_count; + ++Fout_count; + } while(--m); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, const size_t m, int Fout_count) + { + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[6][2]; + size_t k=m; + const size_t m2=2*m; + const size_t m3=3*m; + + int tw3_count,tw2_count,tw1_count; + tw3_count = tw2_count = tw1_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4); + + C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count); + + //C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] ); + scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0]; + scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1]; + //C_ADDTO(d_Fout[Fout_count], scratch[1]); + d_Fout(Fout_count,0) += scratch[1][0]; + d_Fout(Fout_count,1) += scratch[1][1]; + //C_ADD(scratch[3],scratch[0],scratch[2]); + scratch[3][0] = scratch[0][0] + scratch[2][0]; + scratch[3][1] = scratch[0][1] + scratch[2][1]; + //C_SUB( scratch[4] , scratch[0] , scratch[2] ); + scratch[4][0] = scratch[0][0] - scratch[2][0]; + scratch[4][1] = scratch[0][1] - scratch[2][1]; + //C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1]; + + tw1_count += fstride; + tw2_count += fstride*2; + tw3_count += fstride*3; + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + if (st.inverse) { + d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0]; + } else{ + d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1]; + d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0]; + d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1]; + d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0]; + } + ++Fout_count; + } while(--k); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, size_t m, int Fout_count) + { + size_t k=m; + const size_t m2 = 2*m; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR scratch[5][2]; + FFT_SCALAR epi3[2]; + //C_EQ(epi3,d_twiddles[fstride*m]); + epi3[0] = d_twiddles(fstride*m,0); + epi3[1] = d_twiddles(fstride*m,1); + + int tw1_count,tw2_count; + tw1_count = tw2_count = 0; + + do { + //C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3); + + C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count); + C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count); + + //C_ADD(scratch[3],scratch[1],scratch[2]); + scratch[3][0] = scratch[1][0] + scratch[2][0]; + scratch[3][1] = scratch[1][1] + scratch[2][1]; + //C_SUB(scratch[0],scratch[1],scratch[2]); + scratch[0][0] = scratch[1][0] - scratch[2][0]; + scratch[0][1] = scratch[1][1] - scratch[2][1]; + tw1_count += fstride; + tw2_count += fstride*2; + + d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]); + d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]); + + //C_MULBYSCALAR(scratch[0],epi3[1]); + scratch[0][0] *= epi3[1]; + scratch[0][1] *= epi3[1]; + + //C_ADDTO(d_Fout[Fout_count],scratch[3]); + d_Fout(Fout_count,0) += scratch[3][0]; + d_Fout(Fout_count,1) += scratch[3][1]; + + d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1]; + d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0]; + + d_Fout(Fout_count + m,0) -= scratch[0][1]; + d_Fout(Fout_count + m,1) += scratch[0][0]; + + ++Fout_count; + } while(--k); + } + + KOKKOS_INLINE_FUNCTION + static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride, + const kiss_fft_state_kokkos &st, int m, int Fout_count) + { + int u; + FFT_SCALAR scratch[13][2]; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR ya[2],yb[2]; + //C_EQ(ya,d_twiddles[fstride*m]); + ya[1] = d_twiddles(fstride*m,1); + ya[0] = d_twiddles(fstride*m,0); + //C_EQ(yb,d_twiddles[fstride*2*m]); + yb[1] = d_twiddles(fstride*2*m,1); + yb[0] = d_twiddles(fstride*2*m,0); + + int Fout0_count=Fout_count; + int Fout1_count=Fout0_count+m; + int Fout2_count=Fout0_count+2*m; + int Fout3_count=Fout0_count+3*m; + int Fout4_count=Fout0_count+4*m; + + for ( u=0; u &st, int m, int p, int Fout_count) + { + int u,k,q1,q; + typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles; + FFT_SCALAR t[2]; + int Norig = st.nfft; + + typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch; + for ( u=0; u=Norig) twidx-=Norig; + C_MUL(t,d_scratch,q,d_twiddles,twidx); + //C_ADDTO(d_Fout[k],t); + d_Fout(Fout_count + k,0) += t[0]; + d_Fout(Fout_count + k,1) += t[1]; + } + k += m; + } + } + } + + KOKKOS_INLINE_FUNCTION + static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f, + const size_t fstride, int in_stride, + const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos &st, int Fout_count, int f_count, int factors_count) + { + const int beg = Fout_count; + const int p = d_factors[factors_count++]; /* the radix */ + const int m = d_factors[factors_count++]; /* stage's fft length/p */ + const int end = Fout_count + p*m; + + if (m == 1) { + do { + //C_EQ(d_Fout[Fout_count],d_f[f_count]); + d_Fout(Fout_count,0) = d_f(f_count,0); + d_Fout(Fout_count,1) = d_f(f_count,1); + f_count += fstride*in_stride; + } while (++Fout_count != end); + } else { + do { + /* recursive call: + DFT of size m*p performed by doing + p instances of smaller DFTs of size m, + each one takes a decimated version of the input */ + kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count); + f_count += fstride*in_stride; + } while( (Fout_count += m) != end); + } + + Fout_count=beg; + + /* recombine the p smaller DFTs */ + switch (p) { + case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break; + case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break; + case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break; + case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break; + default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break; + } + } + + /* facbuf is populated by p1,m1,p2,m2, ... + where + p[i] * m[i] = m[i-1] + m0 = n */ + + static int kf_factor(int n, HAT::t_int_64 h_facbuf) + { + int p=4, nf=0; + double floor_sqrt; + floor_sqrt = floor( sqrt((double)n) ); + int facbuf_count = 0; + int p_max = 0; + + /* factor out the remaining powers of 4, powers of 2, + and then any other remaining primes */ + do { + if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */ + while (n % p) { + switch (p) { + case 4: p = 2; break; + case 2: p = 3; break; + default: p += 2; break; + } + if (p > floor_sqrt) + p = n; /* no more factors, skip to end */ + } + n /= p; + h_facbuf[facbuf_count++] = p; + h_facbuf[facbuf_count++] = n; + p_max = MAX(p,p_max); + ++nf; + } while (n > 1); + return p_max; + } + + /* + * User-callable function to allocate all necessary storage space for the fft. + * + * The return value is a contiguous block of memory, allocated with malloc. As such, + * It can be freed with free(), rather than a kiss_fft-specific function. + */ + + static kiss_fft_state_kokkos kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem) + { + kiss_fft_state_kokkos st; + int i; + st.nfft = nfft; + st.inverse = inverse_fft; + + typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64(); + typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d(); + + if (nfft > 0) { + k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2); + k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft); + + for (i=0;i(); + k_factors.template sync(); + st.d_factors = k_factors.template view(); + + k_twiddles.template modify(); + k_twiddles.template sync(); + st.d_twiddles = k_twiddles.template view(); + + return st; + } + + KOKKOS_INLINE_FUNCTION + static void kiss_fft_stride(const kiss_fft_state_kokkos &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset) + { + //if (d_fin.data() == d_fout.data()) { + // // NOTE: this is not really an in-place FFT algorithm. + // // It just performs an out-of-place FFT into a temp buffer + // typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1)); + // kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + // Kokkos::deep_copy(d_fout,d_tmpbuf); + //} else { + kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0); + //} + } + + KOKKOS_INLINE_FUNCTION + static void kiss_fft_kokkos(const kiss_fft_state_kokkos &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset) + { + kiss_fft_stride(cfg,d_fin,d_fout,1,offset); + } + +}; + +} +#endif diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index e900bbd6d1..3ead8417b9 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -174,20 +174,6 @@ t_scalar3 operator * return t_scalar3(a.x*b,a.y*b,a.z*b); } -#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__) - struct double2 { - double x, y; - }; - struct float2 { - float x, y; - }; - struct float4 { - float x, y, z, w; - }; - struct double4 { - double x, y, z, w; - }; -#endif // set LMPHostype and LMPDeviceType from Kokkos Default Types typedef Kokkos::DefaultExecutionSpace LMPDeviceType; typedef Kokkos::HostSpace::execution_space LMPHostType; @@ -310,14 +296,8 @@ public: #endif #if PRECISION==1 typedef float LMP_FLOAT; -typedef float2 LMP_FLOAT2; -typedef lmp_float3 LMP_FLOAT3; -typedef float4 LMP_FLOAT4; #else typedef double LMP_FLOAT; -typedef double2 LMP_FLOAT2; -typedef lmp_double3 LMP_FLOAT3; -typedef double4 LMP_FLOAT4; #endif #ifndef PREC_FORCE @@ -326,14 +306,8 @@ typedef double4 LMP_FLOAT4; #if PREC_FORCE==1 typedef float F_FLOAT; -typedef float2 F_FLOAT2; -typedef lmp_float3 F_FLOAT3; -typedef float4 F_FLOAT4; #else typedef double F_FLOAT; -typedef double2 F_FLOAT2; -typedef lmp_double3 F_FLOAT3; -typedef double4 F_FLOAT4; #endif #ifndef PREC_ENERGY @@ -342,12 +316,8 @@ typedef double4 F_FLOAT4; #if PREC_ENERGY==1 typedef float E_FLOAT; -typedef float2 E_FLOAT2; -typedef float4 E_FLOAT4; #else typedef double E_FLOAT; -typedef double2 E_FLOAT2; -typedef double4 E_FLOAT4; #endif struct s_EV_FLOAT { @@ -500,12 +470,8 @@ typedef struct s_FEV_FLOAT FEV_FLOAT; #if PREC_POS==1 typedef float X_FLOAT; -typedef float2 X_FLOAT2; -typedef float4 X_FLOAT4; #else typedef double X_FLOAT; -typedef double2 X_FLOAT2; -typedef double4 X_FLOAT4; #endif #ifndef PREC_VELOCITIES @@ -514,22 +480,14 @@ typedef double4 X_FLOAT4; #if PREC_VELOCITIES==1 typedef float V_FLOAT; -typedef float2 V_FLOAT2; -typedef float4 V_FLOAT4; #else typedef double V_FLOAT; -typedef double2 V_FLOAT2; -typedef double4 V_FLOAT4; #endif #if PREC_KSPACE==1 typedef float K_FLOAT; -typedef float2 K_FLOAT2; -typedef float4 K_FLOAT4; #else typedef double K_FLOAT; -typedef double2 K_FLOAT2; -typedef double4 K_FLOAT4; #endif typedef int T_INT; diff --git a/src/KOKKOS/pack_kokkos.h b/src/KOKKOS/pack_kokkos.h new file mode 100644 index 0000000000..e0a11ff4e0 --- /dev/null +++ b/src/KOKKOS/pack_kokkos.h @@ -0,0 +1,474 @@ +/* ---------------------------------------------------------------------- + SPARTA - Stochastic PArallel Rarefied-gas Time-accurate Analyzer + http://sparta.sandia.gov + Steve Plimpton, sjplimp@sandia.gov, Michael Gallis, magalli@sandia.gov + Sandia National Laboratories + + Copyright (2014) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level SPARTA directory. +------------------------------------------------------------------------- */ + +// loop counters for doing a pack/unpack + +#include "pack.h" + +/* ---------------------------------------------------------------------- + Pack and unpack functions: + + pack routines copy strided values from data into contiguous locs in buf + unpack routines copy contiguous values from buf into strided locs in data + different versions of unpack depending on permutation + and # of values/element +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + pack from data -> buf +------------------------------------------------------------------------- */ + +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +template +class PackKokkos { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + +struct pack_3d_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int in_orig = slow*nmid*nfast; + const int plane = slow*nstride_plane; + const int out_orig = plane + mid*nstride_line; + const int in = in_orig + mid*nfast + fast; + const int out = out_orig + fast; + d_buf[buf_offset + in] = d_data[data_offset + out]; + } +}; + +static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data +------------------------------------------------------------------------- */ + +struct unpack_3d_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; + const int plane = slow*nstride_plane; + const int in_orig = plane + mid*nstride_line; + const int in = in_orig + fast; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + } +}; + +static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, 1 value/element +------------------------------------------------------------------------- */ + + +struct unpack_3d_permute1_1_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; + const int plane = slow*nstride_line; + const int in_orig = plane + mid; + const int in = in_orig + fast*nstride_plane; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + } +}; + +static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, 2 values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute1_2_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*2; + const int plane = slow*nstride_line; + const int in_orig = plane + 2*mid; + const int in = in_orig + fast*nstride_plane; + const int out = out_orig + 2*mid*nfast + 2*fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + d_data[data_offset + in+1] = d_buf[buf_offset + out+1]; + } +}; + +static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, one axis permutation, nqty values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute1_n_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + int nqty; // # of values/element + + unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + nqty = plan->nqty ; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*nqty; + const int plane = slow*nstride_line; + const int instart = plane + nqty*mid; + int in = instart + nqty*fast*nstride_plane; + int out = out_orig + nqty*mid*nfast + nqty*fast; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; + } +}; + +static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, 1 value/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_1_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast; + const int in_orig = slow + mid*nstride_plane; + const int in = in_orig + fast*nstride_line; + const int out = out_orig + mid*nfast + fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + } +}; + +static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, 2 values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_2_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + + unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*2; + const int in_orig = 2*slow + mid*nstride_plane; + const int in = in_orig + fast*nstride_line; + const int out = out_orig + 2*mid*nfast + 2*fast; + d_data[data_offset + in] = d_buf[buf_offset + out]; + d_data[data_offset + in+1] = d_buf[buf_offset + out+1]; + } +}; + +static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} +/* ---------------------------------------------------------------------- + unpack from buf -> data, two axis permutation, nqty values/element +------------------------------------------------------------------------- */ + +struct unpack_3d_permute2_n_functor { +public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d_um d_buf,d_data; + int buf_offset,data_offset; + int nfast; // # of elements in fast index + int nmid; // # of elements in mid index + int nslow; // # of elements in slow index + int nstride_line; // stride between successive mid indices + int nstride_plane; // stride between successive slow indices + int nqty; // # of values/element + + unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan): + d_buf(d_buf_), + d_data(d_data_) + { + buf_offset = buf_offset_ ; + data_offset = data_offset_ ; + nfast = plan->nfast ; + nmid = plan->nmid ; + nslow = plan->nslow ; + nstride_line = plan->nstride_line ; + nstride_plane = plan->nstride_plane; + nqty = plan->nqty ; + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int &index) const { + const int fast = index / (nslow * nmid); + const int index_new = index - (fast * nslow * nmid); + const int mid = index_new / nslow; + const int slow = index_new % nslow; + const int out_orig = slow*nmid*nfast*nqty; + const int instart = nqty*slow + mid*nstride_plane; + int in = instart + nqty*fast*nstride_line; + int out = out_orig + nqty*mid*nfast + nqty*fast; + for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++]; + } +}; + +static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan) +{ + const int nslow = plan->nslow; + const int nmid = plan->nmid; + const int nfast = plan->nfast; + unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan); + Kokkos::parallel_for(nslow*nmid*nfast,f); + DeviceType::fence(); +} + +}; + +} diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 0e7dbdb99b..36e48febe8 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -25,8 +25,8 @@ #include "force.h" #include "pair.h" #include "domain.h" -#include "fft3d_wrap.h" -#include "remap_wrap.h" +#include "fft3d_kokkos.h" +#include "remap_kokkos.h" #include "memory_kokkos.h" #include "error.h" #include "atom_masks.h" @@ -842,19 +842,19 @@ void PPPMKokkos::allocate() int tmp; - fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); - fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); - remap = new Remap(lmp,world, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + remap = new RemapKokkos(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); // create ghost grid object for rho and electric field communication @@ -1783,12 +1783,7 @@ void PPPMKokkos::brick2fft() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_inout),*this); copymode = 0; - k_density_fft.template modify(); - k_density_fft.template sync(); - remap->perform(density_fft,density_fft,work1); - k_density_fft.template modify(); - k_density_fft.template sync(); - + remap->perform(d_density_fft,d_density_fft,d_work1); } template @@ -1830,11 +1825,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work1.template modify(); - k_work1.template sync(); - fft1->compute(work1,work1,1); - k_work1.template modify(); - k_work1.template sync(); + fft1->compute(d_work1,d_work1,1); // global energy and virial contribution @@ -1898,11 +1889,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -1916,11 +1903,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -1933,11 +1916,7 @@ void PPPMKokkos::poisson_ik() Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_fft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2193,11 +2172,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2214,11 +2189,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2230,11 +2201,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2246,11 +2213,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2261,11 +2224,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2277,11 +2236,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -2293,11 +2248,7 @@ void PPPMKokkos::poisson_peratom() Kokkos::parallel_for(Kokkos::RangePolicy(0,nfft),*this); copymode = 0; - k_work2.template modify(); - k_work2.template sync(); - fft2->compute(work2,work2,-1); - k_work2.template modify(); - k_work2.template sync(); + fft2->compute(d_work2,d_work2,-1); copymode = 1; @@ -3037,10 +2988,10 @@ int PPPMKokkos::timing_1d(int n, double &time1d) time1 = MPI_Wtime(); for (int i = 0; i < n; i++) { - fft1->timing1d(work1,nfft_both,1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); + fft1->timing1d(d_work1,nfft_both,1); + fft2->timing1d(d_work1,nfft_both,-1); + fft2->timing1d(d_work1,nfft_both,-1); + fft2->timing1d(d_work1,nfft_both,-1); } MPI_Barrier(world); @@ -3074,10 +3025,10 @@ int PPPMKokkos::timing_3d(int n, double &time3d) time1 = MPI_Wtime(); for (int i = 0; i < n; i++) { - fft1->compute(work1,work1,1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); + fft1->compute(d_work1,d_work1,1); + fft2->compute(d_work1,d_work1,-1); + fft2->compute(d_work1,d_work1,-1); + fft2->compute(d_work1,d_work1,-1); } MPI_Barrier(world); diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h index 21896eb714..c855b1bda8 100644 --- a/src/KOKKOS/pppm_kokkos.h +++ b/src/KOKKOS/pppm_kokkos.h @@ -22,11 +22,31 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos) #ifndef LMP_PPPM_KOKKOS_H #define LMP_PPPM_KOKKOS_H -#include "pppm.h" #include "gridcomm_kokkos.h" +#include "remap_kokkos.h" +#include "fft3d_kokkos.h" #include "kokkos_base.h" #include "kokkos_type.h" +// fix up FFT defines for KOKKOS with CUDA + +#if defined(KOKKOS_ENABLE_CUDA) +# if defined(FFT_FFTW) +# undef FFT_FFTW +# endif +# if defined(FFT_FFTW3) +# undef FFT_FFTW3 +# endif +# if defined(FFT_MKL) +# undef FFT_MKL +# endif +# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT) +# define FFT_KISSFFT +# endif +#endif + +#include "pppm.h" + namespace LAMMPS_NS { struct TagPPPM_setup1{}; @@ -331,8 +351,8 @@ class PPPMKokkos : public PPPM, public KokkosBase { //double **acons; typename Kokkos::DualView::t_host acons; - class FFT3d *fft1,*fft2; - class Remap *remap; + FFT3dKokkos *fft1,*fft2; + RemapKokkos *remap; GridCommKokkos *cg; GridCommKokkos *cg_peratom; diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp new file mode 100644 index 0000000000..4d4300daf0 --- /dev/null +++ b/src/KOKKOS/remap_kokkos.cpp @@ -0,0 +1,511 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include "remap_kokkos.h" +#include "pack_kokkos.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) > (B) ? (A) : (B)) + +/* ---------------------------------------------------------------------- */ + +template +RemapKokkos::RemapKokkos(LAMMPS *lmp) : Pointers(lmp) +{ + plan = NULL; +} + +template +RemapKokkos::RemapKokkos(LAMMPS *lmp, MPI_Comm comm, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int nqty, int permute, int memory, + int precision, int usecollective) : Pointers(lmp) +{ + plan = remap_3d_create_plan_kokkos(comm, + in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, + out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, + nqty,permute,memory,precision,usecollective); + if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan"); +} + +/* ---------------------------------------------------------------------- */ + +template +RemapKokkos::~RemapKokkos() +{ + remap_3d_destroy_plan_kokkos(plan); +} + +/* ---------------------------------------------------------------------- */ + +template +void RemapKokkos::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf) +{ + remap_3d_kokkos(d_in,d_out,d_buf,plan); +} + + +/* ---------------------------------------------------------------------- + Data layout for 3d remaps: + + data set of Nfast x Nmid x Nslow elements is owned by P procs + each element = nqty contiguous datums + on input, each proc owns a subsection of the elements + on output, each proc will own a (presumably different) subsection + my subsection must not overlap with any other proc's subsection, + i.e. the union of all proc's input (or output) subsections must + exactly tile the global Nfast x Nmid x Nslow data set + when called from C, all subsection indices are + C-style from 0 to N-1 where N = Nfast or Nmid or Nslow + when called from F77, all subsection indices are + F77-style from 1 to N where N = Nfast or Nmid or Nslow + a proc can own 0 elements on input or output + by specifying hi index < lo index + on both input and output, data is stored contiguously on a processor + with a fast-varying, mid-varying, and slow-varying index +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Perform 3d remap + + Arguments: + in starting address of input data on this proc + out starting address of where output data for this proc + will be placed (can be same as in) + buf extra memory required for remap + if memory=0 was used in call to remap_3d_create_plan + then buf must be big enough to hold output result + i.e. nqty * (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * + (out_khi-out_klo+1) + if memory=1 was used in call to remap_3d_create_plan + then buf is not used, can just be a dummy pointer + plan plan returned by previous call to remap_3d_create_plan +------------------------------------------------------------------------- */ + +template +void RemapKokkos::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf, + struct remap_plan_3d_kokkos *plan) +{ + // collective flag not yet supported + + // use point-to-point communication + + int i,isend,irecv; + typename AT::t_FFT_SCALAR_1d d_scratch; + + if (plan->memory == 0) + d_scratch = d_buf; + else + d_scratch = plan->d_scratch; + + // post all recvs into scratch space + + for (irecv = 0; irecv < plan->nrecv; irecv++) { + FFT_SCALAR* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv]; + MPI_Irecv(scratch,plan->recv_size[irecv], + MPI_FFT_SCALAR,plan->recv_proc[irecv],0, + plan->comm,&plan->request[irecv]); + } + + // send all messages to other procs + + for (isend = 0; isend < plan->nsend; isend++) { + int in_offset = plan->send_offset[isend]; + plan->pack(d_in,in_offset, + plan->d_sendbuf,0,&plan->packplan[isend]); + MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR, + plan->send_proc[isend],0,plan->comm); + } + + // copy in -> scratch -> out for self data + + if (plan->self) { + isend = plan->nsend; + irecv = plan->nrecv; + + int in_offset = plan->send_offset[isend]; + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->pack(d_in,in_offset, + d_scratch,scratch_offset, + &plan->packplan[isend]); + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } + + // unpack all messages from scratch -> out + + for (i = 0; i < plan->nrecv; i++) { + MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE); + + int scratch_offset = plan->recv_bufloc[irecv]; + int out_offset = plan->recv_offset[irecv]; + + plan->unpack(d_scratch,scratch_offset, + d_out,out_offset,&plan->unpackplan[irecv]); + } +} + +/* ---------------------------------------------------------------------- + Create plan for performing a 3d remap + + Arguments: + comm MPI communicator for the P procs which own the data + in_ilo,in_ihi input bounds of data I own in fast index + in_jlo,in_jhi input bounds of data I own in mid index + in_klo,in_khi input bounds of data I own in slow index + out_ilo,out_ihi output bounds of data I own in fast index + out_jlo,out_jhi output bounds of data I own in mid index + out_klo,out_khi output bounds of data I own in slow index + nqty # of datums per element + permute permutation in storage order of indices on output + 0 = no permutation + 1 = permute once = mid->fast, slow->mid, fast->slow + 2 = permute twice = slow->fast, fast->mid, mid->slow + memory user provides buffer memory for remap or system does + 0 = user provides memory + 1 = system provides memory + precision precision of data + 1 = single precision (4 bytes per datum) + 2 = double precision (8 bytes per datum) + usecollective whether to use collective MPI or point-to-point +------------------------------------------------------------------------- */ + +template +struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_create_plan_kokkos( + MPI_Comm comm, + int in_ilo, int in_ihi, int in_jlo, int in_jhi, + int in_klo, int in_khi, + int out_ilo, int out_ihi, int out_jlo, int out_jhi, + int out_klo, int out_khi, + int nqty, int permute, int memory, int precision, int usecollective) +{ + + struct remap_plan_3d_kokkos *plan; + struct extent_3d *inarray, *outarray; + struct extent_3d in,out,overlap; + int i,iproc,nsend,nrecv,ibuf,size,me,nprocs; + + // query MPI info + + MPI_Comm_rank(comm,&me); + MPI_Comm_size(comm,&nprocs); + + // allocate memory for plan data struct + + plan = new struct remap_plan_3d_kokkos; + if (plan == NULL) return NULL; + plan->usecollective = usecollective; + + // store parameters in local data structs + + in.ilo = in_ilo; + in.ihi = in_ihi; + in.isize = in.ihi - in.ilo + 1; + + in.jlo = in_jlo; + in.jhi = in_jhi; + in.jsize = in.jhi - in.jlo + 1; + + in.klo = in_klo; + in.khi = in_khi; + in.ksize = in.khi - in.klo + 1; + + out.ilo = out_ilo; + out.ihi = out_ihi; + out.isize = out.ihi - out.ilo + 1; + + out.jlo = out_jlo; + out.jhi = out_jhi; + out.jsize = out.jhi - out.jlo + 1; + + out.klo = out_klo; + out.khi = out_khi; + out.ksize = out.khi - out.klo + 1; + + // combine output extents across all procs + + inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (inarray == NULL) return NULL; + + outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d)); + if (outarray == NULL) return NULL; + + MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE, + outarray,sizeof(struct extent_3d),MPI_BYTE,comm); + + // count send collides, including self + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + nsend += remap_3d_collide(&in,&outarray[iproc],&overlap); + } + + // malloc space for send info + + if (nsend) { + plan->pack = PackKokkos::pack_3d; + + plan->send_offset = (int *) malloc(nsend*sizeof(int)); + plan->send_size = (int *) malloc(nsend*sizeof(int)); + plan->send_proc = (int *) malloc(nsend*sizeof(int)); + plan->packplan = (struct pack_plan_3d *) + malloc(nsend*sizeof(struct pack_plan_3d)); + + if (plan->send_offset == NULL || plan->send_size == NULL || + plan->send_proc == NULL || plan->packplan == NULL) return NULL; + } + + // store send info, with self as last entry + + nsend = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&in,&outarray[iproc],&overlap)) { + plan->send_proc[nsend] = iproc; + plan->send_offset[nsend] = nqty * + ((overlap.klo-in.klo)*in.jsize*in.isize + + ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo)); + plan->packplan[nsend].nfast = nqty*overlap.isize; + plan->packplan[nsend].nmid = overlap.jsize; + plan->packplan[nsend].nslow = overlap.ksize; + plan->packplan[nsend].nstride_line = nqty*in.isize; + plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize; + plan->packplan[nsend].nqty = nqty; + plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + nsend++; + } + } + + // plan->nsend = # of sends not including self + + if (nsend && plan->send_proc[nsend-1] == me) { + if (plan->usecollective) // for collectives include self in nsend list + plan->nsend = nsend; + else + plan->nsend = nsend - 1; + } else + plan->nsend = nsend; + + // combine input extents across all procs + + MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE, + inarray,sizeof(struct extent_3d),MPI_BYTE,comm); + + // count recv collides, including self + + nrecv = 0; + iproc = me; + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap); + } + + // malloc space for recv info + + if (nrecv) { + if (permute == 0) + plan->unpack = PackKokkos::unpack_3d; + else if (permute == 1) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute1_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute1_2; + else + plan->unpack = PackKokkos::unpack_3d_permute1_n; + } + else if (permute == 2) { + if (nqty == 1) + plan->unpack = PackKokkos::unpack_3d_permute2_1; + else if (nqty == 2) + plan->unpack = PackKokkos::unpack_3d_permute2_2; + else + plan->unpack = PackKokkos::unpack_3d_permute2_n; + } + + plan->recv_offset = (int *) malloc(nrecv*sizeof(int)); + plan->recv_size = (int *) malloc(nrecv*sizeof(int)); + plan->recv_proc = (int *) malloc(nrecv*sizeof(int)); + plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int)); + plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request)); + plan->unpackplan = (struct pack_plan_3d *) + malloc(nrecv*sizeof(struct pack_plan_3d)); + + if (plan->recv_offset == NULL || plan->recv_size == NULL || + plan->recv_proc == NULL || plan->recv_bufloc == NULL || + plan->request == NULL || plan->unpackplan == NULL) return NULL; + } + + // store recv info, with self as last entry + + ibuf = 0; + nrecv = 0; + iproc = me; + + for (i = 0; i < nprocs; i++) { + iproc++; + if (iproc == nprocs) iproc = 0; + if (remap_3d_collide(&out,&inarray[iproc],&overlap)) { + plan->recv_proc[nrecv] = iproc; + plan->recv_bufloc[nrecv] = ibuf; + + if (permute == 0) { + plan->recv_offset[nrecv] = nqty * + ((overlap.klo-out.klo)*out.jsize*out.isize + + (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo)); + plan->unpackplan[nrecv].nfast = nqty*overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.isize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize; + plan->unpackplan[nrecv].nqty = nqty; + } + else if (permute == 1) { + plan->recv_offset[nrecv] = nqty * + ((overlap.ilo-out.ilo)*out.ksize*out.jsize + + (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.jsize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize; + plan->unpackplan[nrecv].nqty = nqty; + } + else { + plan->recv_offset[nrecv] = nqty * + ((overlap.jlo-out.jlo)*out.isize*out.ksize + + (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo)); + plan->unpackplan[nrecv].nfast = overlap.isize; + plan->unpackplan[nrecv].nmid = overlap.jsize; + plan->unpackplan[nrecv].nslow = overlap.ksize; + plan->unpackplan[nrecv].nstride_line = nqty*out.ksize; + plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize; + plan->unpackplan[nrecv].nqty = nqty; + } + + plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize; + ibuf += plan->recv_size[nrecv]; + nrecv++; + } + } + + // plan->nrecv = # of recvs not including self + // for collectives include self in the nsend list + + if (nrecv && plan->recv_proc[nrecv-1] == me) { + if (plan->usecollective) plan->nrecv = nrecv; + else plan->nrecv = nrecv - 1; + } else plan->nrecv = nrecv; + + // init remaining fields in remap plan + + plan->memory = memory; + + if (nrecv == plan->nrecv) plan->self = 0; + else plan->self = 1; + + // free locally malloced space + + free(inarray); + free(outarray); + + // find biggest send message (not including self) and malloc space for it + + size = 0; + for (nsend = 0; nsend < plan->nsend; nsend++) + size = MAX(size,plan->send_size[nsend]); + + if (size) { + plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size); + if (!plan->d_sendbuf.data()) return NULL; + } + + // if requested, allocate internal scratch space for recvs, + // only need it if I will receive any data (including self) + + if (memory == 1) { + if (nrecv > 0) { + plan->d_scratch = + typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize); + if (!plan->d_scratch.data()) return NULL; + } + } + + // not using collective - dup comm + + MPI_Comm_dup(comm,&plan->comm); + + // return pointer to plan + + return plan; +} + +/* ---------------------------------------------------------------------- + Destroy a 3d remap plan +------------------------------------------------------------------------- */ + +template +void RemapKokkos::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *plan) +{ + if (plan == NULL) return; + + // free MPI communicator + + if (!((plan->usecollective) && (plan->commringlen == 0))) + MPI_Comm_free(&plan->comm); + + // free internal arrays + + if (plan->nsend || plan->self) { + free(plan->send_offset); + free(plan->send_size); + free(plan->send_proc); + free(plan->packplan); + } + + if (plan->nrecv || plan->self) { + free(plan->recv_offset); + free(plan->recv_size); + free(plan->recv_proc); + free(plan->recv_bufloc); + free(plan->request); + free(plan->unpackplan); + } + + // free plan itself + + delete plan; +} + +namespace LAMMPS_NS { +template class RemapKokkos; +#ifdef KOKKOS_HAVE_CUDA +template class RemapKokkos; +#endif +} diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h new file mode 100644 index 0000000000..f77ce9fb72 --- /dev/null +++ b/src/KOKKOS/remap_kokkos.h @@ -0,0 +1,88 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_REMAP_KOKKOS_H +#define LMP_REMAP_KOKKOS_H + +#include "pointers.h" +#include +#include "fftdata_kokkos.h" +#include "remap.h" +#include "kokkos_type.h" + +namespace LAMMPS_NS { + +// details of how to do a 3d remap + +template +struct remap_plan_3d_kokkos { + typedef DeviceType device_type; + typedef ArrayTypes AT; + typename AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends + typename AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs + void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); + // which pack function to use + void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); + // which unpack function to use + int *send_offset; // extraction loc for each send + int *send_size; // size of each send message + int *send_proc; // proc to send each message to + struct pack_plan_3d *packplan; // pack plan for each send message + int *recv_offset; // insertion loc for each recv + int *recv_size; // size of each recv message + int *recv_proc; // proc to recv each message from + int *recv_bufloc; // offset in scratch buf for each recv + MPI_Request *request; // MPI request for each posted recv + struct pack_plan_3d *unpackplan; // unpack plan for each recv message + int nrecv; // # of recvs from other procs + int nsend; // # of sends to other procs + int self; // whether I send/recv with myself + int memory; // user provides scratch space or not + MPI_Comm comm; // group of procs performing remap + int usecollective; // use collective or point-to-point MPI + int commringlen; // length of commringlist + int *commringlist; // ranks on communication ring of this plan +}; + +template +class RemapKokkos : protected Pointers { + public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + RemapKokkos(class LAMMPS *); + RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, + int,int,int,int,int,int,int,int,int,int,int); + ~RemapKokkos(); + void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d); + + struct remap_plan_3d_kokkos *plan; + + void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos *); + struct remap_plan_3d_kokkos *remap_3d_create_plan_kokkos(MPI_Comm, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int); + void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Could not create 3d remap plan + +The FFT setup in pppm failed. + +*/ diff --git a/src/KSPACE/fft3d.cpp b/src/KSPACE/fft3d.cpp index e4c38a57f5..c9cf8df172 100644 --- a/src/KSPACE/fft3d.cpp +++ b/src/KSPACE/fft3d.cpp @@ -25,6 +25,9 @@ #include #include #include "remap.h" +#if defined(_OPENMP) +#include +#endif #ifdef FFT_KISS /* include kissfft implementation */ @@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan( int scaled, int permute, int *nbuf, int usecollective) { struct fft_plan_3d *plan; - int me,nprocs; + int me,nprocs,nthreads; int flag,remapflag; int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi; int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi; @@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan( MPI_Comm_rank(comm,&me); MPI_Comm_size(comm,&nprocs); +#if defined(_OPENMP) + // query OpenMP info. + // should have been initialized systemwide in Comm class constructor + nthreads = omp_get_max_threads(); +#else + nthreads = 1; +#endif + // compute division of procs in 2 dimensions not on-processor bifactor(nprocs,&np1,&np2); @@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan( */ #elif defined(FFT_FFTW3) +#if defined(FFT_FFTW_THREADS) + if (nthreads > 1) { + FFTW_API(init_threads)(); + FFTW_API(plan_with_nthreads)(nthreads); + } +#endif + plan->plan_fast_forward = FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1, NULL,&nfast,1,plan->length1, @@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan) FFTW_API(destroy_plan)(plan->plan_mid_backward); FFTW_API(destroy_plan)(plan->plan_fast_forward); FFTW_API(destroy_plan)(plan->plan_fast_backward); +#if defined(FFT_FFTW_THREADS) + FFTW_API(cleanup_threads)(); +#endif #else if (plan->cfg_slow_forward != plan->cfg_fast_forward && plan->cfg_slow_forward != plan->cfg_mid_forward) { diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h index 11b98076d9..f73fb49ad3 100644 --- a/src/KSPACE/pppm.h +++ b/src/KSPACE/pppm.h @@ -26,6 +26,8 @@ KSpaceStyle(pppm,PPPM) #define LMP_FFT_LIB "FFTW3" #elif defined(FFT_MKL) #define LMP_FFT_LIB "MKL FFT" +#elif defined(FFT_CUFFT) +#define LMP_FFT_LIB "cuFFT" #else #define LMP_FFT_LIB "KISS FFT" #endif diff --git a/src/MAKE/MACHINES/Makefile.lassen_kokkos b/src/MAKE/MACHINES/Makefile.lassen_kokkos index 23697bfea2..7484f25a4b 100644 --- a/src/MAKE/MACHINES/Makefile.lassen_kokkos +++ b/src/MAKE/MACHINES/Makefile.lassen_kokkos @@ -1,4 +1,4 @@ -# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1 +# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc SHELL = /bin/sh @@ -44,9 +44,12 @@ LMP_INC = -DLAMMPS_GZIP # PATH = path for MPI library # LIB = name of MPI library -MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include +MY_MPI_EXE = $(shell which mpicxx) +MY_MPI_PATH = $(dir ${MY_MPI_EXE}) + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I${MY_MPI_PATH}../include MPI_PATH = -MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm +MPI_LIB = -L${MY_MPI_PATH}../lib -lmpi_ibm # FFT library # see discussion in Section 2.2 (step 6) of manaul @@ -55,9 +58,9 @@ MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3. # PATH = path for FFT library # LIB = name of FFT library -FFT_INC = +FFT_INC = -DFFT_CUFFT FFT_PATH = -FFT_LIB = +FFT_LIB = -lcufft # JPEG and/or PNG library # see discussion in Section 2.2 (step 7) of manual diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi index d6c5c566aa..ba34bc25eb 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi +++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi @@ -55,9 +55,9 @@ MPI_LIB = # PATH = path for FFT library # LIB = name of FFT library -FFT_INC = +FFT_INC = -DFFT_CUFFT FFT_PATH = -FFT_LIB = +FFT_LIB = -lcufft # JPEG and/or PNG library # see discussion in Section 2.2 (step 7) of manual diff --git a/src/version.h b/src/version.h index 1c5c4c6206..a47eeef1c1 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "09 Jan 2020" +#define LAMMPS_VERSION "24 Jan 2020"