Merge branch 'master' into remove-txt-files
# Conflicts: # doc/src/Build_settings.rst
This commit is contained in:
@ -739,4 +739,20 @@ if(PKG_KSPACE)
|
||||
else()
|
||||
message(STATUS "Using double precision FFTs")
|
||||
endif()
|
||||
if(FFT_FFTW_THREADS)
|
||||
message(STATUS "Using threaded FFTs")
|
||||
else()
|
||||
message(STATUS "Using non-threaded FFTs")
|
||||
endif()
|
||||
if(PKG_KOKKOS)
|
||||
if(KOKKOS_ENABLE_CUDA)
|
||||
if (${FFT} STREQUAL "KISS")
|
||||
message(STATUS "Kokkos FFT: KISS")
|
||||
else()
|
||||
message(STATUS "Kokkos FFT: cuFFT")
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Kokkos FFT: ${FFT}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -1,20 +1,22 @@
|
||||
# - Find fftw3
|
||||
# Find the native FFTW3 headers and libraries.
|
||||
# Find the native double precision FFTW3 headers and libraries.
|
||||
#
|
||||
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
|
||||
# FFTW3_LIBRARIES - List of libraries when using fftw3.
|
||||
# FFTW3_FOUND - True if fftw3 found.
|
||||
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
|
||||
# FFTW3_LIBRARIES - List of libraries when using fftw3.
|
||||
# FFTW3_OMP_LIBRARIES - List of libraries when using fftw3.
|
||||
# FFTW3_FOUND - True if fftw3 found.
|
||||
#
|
||||
|
||||
find_package(PkgConfig)
|
||||
|
||||
pkg_check_modules(PC_FFTW3 fftw3)
|
||||
find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
|
||||
|
||||
find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
|
||||
find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS})
|
||||
|
||||
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
|
||||
set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
|
||||
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
|
||||
set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
|
||||
@ -22,4 +24,4 @@ include(FindPackageHandleStandardArgs)
|
||||
|
||||
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )
|
||||
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY)
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
# - Find fftw3f
|
||||
# Find the native FFTW3F headers and libraries.
|
||||
# Find the native single precision FFTW3 headers and libraries.
|
||||
#
|
||||
# FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc.
|
||||
# FFTW3F_LIBRARIES - List of libraries when using fftw3f.
|
||||
# FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3.
|
||||
# FFTW3F_FOUND - True if fftw3f found.
|
||||
#
|
||||
|
||||
@ -10,11 +10,12 @@ find_package(PkgConfig)
|
||||
|
||||
pkg_check_modules(PC_FFTW3F fftw3f)
|
||||
find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS})
|
||||
|
||||
find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS})
|
||||
find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS})
|
||||
|
||||
set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
|
||||
set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR})
|
||||
set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
|
||||
set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
# handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE
|
||||
@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs)
|
||||
|
||||
find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR)
|
||||
|
||||
mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY )
|
||||
mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY)
|
||||
|
||||
@ -10,7 +10,7 @@ if(PKG_KOKKOS)
|
||||
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
|
||||
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
|
||||
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
|
||||
|
||||
|
||||
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
|
||||
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
|
||||
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
|
||||
@ -39,7 +39,15 @@ if(PKG_KOKKOS)
|
||||
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
|
||||
|
||||
if(PKG_KSPACE)
|
||||
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp)
|
||||
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp
|
||||
${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp)
|
||||
if(KOKKOS_ENABLE_CUDA)
|
||||
if(NOT ${FFT} STREQUAL "KISS")
|
||||
add_definitions(-DFFT_CUFFT)
|
||||
list(APPEND LAMMPS_LINK_LIBS cufft)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
|
||||
|
||||
@ -21,6 +21,20 @@ if(PKG_KSPACE)
|
||||
add_definitions(-DFFT_FFTW3)
|
||||
include_directories(${${FFTW}_INCLUDE_DIRS})
|
||||
list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES})
|
||||
if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
|
||||
option(FFT_FFTW_THREADS "Use threaded FFT library" ON)
|
||||
else()
|
||||
option(FFT_FFTW_THREADS "Use threaded FFT library" OFF)
|
||||
endif()
|
||||
|
||||
if(FFT_FFTW_THREADS)
|
||||
if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
|
||||
add_definitions(-DFFT_FFTW_THREADS)
|
||||
list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES})
|
||||
else()
|
||||
message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS")
|
||||
endif()
|
||||
endif()
|
||||
elseif(FFT STREQUAL "MKL")
|
||||
find_package(MKL REQUIRED)
|
||||
add_definitions(-DFFT_MKL)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
.TH LAMMPS "9 January 2020" "2020-01-09"
|
||||
.TH LAMMPS "24 January 2020" "2020-01-24"
|
||||
.SH NAME
|
||||
.B LAMMPS
|
||||
\- Molecular Dynamics Simulator.
|
||||
|
||||
@ -350,10 +350,12 @@ For NVIDIA GPUs using CUDA:
|
||||
KOKKOS_DEVICES = Cuda
|
||||
KOKKOS_ARCH = archCPU,archGPU # archCPU = CPU from list above that is hosting the GPU
|
||||
# archGPU = GPU from list above
|
||||
FFT_INC = -DFFT_CUFFT # enable use of cuFFT (optional)
|
||||
FFT_LIB = -lcufft # link to cuFFT library
|
||||
|
||||
For GPUs, you also need these 2 lines in your Makefile.machine before
|
||||
the CC line is defined, in this case for use with OpenMPI mpicxx. The
|
||||
2 lines define a nvcc wrapper compiler, which will use nvcc for
|
||||
For GPUs, you also need the following 2 lines in your Makefile.machine
|
||||
before the CC line is defined, in this case for use with OpenMPI mpicxx.
|
||||
The 2 lines define a nvcc wrapper compiler, which will use nvcc for
|
||||
compiling CUDA files and use a C++ compiler for non-Kokkos, non-CUDA
|
||||
files.
|
||||
|
||||
|
||||
@ -49,13 +49,19 @@ through the CMAKE\_CXX\_FLAGS variable. Example for CentOS 7:
|
||||
|
||||
-D CMAKE_CXX_FLAGS="-O3 -g -fopenmp -DNDEBUG -std=c++11"
|
||||
|
||||
**Makefile.machine setting**\ :
|
||||
**Makefile.machine setting**\ to bypass the C++11 test and compile in C++98 mode:
|
||||
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
LMP_INC = -DLAMMPS_CXX98
|
||||
|
||||
**Makefile.machine setting**\ to enable the C++11 with older (but not too old) GNU c++ (e.g. on CentOS 7):
|
||||
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
CCFLAGS = -g -O3 -std=c++11
|
||||
|
||||
----------
|
||||
|
||||
@ -86,14 +92,19 @@ LAMMPS can use them if they are available on your system.
|
||||
an exception to the rule that all CMake variables can be specified
|
||||
with lower-case values.
|
||||
|
||||
Usually these settings are all that is needed. If CMake cannot find
|
||||
the FFT library, you can set these variables:
|
||||
Usually these settings are all that is needed. If FFTW3 is selected,
|
||||
then CMake will try to detect, if threaded FFTW libraries are available
|
||||
and enable them by default. This setting is independent of whether
|
||||
OpenMP threads are enabled and a packages like KOKKOS or USER-OMP is
|
||||
used. If CMake cannot detect the FFT library, you can set these variables
|
||||
to assist:
|
||||
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
-D FFTW3_INCLUDE_DIRS=path # path to FFTW3 include files
|
||||
-D FFTW3_LIBRARIES=path # path to FFTW3 libraries
|
||||
-D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries
|
||||
-D MKL_INCLUDE_DIRS=path # ditto for Intel MKL library
|
||||
-D MKL_LIBRARIES=path
|
||||
|
||||
@ -105,6 +116,7 @@ the FFT library, you can set these variables:
|
||||
FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS
|
||||
# default is KISS if not specified
|
||||
FFT_INC = -DFFT_SINGLE # do not specify for double precision
|
||||
FFT_INC = -DFFT_FFTW_THREADS # enable using threaded FFTW3 libraries
|
||||
FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY
|
||||
|
||||
# default is FFT\_PACK\_ARRAY if not specified
|
||||
@ -115,6 +127,7 @@ the FFT library, you can set these variables:
|
||||
FFT_INC = -I/usr/local/include
|
||||
FFT_PATH = -L/usr/local/lib
|
||||
FFT_LIB = -lfftw3 # FFTW3 double precision
|
||||
FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS)
|
||||
FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision
|
||||
FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler
|
||||
FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier
|
||||
@ -126,16 +139,19 @@ FFT\_LIB with the appropriate FFT libraries to include in the link.
|
||||
**CMake and make info**\ :
|
||||
|
||||
The `KISS FFT library <http://kissfft.sf.net>`_ is included in the LAMMPS
|
||||
distribution. It is portable across all platforms. Depending on the
|
||||
size of the FFTs and the number of processors used, the other
|
||||
libraries listed here can be faster.
|
||||
distribution. It is portable across all platforms. Depending on the size
|
||||
of the FFTs and the number of processors used, the other libraries listed
|
||||
here can be faster.
|
||||
|
||||
However, note that long-range Coulombics are only a portion of the
|
||||
per-timestep CPU cost, FFTs are only a portion of long-range
|
||||
Coulombics, and 1d FFTs are only a portion of the FFT cost (parallel
|
||||
communication can be costly). A breakdown of these timings is printed
|
||||
to the screen at the end of a run using the :doc:`kspace_style pppm <kspace_style>` command. The :doc:`Run output <Run_output>`
|
||||
doc page gives more details.
|
||||
to the screen at the end of a run when using the
|
||||
:doc:`kspace_style pppm <kspace_style>` command. The :doc:`Run output <Run_output>`
|
||||
doc page gives more details. A more detailed (and time consuming)
|
||||
report of the FFT performance is generated with the
|
||||
:doc:`kspace_modify fftbench yes <kspace_modify>` command.
|
||||
|
||||
FFTW is a fast, portable FFT library that should also work on any
|
||||
platform and can be faster than the KISS FFT library. You can
|
||||
@ -166,7 +182,7 @@ When using -DFFT\_SINGLE with FFTW3 you may need to build the FFTW
|
||||
library a second time with support for single-precision.
|
||||
|
||||
For FFTW3, do the following, which should produce the additional
|
||||
library libfftw3f.a
|
||||
library libfftw3f.a or libfftw3f.so.
|
||||
|
||||
|
||||
.. parsed-literal::
|
||||
|
||||
@ -404,9 +404,9 @@ calculation can be performed concurrently on the GPU while other
|
||||
calculations for non-bonded and bonded force calculation are performed
|
||||
on the CPU.
|
||||
|
||||
The *pppm/kk* style also performs charge assignment and force
|
||||
interpolation calculations on the GPU while the FFTs themselves are
|
||||
calculated on the CPU in non-threaded mode.
|
||||
The *pppm/kk* style performs charge assignment and force interpolation
|
||||
calculations, along with the FFTs themselves, on the GPU or (optionally) threaded
|
||||
on the CPU when using OpenMP and FFTW3.
|
||||
|
||||
These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
|
||||
USER-OMP, and OPT packages respectively. They are only enabled if
|
||||
|
||||
@ -97,6 +97,9 @@ action dihedral_opls_kokkos.cpp dihedral_opls.cpp
|
||||
action dihedral_opls_kokkos.h dihedral_opls.h
|
||||
action domain_kokkos.cpp
|
||||
action domain_kokkos.h
|
||||
action fftdata_kokkos.h fft3d.h
|
||||
action fft3d_kokkos.cpp fft3d.cpp
|
||||
action fft3d_kokkos.h fft3d.h
|
||||
action fix_deform_kokkos.cpp
|
||||
action fix_deform_kokkos.h
|
||||
action fix_enforce2d_kokkos.cpp
|
||||
@ -153,6 +156,7 @@ action improper_class2_kokkos.cpp improper_class2.cpp
|
||||
action improper_class2_kokkos.h improper_class2.h
|
||||
action improper_harmonic_kokkos.cpp improper_harmonic.cpp
|
||||
action improper_harmonic_kokkos.h improper_harmonic.h
|
||||
action kissfft_kokkos.h kissfft.h
|
||||
action kokkos.cpp
|
||||
action kokkos.h
|
||||
action kokkos_base.h
|
||||
@ -189,6 +193,7 @@ action min_kokkos.cpp
|
||||
action min_kokkos.h
|
||||
action min_linesearch_kokkos.cpp
|
||||
action min_linesearch_kokkos.h
|
||||
action pack_kokkos.h pack.h
|
||||
action pair_buck_coul_cut_kokkos.cpp
|
||||
action pair_buck_coul_cut_kokkos.h
|
||||
action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp
|
||||
@ -285,6 +290,8 @@ action rand_pool_wrap_kokkos.cpp
|
||||
action rand_pool_wrap_kokkos.h
|
||||
action region_block_kokkos.cpp
|
||||
action region_block_kokkos.h
|
||||
action remap_kokkos.cpp remap.cpp
|
||||
action remap_kokkos.h remap.h
|
||||
action sna_kokkos.h sna.h
|
||||
action sna_kokkos_impl.h sna.cpp
|
||||
action verlet_kokkos.cpp
|
||||
|
||||
905
src/KOKKOS/fft3d_kokkos.cpp
Normal file
905
src/KOKKOS/fft3d_kokkos.cpp
Normal file
@ -0,0 +1,905 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Stan Moore (SNL), Sam Mish (U.C. Davis)
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <mpi.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include "fft3d_kokkos.h"
|
||||
#include "remap_kokkos.h"
|
||||
#include "kokkos_type.h"
|
||||
#include "error.h"
|
||||
#include "kokkos.h"
|
||||
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define MIN(A,B) ((A) < (B) ? (A) : (B))
|
||||
#define MAX(A,B) ((A) > (B) ? (A) : (B))
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
|
||||
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
|
||||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int scaled, int permute, int *nbuf, int usecollective) :
|
||||
Pointers(lmp)
|
||||
{
|
||||
int nthreads = lmp->kokkos->nthreads;
|
||||
int ngpus = lmp->kokkos->ngpus;
|
||||
ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
if (ngpus > 0 && execution_space == Device)
|
||||
lmp->error->all(FLERR,"Cannot use the MKL library with Kokkos CUDA on GPUs");
|
||||
#elif defined(FFT_FFTW3)
|
||||
if (ngpus > 0 && execution_space == Device)
|
||||
lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs");
|
||||
#elif defined(FFT_CUFFT)
|
||||
if (ngpus > 0 && execution_space == Host)
|
||||
lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs");
|
||||
#elif defined(FFT_KISSFFT)
|
||||
// The compiler can't statically determine the stack size needed for
|
||||
// recursive function calls in KISS FFT and the default per-thread
|
||||
// stack size on GPUs needs to be increased to prevent stack overflows
|
||||
// for reasonably sized FFTs
|
||||
#if defined (KOKKOS_ENABLE_CUDA)
|
||||
size_t stack_size;
|
||||
cudaDeviceGetLimit(&stack_size,cudaLimitStackSize);
|
||||
if (stack_size < 2048)
|
||||
cudaDeviceSetLimit(cudaLimitStackSize,2048);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
|
||||
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
|
||||
scaled,permute,nbuf,usecollective,nthreads);
|
||||
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
FFT3dKokkos<DeviceType>::~FFT3dKokkos()
|
||||
{
|
||||
fft_3d_destroy_plan_kokkos(plan);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag)
|
||||
{
|
||||
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2);
|
||||
typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2);
|
||||
|
||||
fft_3d_kokkos(d_in_data,d_out_data,flag,plan);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag)
|
||||
{
|
||||
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size());
|
||||
|
||||
fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan);
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Data layout for 3d FFTs:
|
||||
|
||||
data set of Nfast x Nmid x Nslow elements is owned by P procs
|
||||
on input, each proc owns a subsection of the elements
|
||||
on output, each proc will own a (possibly different) subsection
|
||||
my subsection must not overlap with any other proc's subsection,
|
||||
i.e. the union of all proc's input (or output) subsections must
|
||||
exactly tile the global Nfast x Nmid x Nslow data set
|
||||
when called from C, all subsection indices are
|
||||
C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
|
||||
when called from F77, all subsection indices are
|
||||
F77-style from 1 to N where N = Nfast or Nmid or Nslow
|
||||
a proc can own 0 elements on input or output
|
||||
by specifying hi index < lo index
|
||||
on both input and output, data is stored contiguously on a processor
|
||||
with a fast-varying, mid-varying, and slow-varying index
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Perform 3d FFT
|
||||
|
||||
Arguments:
|
||||
in starting address of input data on this proc
|
||||
out starting address of where output data for this proc
|
||||
will be placed (can be same as in)
|
||||
flag 1 for forward FFT, -1 for inverse FFT
|
||||
plan plan returned by previous call to fft_3d_create_plan
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct norm_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_DATA_1d_um d_out;
|
||||
int norm;
|
||||
|
||||
norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_):
|
||||
d_out(d_out_),norm(norm_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &i) const {
|
||||
#if defined(FFT_FFTW3) || defined(FFT_CUFFT)
|
||||
FFT_SCALAR* out_ptr = (FFT_SCALAR *)(d_out.data()+i);
|
||||
*(out_ptr++) *= norm;
|
||||
*(out_ptr++) *= norm;
|
||||
#elif defined(FFT_MKL)
|
||||
d_out(i) *= norm;
|
||||
#else
|
||||
d_out(i,0) *= norm;
|
||||
d_out(i,1) *= norm;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef FFT_KISSFFT
|
||||
template<class DeviceType>
|
||||
struct kiss_fft_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_DATA_1d_um d_data,d_tmp;
|
||||
kiss_fft_state_kokkos<DeviceType> st;
|
||||
int length;
|
||||
|
||||
kiss_fft_functor() {}
|
||||
|
||||
kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos<DeviceType> &st_, int length_):
|
||||
d_data(d_data_),
|
||||
d_tmp(d_tmp_),
|
||||
st(st_)
|
||||
{
|
||||
length = length_;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &i) const {
|
||||
const int offset = i*length;
|
||||
KissFFTKokkos<DeviceType>::kiss_fft_kokkos(st,d_data,d_tmp,offset);
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos<DeviceType> *plan)
|
||||
{
|
||||
int total,length;
|
||||
typename AT::t_FFT_DATA_1d d_data,d_copy;
|
||||
typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar;
|
||||
|
||||
// pre-remap to prepare for 1st FFTs if needed
|
||||
// copy = loc for remap result
|
||||
|
||||
if (plan->pre_plan) {
|
||||
if (plan->pre_target == 0) d_copy = d_out;
|
||||
else d_copy = plan->d_copy;
|
||||
|
||||
d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size());
|
||||
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
|
||||
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
|
||||
|
||||
remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar,
|
||||
d_scratch_scalar, plan->pre_plan);
|
||||
|
||||
d_data = d_copy;
|
||||
} else d_data = d_in;
|
||||
|
||||
// 1d FFTs along fast axis
|
||||
|
||||
total = plan->total1;
|
||||
length = plan->length1;
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
if (flag == -1)
|
||||
DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data());
|
||||
else
|
||||
DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_FFTW3)
|
||||
if (flag == -1)
|
||||
FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
else
|
||||
FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
|
||||
#else
|
||||
typename AT::t_FFT_DATA_1d d_tmp =
|
||||
typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
|
||||
kiss_fft_functor<DeviceType> f;
|
||||
if (flag == -1)
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length);
|
||||
else
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length);
|
||||
Kokkos::parallel_for(total/length,f);
|
||||
d_data = d_tmp;
|
||||
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
|
||||
#endif
|
||||
|
||||
|
||||
// 1st mid-remap to prepare for 2nd FFTs
|
||||
// copy = loc for remap result
|
||||
|
||||
if (plan->mid1_target == 0) d_copy = d_out;
|
||||
else d_copy = plan->d_copy;
|
||||
|
||||
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2);
|
||||
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2);
|
||||
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2);
|
||||
|
||||
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
|
||||
d_scratch_scalar, plan->mid1_plan);
|
||||
|
||||
d_data = d_copy;
|
||||
|
||||
// 1d FFTs along mid axis
|
||||
|
||||
total = plan->total2;
|
||||
length = plan->length2;
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
if (flag == -1)
|
||||
DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data());
|
||||
else
|
||||
DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_FFTW3)
|
||||
if (flag == -1)
|
||||
FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
else
|
||||
FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
|
||||
#else
|
||||
if (flag == -1)
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length);
|
||||
else
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length);
|
||||
Kokkos::parallel_for(total/length,f);
|
||||
d_data = d_tmp;
|
||||
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
|
||||
#endif
|
||||
|
||||
// 2nd mid-remap to prepare for 3rd FFTs
|
||||
// copy = loc for remap result
|
||||
|
||||
if (plan->mid2_target == 0) d_copy = d_out;
|
||||
else d_copy = plan->d_copy;
|
||||
|
||||
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
|
||||
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
|
||||
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
|
||||
|
||||
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
|
||||
d_scratch_scalar, plan->mid2_plan);
|
||||
|
||||
d_data = d_copy;
|
||||
|
||||
// 1d FFTs along slow axis
|
||||
|
||||
total = plan->total3;
|
||||
length = plan->length3;
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
if (flag == -1)
|
||||
DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data());
|
||||
else
|
||||
DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_FFTW3)
|
||||
if (flag == -1)
|
||||
FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
else
|
||||
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
|
||||
#else
|
||||
if (flag == -1)
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length);
|
||||
else
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length);
|
||||
Kokkos::parallel_for(total/length,f);
|
||||
d_data = d_tmp;
|
||||
#endif
|
||||
|
||||
|
||||
// post-remap to put data in output format if needed
|
||||
// destination is always out
|
||||
|
||||
if (plan->post_plan) {
|
||||
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
|
||||
d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size());
|
||||
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
|
||||
|
||||
remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar,
|
||||
d_scratch_scalar, plan->post_plan);
|
||||
}
|
||||
|
||||
// scaling if required
|
||||
|
||||
if (flag == 1 && plan->scaled) {
|
||||
FFT_SCALAR norm = plan->norm;
|
||||
int num = plan->normnum;
|
||||
|
||||
norm_functor<DeviceType> f(d_out,norm);
|
||||
Kokkos::parallel_for(num,f);
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Create plan for performing a 3d FFT
|
||||
|
||||
Arguments:
|
||||
comm MPI communicator for the P procs which own the d_data
|
||||
nfast,nmid,nslow size of global 3d matrix
|
||||
in_ilo,in_ihi input bounds of data I own in fast index
|
||||
in_jlo,in_jhi input bounds of data I own in mid index
|
||||
in_klo,in_khi input bounds of data I own in slow index
|
||||
out_ilo,out_ihi output bounds of data I own in fast index
|
||||
out_jlo,out_jhi output bounds of data I own in mid index
|
||||
out_klo,out_khi output bounds of data I own in slow index
|
||||
scaled 0 = no scaling of result, 1 = scaling
|
||||
permute permutation in storage order of indices on output
|
||||
0 = no permutation
|
||||
1 = permute once = mid->fast, slow->mid, fast->slow
|
||||
2 = permute twice = slow->fast, fast->mid, mid->slow
|
||||
nbuf returns size of internal storage buffers used by FFT
|
||||
usecollective use collective MPI operations for remapping data
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_plan_kokkos (
|
||||
MPI_Comm comm, int nfast, int nmid, int nslow,
|
||||
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
|
||||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int scaled, int permute, int *nbuf, int usecollective,
|
||||
int nthreads)
|
||||
{
|
||||
struct fft_plan_3d_kokkos<DeviceType> *plan;
|
||||
int me,nprocs;
|
||||
int flag,remapflag;
|
||||
int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
|
||||
int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
|
||||
int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
|
||||
int out_size,first_size,second_size,third_size,copy_size,scratch_size;
|
||||
int np1,np2,ip1,ip2;
|
||||
|
||||
// query MPI info
|
||||
|
||||
MPI_Comm_rank(comm,&me);
|
||||
MPI_Comm_size(comm,&nprocs);
|
||||
|
||||
// compute division of procs in 2 dimensions not on-processor
|
||||
|
||||
bifactor(nprocs,&np1,&np2);
|
||||
ip1 = me % np1;
|
||||
ip2 = me/np1;
|
||||
|
||||
// allocate memory for plan data struct
|
||||
|
||||
plan = new struct fft_plan_3d_kokkos<DeviceType>;
|
||||
remapKK = new RemapKokkos<DeviceType>(lmp);
|
||||
if (plan == NULL) return NULL;
|
||||
|
||||
// remap from initial distribution to layout needed for 1st set of 1d FFTs
|
||||
// not needed if all procs own entire fast axis initially
|
||||
// first indices = distribution after 1st set of FFTs
|
||||
|
||||
|
||||
if (in_ilo == 0 && in_ihi == nfast-1) flag = 0;
|
||||
else flag = 1;
|
||||
|
||||
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
|
||||
|
||||
if (remapflag == 0) {
|
||||
first_ilo = in_ilo;
|
||||
first_ihi = in_ihi;
|
||||
first_jlo = in_jlo;
|
||||
first_jhi = in_jhi;
|
||||
first_klo = in_klo;
|
||||
first_khi = in_khi;
|
||||
plan->pre_plan = NULL;
|
||||
}
|
||||
else {
|
||||
first_ilo = 0;
|
||||
first_ihi = nfast - 1;
|
||||
first_jlo = ip1*nmid/np1;
|
||||
first_jhi = (ip1+1)*nmid/np1 - 1;
|
||||
first_klo = ip2*nslow/np2;
|
||||
first_khi = (ip2+1)*nslow/np2 - 1;
|
||||
plan->pre_plan =
|
||||
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
first_ilo,first_ihi,first_jlo,first_jhi,
|
||||
first_klo,first_khi,2,0,0,FFT_PRECISION,0);
|
||||
if (plan->pre_plan == NULL) return NULL;
|
||||
}
|
||||
|
||||
// 1d FFTs along fast axis
|
||||
|
||||
plan->length1 = nfast;
|
||||
plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);
|
||||
|
||||
// remap from 1st to 2nd FFT
|
||||
// choose which axis is split over np1 vs np2 to minimize communication
|
||||
// second indices = distribution after 2nd set of FFTs
|
||||
|
||||
second_ilo = ip1*nfast/np1;
|
||||
second_ihi = (ip1+1)*nfast/np1 - 1;
|
||||
second_jlo = 0;
|
||||
second_jhi = nmid - 1;
|
||||
second_klo = ip2*nslow/np2;
|
||||
second_khi = (ip2+1)*nslow/np2 - 1;
|
||||
plan->mid1_plan =
|
||||
remapKK->remap_3d_create_plan_kokkos(comm,
|
||||
first_ilo,first_ihi,first_jlo,first_jhi,
|
||||
first_klo,first_khi,
|
||||
second_ilo,second_ihi,second_jlo,second_jhi,
|
||||
second_klo,second_khi,2,1,0,FFT_PRECISION,
|
||||
usecollective);
|
||||
if (plan->mid1_plan == NULL) return NULL;
|
||||
|
||||
// 1d FFTs along mid axis
|
||||
|
||||
plan->length2 = nmid;
|
||||
plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);
|
||||
|
||||
// remap from 2nd to 3rd FFT
|
||||
// if final distribution is permute=2 with all procs owning entire slow axis
|
||||
// then this remapping goes directly to final distribution
|
||||
// third indices = distribution after 3rd set of FFTs
|
||||
|
||||
if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0;
|
||||
else flag = 1;
|
||||
|
||||
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
|
||||
|
||||
if (remapflag == 0) {
|
||||
third_ilo = out_ilo;
|
||||
third_ihi = out_ihi;
|
||||
third_jlo = out_jlo;
|
||||
third_jhi = out_jhi;
|
||||
third_klo = out_klo;
|
||||
third_khi = out_khi;
|
||||
}
|
||||
else {
|
||||
third_ilo = ip1*nfast/np1;
|
||||
third_ihi = (ip1+1)*nfast/np1 - 1;
|
||||
third_jlo = ip2*nmid/np2;
|
||||
third_jhi = (ip2+1)*nmid/np2 - 1;
|
||||
third_klo = 0;
|
||||
third_khi = nslow - 1;
|
||||
}
|
||||
|
||||
plan->mid2_plan =
|
||||
remapKK->remap_3d_create_plan_kokkos(comm,
|
||||
second_jlo,second_jhi,second_klo,second_khi,
|
||||
second_ilo,second_ihi,
|
||||
third_jlo,third_jhi,third_klo,third_khi,
|
||||
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
|
||||
if (plan->mid2_plan == NULL) return NULL;
|
||||
|
||||
// 1d FFTs along slow axis
|
||||
|
||||
plan->length3 = nslow;
|
||||
plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;
|
||||
|
||||
// remap from 3rd FFT to final distribution
|
||||
// not needed if permute = 2 and third indices = out indices on all procs
|
||||
|
||||
if (permute == 2 &&
|
||||
out_ilo == third_ilo && out_ihi == third_ihi &&
|
||||
out_jlo == third_jlo && out_jhi == third_jhi &&
|
||||
out_klo == third_klo && out_khi == third_khi) flag = 0;
|
||||
else flag = 1;
|
||||
|
||||
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
|
||||
|
||||
if (remapflag == 0)
|
||||
plan->post_plan = NULL;
|
||||
else {
|
||||
plan->post_plan =
|
||||
remapKK->remap_3d_create_plan_kokkos(comm,
|
||||
third_klo,third_khi,third_ilo,third_ihi,
|
||||
third_jlo,third_jhi,
|
||||
out_klo,out_khi,out_ilo,out_ihi,
|
||||
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
|
||||
if (plan->post_plan == NULL) return NULL;
|
||||
}
|
||||
|
||||
// configure plan memory pointers and allocate work space
|
||||
// out_size = amount of memory given to FFT by user
|
||||
// first/second/third_size = amount of memory needed after
|
||||
// pre,mid1,mid2 remaps
|
||||
// copy_size = amount needed internally for extra copy of data
|
||||
// scratch_size = amount needed internally for remap scratch space
|
||||
// for each remap:
|
||||
// out space used for result if big enough, else require copy buffer
|
||||
// accumulate largest required remap scratch space
|
||||
|
||||
out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
|
||||
first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
|
||||
(first_khi-first_klo+1);
|
||||
second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
|
||||
(second_khi-second_klo+1);
|
||||
third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
|
||||
(third_khi-third_klo+1);
|
||||
|
||||
copy_size = 0;
|
||||
scratch_size = 0;
|
||||
|
||||
if (plan->pre_plan) {
|
||||
if (first_size <= out_size)
|
||||
plan->pre_target = 0;
|
||||
else {
|
||||
plan->pre_target = 1;
|
||||
copy_size = MAX(copy_size,first_size);
|
||||
}
|
||||
scratch_size = MAX(scratch_size,first_size);
|
||||
}
|
||||
|
||||
if (plan->mid1_plan) {
|
||||
if (second_size <= out_size)
|
||||
plan->mid1_target = 0;
|
||||
else {
|
||||
plan->mid1_target = 1;
|
||||
copy_size = MAX(copy_size,second_size);
|
||||
}
|
||||
scratch_size = MAX(scratch_size,second_size);
|
||||
}
|
||||
|
||||
if (plan->mid2_plan) {
|
||||
if (third_size <= out_size)
|
||||
plan->mid2_target = 0;
|
||||
else {
|
||||
plan->mid2_target = 1;
|
||||
copy_size = MAX(copy_size,third_size);
|
||||
}
|
||||
scratch_size = MAX(scratch_size,third_size);
|
||||
}
|
||||
|
||||
if (plan->post_plan)
|
||||
scratch_size = MAX(scratch_size,out_size);
|
||||
|
||||
*nbuf = copy_size + scratch_size;
|
||||
|
||||
if (copy_size) {
|
||||
plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size);
|
||||
}
|
||||
|
||||
if (scratch_size) {
|
||||
plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size);
|
||||
}
|
||||
|
||||
// system specific pre-computation of 1d FFT coeffs
|
||||
// and scaling normalization
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1,
|
||||
(MKL_LONG)nfast);
|
||||
DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS,
|
||||
(MKL_LONG)plan->total1/nfast);
|
||||
DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
|
||||
DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
|
||||
DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
|
||||
//DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
|
||||
DftiCommitDescriptor(plan->handle_fast);
|
||||
|
||||
DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
|
||||
(MKL_LONG)nmid);
|
||||
DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS,
|
||||
(MKL_LONG)plan->total2/nmid);
|
||||
DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
|
||||
DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
|
||||
DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
|
||||
//DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
|
||||
DftiCommitDescriptor(plan->handle_mid);
|
||||
|
||||
DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
|
||||
(MKL_LONG)nslow);
|
||||
DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS,
|
||||
(MKL_LONG)plan->total3/nslow);
|
||||
DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
|
||||
DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
|
||||
DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
|
||||
//DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
|
||||
DftiCommitDescriptor(plan->handle_slow);
|
||||
|
||||
if (scaled == 0)
|
||||
plan->scaled = 0;
|
||||
else {
|
||||
plan->scaled = 1;
|
||||
plan->norm = 1.0/(nfast*nmid*nslow);
|
||||
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
|
||||
(out_khi-out_klo+1);
|
||||
}
|
||||
|
||||
#elif defined(FFT_FFTW3)
|
||||
|
||||
#if defined (FFT_FFTW_THREADS)
|
||||
if (nthreads > 1) {
|
||||
FFTW_API(init_threads)();
|
||||
FFTW_API(plan_with_nthreads)(nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
plan->plan_fast_forward =
|
||||
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
|
||||
NULL,&nfast,1,plan->length1,
|
||||
NULL,&nfast,1,plan->length1,
|
||||
FFTW_FORWARD,FFTW_ESTIMATE);
|
||||
|
||||
plan->plan_fast_backward =
|
||||
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
|
||||
NULL,&nfast,1,plan->length1,
|
||||
NULL,&nfast,1,plan->length1,
|
||||
FFTW_BACKWARD,FFTW_ESTIMATE);
|
||||
|
||||
plan->plan_mid_forward =
|
||||
FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
|
||||
NULL,&nmid,1,plan->length2,
|
||||
NULL,&nmid,1,plan->length2,
|
||||
FFTW_FORWARD,FFTW_ESTIMATE);
|
||||
|
||||
plan->plan_mid_backward =
|
||||
FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
|
||||
NULL,&nmid,1,plan->length2,
|
||||
NULL,&nmid,1,plan->length2,
|
||||
FFTW_BACKWARD,FFTW_ESTIMATE);
|
||||
|
||||
|
||||
plan->plan_slow_forward =
|
||||
FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
|
||||
NULL,&nslow,1,plan->length3,
|
||||
NULL,&nslow,1,plan->length3,
|
||||
FFTW_FORWARD,FFTW_ESTIMATE);
|
||||
|
||||
plan->plan_slow_backward =
|
||||
FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
|
||||
NULL,&nslow,1,plan->length3,
|
||||
NULL,&nslow,1,plan->length3,
|
||||
FFTW_BACKWARD,FFTW_ESTIMATE);
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftPlanMany(&(plan->plan_fast), 1, &nfast,
|
||||
&nfast,1,plan->length1,
|
||||
&nfast,1,plan->length1,
|
||||
CUFFT_TYPE,plan->total1/plan->length1);
|
||||
|
||||
cufftPlanMany(&(plan->plan_mid), 1, &nmid,
|
||||
&nmid,1,plan->length2,
|
||||
&nmid,1,plan->length2,
|
||||
CUFFT_TYPE,plan->total2/plan->length2);
|
||||
|
||||
cufftPlanMany(&(plan->plan_slow), 1, &nslow,
|
||||
&nslow,1,plan->length3,
|
||||
&nslow,1,plan->length3,
|
||||
CUFFT_TYPE,plan->total3/plan->length3);
|
||||
#else
|
||||
kissfftKK = new KissFFTKokkos<DeviceType>();
|
||||
|
||||
plan->cfg_fast_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL);
|
||||
plan->cfg_fast_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL);
|
||||
|
||||
if (nmid == nfast) {
|
||||
plan->cfg_mid_forward = plan->cfg_fast_forward;
|
||||
plan->cfg_mid_backward = plan->cfg_fast_backward;
|
||||
}
|
||||
else {
|
||||
plan->cfg_mid_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,0,NULL,NULL);
|
||||
plan->cfg_mid_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,1,NULL,NULL);
|
||||
}
|
||||
|
||||
if (nslow == nfast) {
|
||||
plan->cfg_slow_forward = plan->cfg_fast_forward;
|
||||
plan->cfg_slow_backward = plan->cfg_fast_backward;
|
||||
}
|
||||
else if (nslow == nmid) {
|
||||
plan->cfg_slow_forward = plan->cfg_mid_forward;
|
||||
plan->cfg_slow_backward = plan->cfg_mid_backward;
|
||||
}
|
||||
else {
|
||||
plan->cfg_slow_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL);
|
||||
plan->cfg_slow_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (scaled == 0)
|
||||
plan->scaled = 0;
|
||||
else {
|
||||
plan->scaled = 1;
|
||||
plan->norm = 1.0/(nfast*nmid*nslow);
|
||||
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
|
||||
(out_khi-out_klo+1);
|
||||
}
|
||||
|
||||
return plan;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Destroy a 3d fft plan
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *plan)
|
||||
{
|
||||
if (plan->pre_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->pre_plan);
|
||||
if (plan->mid1_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid1_plan);
|
||||
if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan);
|
||||
if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan);
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
DftiFreeDescriptor(&(plan->handle_fast));
|
||||
DftiFreeDescriptor(&(plan->handle_mid));
|
||||
DftiFreeDescriptor(&(plan->handle_slow));
|
||||
#elif defined(FFT_FFTW3)
|
||||
FFTW_API(destroy_plan)(plan->plan_slow_forward);
|
||||
FFTW_API(destroy_plan)(plan->plan_slow_backward);
|
||||
FFTW_API(destroy_plan)(plan->plan_mid_forward);
|
||||
FFTW_API(destroy_plan)(plan->plan_mid_backward);
|
||||
FFTW_API(destroy_plan)(plan->plan_fast_forward);
|
||||
FFTW_API(destroy_plan)(plan->plan_fast_backward);
|
||||
|
||||
#if defined (FFT_FFTW_THREADS)
|
||||
FFTW_API(cleanup_threads)();
|
||||
#endif
|
||||
|
||||
#elif defined (FFT_KISSFFT)
|
||||
delete kissfftKK;
|
||||
#endif
|
||||
|
||||
delete plan;
|
||||
delete remapKK;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
divide n into 2 factors of as equal size as possible
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::bifactor(int n, int *factor1, int *factor2)
|
||||
{
|
||||
int n1,n2,facmax;
|
||||
|
||||
facmax = static_cast<int> (sqrt((double) n));
|
||||
|
||||
for (n1 = facmax; n1 > 0; n1--) {
|
||||
n2 = n/n1;
|
||||
if (n1*n2 == n) {
|
||||
*factor1 = n1;
|
||||
*factor2 = n2;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
perform just the 1d FFTs needed by a 3d FFT, no data movement
|
||||
used for timing purposes
|
||||
|
||||
Arguments:
|
||||
in starting address of input data on this proc, all set to 0.0
|
||||
nsize size of in
|
||||
flag 1 for forward FFT, -1 for inverse FFT
|
||||
plan plan returned by previous call to fft_3d_create_plan
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag,
|
||||
struct fft_plan_3d_kokkos<DeviceType> *plan)
|
||||
{
|
||||
// total = size of data needed in each dim
|
||||
// length = length of 1d FFT in each dim
|
||||
// total/length = # of 1d FFTs in each dim
|
||||
// if total > nsize, limit # of 1d FFTs to available size of data
|
||||
|
||||
int total1 = plan->total1;
|
||||
int length1 = plan->length1;
|
||||
int total2 = plan->total2;
|
||||
int length2 = plan->length2;
|
||||
int total3 = plan->total3;
|
||||
int length3 = plan->length3;
|
||||
|
||||
// fftw3 and Dfti in MKL encode the number of transforms
|
||||
// into the plan, so we cannot operate on a smaller data set
|
||||
#if defined(FFT_MKL) || defined(FFT_FFTW3)
|
||||
if ((total1 > nsize) || (total2 > nsize) || (total3 > nsize))
|
||||
return;
|
||||
#endif
|
||||
if (total1 > nsize) total1 = (nsize/length1) * length1;
|
||||
if (total2 > nsize) total2 = (nsize/length2) * length2;
|
||||
if (total3 > nsize) total3 = (nsize/length3) * length3;
|
||||
|
||||
// perform 1d FFTs in each of 3 dimensions
|
||||
// data is just an array of 0.0
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
if (flag == -1) {
|
||||
DftiComputeForward(plan->handle_fast,data);
|
||||
DftiComputeForward(plan->handle_mid,data);
|
||||
DftiComputeForward(plan->handle_slow,data);
|
||||
} else {
|
||||
DftiComputeBackward(plan->handle_fast,data);
|
||||
DftiComputeBackward(plan->handle_mid,data);
|
||||
DftiComputeBackward(plan->handle_slow,data);
|
||||
}
|
||||
#elif defined(FFT_FFTW3)
|
||||
if (flag == -1) {
|
||||
FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
} else {
|
||||
FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
|
||||
}
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
|
||||
cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
|
||||
cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
|
||||
#else
|
||||
kiss_fft_functor<DeviceType> f;
|
||||
typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0());
|
||||
if (flag == -1) {
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length1);
|
||||
Kokkos::parallel_for(total1/length1,f);
|
||||
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length2);
|
||||
Kokkos::parallel_for(total2/length2,f);
|
||||
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length3);
|
||||
Kokkos::parallel_for(total3/length3,f);
|
||||
} else {
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length1);
|
||||
Kokkos::parallel_for(total1/length1,f);
|
||||
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length2);
|
||||
Kokkos::parallel_for(total2/length2,f);
|
||||
|
||||
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length3);
|
||||
Kokkos::parallel_for(total3/length3,f);
|
||||
}
|
||||
#endif
|
||||
|
||||
// scaling if required
|
||||
// limit num to size of data
|
||||
|
||||
if (flag == 1 && plan->scaled) {
|
||||
FFT_SCALAR norm = plan->norm;
|
||||
int num = MIN(plan->normnum,nsize);
|
||||
|
||||
norm_functor<DeviceType> f(d_data,norm);
|
||||
Kokkos::parallel_for(num,f);
|
||||
}
|
||||
}
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
template class FFT3dKokkos<LMPDeviceType>;
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
template class FFT3dKokkos<LMPHostType>;
|
||||
#endif
|
||||
}
|
||||
198
src/KOKKOS/fft3d_kokkos.h
Normal file
198
src/KOKKOS/fft3d_kokkos.h
Normal file
@ -0,0 +1,198 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LMP_FFT3D_KOKKOS_H
|
||||
#define LMP_FFT3D_KOKKOS_H
|
||||
|
||||
#include "pointers.h"
|
||||
#include "kokkos_type.h"
|
||||
#include "remap_kokkos.h"
|
||||
#include "fftdata_kokkos.h"
|
||||
|
||||
// with KOKKOS in CUDA mode we can only have
|
||||
// CUFFT or KISSFFT, thus undefine all other
|
||||
// FFTs here, since they may be valid in fft3d.cpp
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
# if defined(FFT_FFTW)
|
||||
# undef FFT_FFTW
|
||||
# endif
|
||||
# if defined(FFT_FFTW3)
|
||||
# undef FFT_FFTW3
|
||||
# endif
|
||||
# if defined(FFT_MKL)
|
||||
# undef FFT_MKL
|
||||
# endif
|
||||
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
|
||||
# define FFT_KISSFFT
|
||||
# endif
|
||||
#else
|
||||
# if defined(FFT_CUFFT)
|
||||
# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT"
|
||||
# endif
|
||||
// if user set FFTW, it means FFTW3
|
||||
# ifdef FFT_FFTW
|
||||
# define FFT_FFTW3
|
||||
# endif
|
||||
# ifdef FFT_FFTW_THREADS
|
||||
# if !defined(FFT_FFTW3)
|
||||
# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS"
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#if defined(FFT_MKL)
|
||||
#include "mkl_dfti.h"
|
||||
#if defined(FFT_SINGLE)
|
||||
typedef float _Complex FFT_DATA;
|
||||
#define FFT_MKL_PREC DFTI_SINGLE
|
||||
#else
|
||||
typedef double _Complex FFT_DATA;
|
||||
#define FFT_MKL_PREC DFTI_DOUBLE
|
||||
#endif
|
||||
#elif defined(FFT_FFTW3)
|
||||
#include "fftw3.h"
|
||||
#if defined(FFT_SINGLE)
|
||||
typedef fftwf_complex FFT_DATA;
|
||||
#define FFTW_API(function) fftwf_ ## function
|
||||
#else
|
||||
typedef fftw_complex FFT_DATA;
|
||||
#define FFTW_API(function) fftw_ ## function
|
||||
#endif
|
||||
#elif defined(FFT_CUFFT)
|
||||
#include "cufft.h"
|
||||
#if defined(FFT_SINGLE)
|
||||
#define cufftExec cufftExecC2C
|
||||
#define CUFFT_TYPE CUFFT_C2C
|
||||
typedef cufftComplex FFT_DATA;
|
||||
#else
|
||||
#define cufftExec cufftExecZ2Z
|
||||
#define CUFFT_TYPE CUFFT_Z2Z
|
||||
typedef cufftDoubleComplex FFT_DATA;
|
||||
#endif
|
||||
#else
|
||||
#include "kissfft_kokkos.h"
|
||||
#ifndef FFT_KISSFFT
|
||||
#define FFT_KISSFFT
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// plan for how to perform a 3d FFT
|
||||
|
||||
template<class DeviceType>
|
||||
struct fft_plan_3d_kokkos {
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
struct remap_plan_3d_kokkos<DeviceType> *pre_plan; // remap from input -> 1st FFTs
|
||||
struct remap_plan_3d_kokkos<DeviceType> *mid1_plan; // remap from 1st -> 2nd FFTs
|
||||
struct remap_plan_3d_kokkos<DeviceType> *mid2_plan; // remap from 2nd -> 3rd FFTs
|
||||
struct remap_plan_3d_kokkos<DeviceType> *post_plan; // remap from 3rd FFTs -> output
|
||||
typename AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed)
|
||||
typename AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps
|
||||
int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length)
|
||||
int length1,length2,length3; // length of 1st,2nd,3rd FFTs
|
||||
int pre_target; // where to put remap results
|
||||
int mid1_target,mid2_target;
|
||||
int scaled; // whether to scale FFT results
|
||||
int normnum; // # of values to rescale
|
||||
double norm; // normalization factor for rescaling
|
||||
|
||||
// system specific 1d FFT info
|
||||
#if defined(FFT_MKL)
|
||||
DFTI_DESCRIPTOR *handle_fast;
|
||||
DFTI_DESCRIPTOR *handle_mid;
|
||||
DFTI_DESCRIPTOR *handle_slow;
|
||||
#elif defined(FFT_FFTW3)
|
||||
FFTW_API(plan) plan_fast_forward;
|
||||
FFTW_API(plan) plan_fast_backward;
|
||||
FFTW_API(plan) plan_mid_forward;
|
||||
FFTW_API(plan) plan_mid_backward;
|
||||
FFTW_API(plan) plan_slow_forward;
|
||||
FFTW_API(plan) plan_slow_backward;
|
||||
#elif defined(FFT_CUFFT)
|
||||
cufftHandle plan_fast;
|
||||
cufftHandle plan_mid;
|
||||
cufftHandle plan_slow;
|
||||
#else
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_fast_forward;
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_fast_backward;
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_mid_forward;
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_mid_backward;
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_slow_forward;
|
||||
kiss_fft_state_kokkos<DeviceType> cfg_slow_backward;
|
||||
#endif
|
||||
};
|
||||
|
||||
template<class DeviceType>
|
||||
class FFT3dKokkos : protected Pointers {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
FFT3dKokkos(class LAMMPS *, MPI_Comm,
|
||||
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
|
||||
int,int,int *,int);
|
||||
~FFT3dKokkos();
|
||||
void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int);
|
||||
void timing1d(typename AT::t_FFT_SCALAR_1d, int, int);
|
||||
|
||||
private:
|
||||
struct fft_plan_3d_kokkos<DeviceType> *plan;
|
||||
RemapKokkos<DeviceType> *remapKK;
|
||||
|
||||
#ifdef FFT_KISSFFT
|
||||
KissFFTKokkos<DeviceType> *kissfftKK;
|
||||
#endif
|
||||
|
||||
void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos<DeviceType> *);
|
||||
|
||||
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
|
||||
int, int, int, int, int,
|
||||
int, int, int, int, int, int, int,
|
||||
int, int, int *, int, int);
|
||||
|
||||
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
|
||||
|
||||
void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos<DeviceType> *);
|
||||
|
||||
void bifactor(int, int *, int *);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: Could not create 3d FFT plan
|
||||
|
||||
The FFT setup for the PPPM solver failed, typically due
|
||||
to lack of memory. This is an unusual error. Check the
|
||||
size of the FFT grid you are requesting.
|
||||
|
||||
E: Cannot use the FFTW library with Kokkos CUDA on GPUs
|
||||
|
||||
Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for
|
||||
PPPM on GPUs.
|
||||
|
||||
E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs
|
||||
|
||||
Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs
|
||||
for PPPM on the host CPUs, use KISS FFT instead.
|
||||
|
||||
*/
|
||||
46
src/KOKKOS/fftdata_kokkos.h
Normal file
46
src/KOKKOS/fftdata_kokkos.h
Normal file
@ -0,0 +1,46 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#define MAX(A,B) ((A) > (B) ? (A) : (B))
|
||||
|
||||
// data types for 2d/3d FFTs
|
||||
|
||||
#ifndef FFT_DATA_KOKKOS_H
|
||||
#define FFT_DATA_KOKKOS_H
|
||||
|
||||
// User-settable FFT precision
|
||||
|
||||
// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
|
||||
// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
|
||||
|
||||
#ifdef FFT_SINGLE
|
||||
#define FFT_PRECISION 1
|
||||
#define MPI_FFT_SCALAR MPI_FLOAT
|
||||
typedef float FFT_SCALAR;
|
||||
#else
|
||||
#define FFT_PRECISION 2
|
||||
#define MPI_FFT_SCALAR MPI_DOUBLE
|
||||
typedef double FFT_SCALAR;
|
||||
#endif
|
||||
|
||||
// -------------------------------------------------------------------------
|
||||
|
||||
// Data types for single-precision complex
|
||||
|
||||
#if FFT_PRECISION == 1
|
||||
#elif FFT_PRECISION == 2
|
||||
#else
|
||||
#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
20
src/KOKKOS/kissfft_kokkos.cpp
Normal file
20
src/KOKKOS/kissfft_kokkos.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
template class KissFFTKokkos<LMPDeviceType>;
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
template class KissFFTKokkos<LMPHostType>;
|
||||
#endif
|
||||
}
|
||||
550
src/KOKKOS/kissfft_kokkos.h
Normal file
550
src/KOKKOS/kissfft_kokkos.h
Normal file
@ -0,0 +1,550 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
we use a stripped down KISS FFT as default FFT for LAMMPS
|
||||
this code is adapted from kiss_fft_v1_2_9
|
||||
homepage: http://kissfft.sf.net/
|
||||
|
||||
changes 2008-2011 by Axel Kohlmeyer <akohlmey@gmail.com>
|
||||
|
||||
KISS FFT ported to Kokkos by Stan Moore (SNL)
|
||||
*/
|
||||
|
||||
/*
|
||||
Copyright (c) 2003-2010, Mark Borgerding
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
|
||||
* Neither the author nor the names of any contributors may be used
|
||||
to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef LMP_KISSFFT_KOKKOS_H
|
||||
#define LMP_KISSFFT_KOKKOS_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "kissfft_kokkos.h"
|
||||
#include "fftdata_kokkos.h"
|
||||
#include "kokkos_type.h"
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884197169399375105820974944
|
||||
#endif
|
||||
|
||||
/*
|
||||
Explanation of macros dealing with complex math:
|
||||
|
||||
C_MUL(m,a,b) : m = a*b
|
||||
C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
|
||||
C_SUB( res, a,b) : res = a - b
|
||||
C_SUBFROM( res , a) : res -= a
|
||||
C_ADDTO( res , a) : res += a
|
||||
C_EQ( res , a) : res = a
|
||||
*/
|
||||
|
||||
#define S_MUL(a,b) ( (a)*(b) )
|
||||
|
||||
#define C_MUL(m,a,a_index,b,b_index) \
|
||||
do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\
|
||||
(m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0)
|
||||
|
||||
/*
|
||||
#define C_FIXDIV(c,div) // NOOP
|
||||
|
||||
#define C_MULBYSCALAR( c, s ) \
|
||||
do{ (c)[0] *= (s);\
|
||||
(c)[1] *= (s); }while(0)
|
||||
|
||||
#define C_ADD( res, a,b)\
|
||||
do { \
|
||||
(res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \
|
||||
}while(0)
|
||||
|
||||
#define C_SUB( res, a,b)\
|
||||
do { \
|
||||
(res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \
|
||||
}while(0)
|
||||
|
||||
#define C_ADDTO( res , a)\
|
||||
do { \
|
||||
(res)[0] += (a)[0]; (res)[1] += (a)[1];\
|
||||
}while(0)
|
||||
|
||||
#define C_SUBFROM( res , a)\
|
||||
do {\
|
||||
(res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \
|
||||
}while(0)
|
||||
|
||||
#define C_EQ(res, a)\
|
||||
do {\
|
||||
(res)[0] = (a)[0]; (res)[1] = (a)[1]; \
|
||||
}while(0)
|
||||
*/
|
||||
|
||||
#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase)
|
||||
#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase)
|
||||
#define HALF_OF(x) ((x)*.5)
|
||||
|
||||
#define kf_cexp(x,x_index,phase) \
|
||||
do{ \
|
||||
(x)(x_index,0) = KISS_FFT_COS(phase);\
|
||||
(x)(x_index,1) = KISS_FFT_SIN(phase);\
|
||||
}while(0)
|
||||
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
#define MAXFACTORS 32
|
||||
/* e.g. an fft of length 128 has 4 factors
|
||||
as far as kissfft is concerned: 4*4*4*2 */
|
||||
template<class DeviceType>
|
||||
struct kiss_fft_state_kokkos {
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
int nfft;
|
||||
int inverse;
|
||||
typename AT::t_int_64 d_factors;
|
||||
typename AT::t_FFT_DATA_1d d_twiddles;
|
||||
typename AT::t_FFT_DATA_1d d_scratch;
|
||||
};
|
||||
|
||||
template<class DeviceType>
|
||||
class KissFFTKokkos {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
|
||||
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
|
||||
{
|
||||
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
|
||||
FFT_SCALAR t[2];
|
||||
int Fout2_count;
|
||||
int tw1_count = 0;
|
||||
|
||||
Fout2_count = Fout_count + m;
|
||||
do {
|
||||
//C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2);
|
||||
|
||||
C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count);
|
||||
tw1_count += fstride;
|
||||
//C_SUB(*Fout2,*Fout,t);
|
||||
d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0];
|
||||
d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1];
|
||||
//C_ADDTO(d_Fout[Fout_count],t);
|
||||
d_Fout(Fout_count,0) += t[0];
|
||||
d_Fout(Fout_count,1) += t[1];
|
||||
++Fout2_count;
|
||||
++Fout_count;
|
||||
} while(--m);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
|
||||
const kiss_fft_state_kokkos<DeviceType> &st, const size_t m, int Fout_count)
|
||||
{
|
||||
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
|
||||
FFT_SCALAR scratch[6][2];
|
||||
size_t k=m;
|
||||
const size_t m2=2*m;
|
||||
const size_t m3=3*m;
|
||||
|
||||
int tw3_count,tw2_count,tw1_count;
|
||||
tw3_count = tw2_count = tw1_count = 0;
|
||||
|
||||
do {
|
||||
//C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4);
|
||||
|
||||
C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count);
|
||||
C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
|
||||
C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count);
|
||||
|
||||
//C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] );
|
||||
scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0];
|
||||
scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1];
|
||||
//C_ADDTO(d_Fout[Fout_count], scratch[1]);
|
||||
d_Fout(Fout_count,0) += scratch[1][0];
|
||||
d_Fout(Fout_count,1) += scratch[1][1];
|
||||
//C_ADD(scratch[3],scratch[0],scratch[2]);
|
||||
scratch[3][0] = scratch[0][0] + scratch[2][0];
|
||||
scratch[3][1] = scratch[0][1] + scratch[2][1];
|
||||
//C_SUB( scratch[4] , scratch[0] , scratch[2] );
|
||||
scratch[4][0] = scratch[0][0] - scratch[2][0];
|
||||
scratch[4][1] = scratch[0][1] - scratch[2][1];
|
||||
//C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]);
|
||||
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0];
|
||||
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1];
|
||||
|
||||
tw1_count += fstride;
|
||||
tw2_count += fstride*2;
|
||||
tw3_count += fstride*3;
|
||||
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
|
||||
d_Fout(Fout_count,0) += scratch[3][0];
|
||||
d_Fout(Fout_count,1) += scratch[3][1];
|
||||
|
||||
if (st.inverse) {
|
||||
d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1];
|
||||
d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0];
|
||||
d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1];
|
||||
d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0];
|
||||
} else{
|
||||
d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1];
|
||||
d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0];
|
||||
d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1];
|
||||
d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0];
|
||||
}
|
||||
++Fout_count;
|
||||
} while(--k);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
|
||||
const kiss_fft_state_kokkos<DeviceType> &st, size_t m, int Fout_count)
|
||||
{
|
||||
size_t k=m;
|
||||
const size_t m2 = 2*m;
|
||||
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
|
||||
FFT_SCALAR scratch[5][2];
|
||||
FFT_SCALAR epi3[2];
|
||||
//C_EQ(epi3,d_twiddles[fstride*m]);
|
||||
epi3[0] = d_twiddles(fstride*m,0);
|
||||
epi3[1] = d_twiddles(fstride*m,1);
|
||||
|
||||
int tw1_count,tw2_count;
|
||||
tw1_count = tw2_count = 0;
|
||||
|
||||
do {
|
||||
//C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3);
|
||||
|
||||
C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count);
|
||||
C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
|
||||
|
||||
//C_ADD(scratch[3],scratch[1],scratch[2]);
|
||||
scratch[3][0] = scratch[1][0] + scratch[2][0];
|
||||
scratch[3][1] = scratch[1][1] + scratch[2][1];
|
||||
//C_SUB(scratch[0],scratch[1],scratch[2]);
|
||||
scratch[0][0] = scratch[1][0] - scratch[2][0];
|
||||
scratch[0][1] = scratch[1][1] - scratch[2][1];
|
||||
tw1_count += fstride;
|
||||
tw2_count += fstride*2;
|
||||
|
||||
d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]);
|
||||
d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]);
|
||||
|
||||
//C_MULBYSCALAR(scratch[0],epi3[1]);
|
||||
scratch[0][0] *= epi3[1];
|
||||
scratch[0][1] *= epi3[1];
|
||||
|
||||
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
|
||||
d_Fout(Fout_count,0) += scratch[3][0];
|
||||
d_Fout(Fout_count,1) += scratch[3][1];
|
||||
|
||||
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1];
|
||||
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0];
|
||||
|
||||
d_Fout(Fout_count + m,0) -= scratch[0][1];
|
||||
d_Fout(Fout_count + m,1) += scratch[0][0];
|
||||
|
||||
++Fout_count;
|
||||
} while(--k);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
|
||||
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
|
||||
{
|
||||
int u;
|
||||
FFT_SCALAR scratch[13][2];
|
||||
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
|
||||
FFT_SCALAR ya[2],yb[2];
|
||||
//C_EQ(ya,d_twiddles[fstride*m]);
|
||||
ya[1] = d_twiddles(fstride*m,1);
|
||||
ya[0] = d_twiddles(fstride*m,0);
|
||||
//C_EQ(yb,d_twiddles[fstride*2*m]);
|
||||
yb[1] = d_twiddles(fstride*2*m,1);
|
||||
yb[0] = d_twiddles(fstride*2*m,0);
|
||||
|
||||
int Fout0_count=Fout_count;
|
||||
int Fout1_count=Fout0_count+m;
|
||||
int Fout2_count=Fout0_count+2*m;
|
||||
int Fout3_count=Fout0_count+3*m;
|
||||
int Fout4_count=Fout0_count+4*m;
|
||||
|
||||
for ( u=0; u<m; ++u ) {
|
||||
//C_FIXDIV( d_Fout[Fout0_count],5); C_FIXDIV( d_Fout[Fout1_count],5); C_FIXDIV( d_Fout[Fout2_count],5);
|
||||
//C_FIXDIV( d_Fout[Fout3_count],5); C_FIXDIV( d_Fout[Fout4_count],5);
|
||||
//C_EQ(scratch[0],d_Fout[Fout0_count]);
|
||||
scratch[0][0] = d_Fout(Fout0_count,0);
|
||||
scratch[0][1] = d_Fout(Fout0_count,1);
|
||||
|
||||
C_MUL(scratch[1],d_Fout,Fout1_count,d_twiddles,u*fstride );
|
||||
C_MUL(scratch[2],d_Fout,Fout2_count,d_twiddles,2*u*fstride);
|
||||
C_MUL(scratch[3],d_Fout,Fout3_count,d_twiddles,3*u*fstride);
|
||||
C_MUL(scratch[4],d_Fout,Fout4_count,d_twiddles,4*u*fstride);
|
||||
|
||||
//C_ADD(scratch[7],scratch[1],scratch[4]);
|
||||
scratch[7][0] = scratch[1][0] + scratch[4][0];
|
||||
scratch[7][1] = scratch[1][1] + scratch[4][1];
|
||||
//C_SUB(scratch[10],scratch[1],scratch[4]);
|
||||
scratch[10][0] = scratch[1][0] - scratch[4][0];
|
||||
scratch[10][1] = scratch[1][1] - scratch[4][1];
|
||||
//C_ADD(scratch[8],scratch[2],scratch[3]);
|
||||
scratch[8][0] = scratch[2][0] + scratch[3][0];
|
||||
scratch[8][1] = scratch[2][1] + scratch[3][1];
|
||||
//C_SUB(scratch[9],scratch[2],scratch[3]);
|
||||
scratch[9][0] = scratch[2][0] - scratch[3][0];
|
||||
scratch[9][1] = scratch[2][1] - scratch[3][1];
|
||||
|
||||
d_Fout(Fout0_count,0) += scratch[7][0] + scratch[8][0];
|
||||
d_Fout(Fout0_count,1) += scratch[7][1] + scratch[8][1];
|
||||
|
||||
scratch[5][0] = scratch[0][0] + S_MUL(scratch[7][0],ya[0]) + S_MUL(scratch[8][0],yb[0]);
|
||||
scratch[5][1] = scratch[0][1] + S_MUL(scratch[7][1],ya[0]) + S_MUL(scratch[8][1],yb[0]);
|
||||
|
||||
scratch[6][0] = S_MUL(scratch[10][1],ya[1]) + S_MUL(scratch[9][1],yb[1]);
|
||||
scratch[6][1] = -S_MUL(scratch[10][0],ya[1]) - S_MUL(scratch[9][0],yb[1]);
|
||||
|
||||
//C_SUB(d_Fout[Fout1_count],scratch[5],scratch[6]);
|
||||
d_Fout(Fout1_count,0) = scratch[5][0] - scratch[6][0];
|
||||
d_Fout(Fout1_count,1) = scratch[5][1] - scratch[6][1];
|
||||
//C_ADD(d_Fout[Fout4_count],scratch[5],scratch[6]);
|
||||
d_Fout(Fout4_count,0) = scratch[5][0] + scratch[6][0];
|
||||
d_Fout(Fout4_count,1) = scratch[5][1] + scratch[6][1];
|
||||
|
||||
scratch[11][0] = scratch[0][0] + S_MUL(scratch[7][0],yb[0]) + S_MUL(scratch[8][0],ya[0]);
|
||||
scratch[11][1] = scratch[0][1] + S_MUL(scratch[7][1],yb[0]) + S_MUL(scratch[8][1],ya[0]);
|
||||
scratch[12][0] = - S_MUL(scratch[10][1],yb[1]) + S_MUL(scratch[9][1],ya[1]);
|
||||
scratch[12][1] = S_MUL(scratch[10][0],yb[1]) - S_MUL(scratch[9][0],ya[1]);
|
||||
|
||||
//C_ADD(d_Fout[Fout2_count],scratch[11],scratch[12]);
|
||||
d_Fout(Fout2_count,0) = scratch[11][0] + scratch[12][0];
|
||||
d_Fout(Fout2_count,1) = scratch[11][1] + scratch[12][1];
|
||||
//C_SUB(d_Fout3[Fout3_count],scratch[11],scratch[12]);
|
||||
d_Fout(Fout3_count,0) = scratch[11][0] - scratch[12][0];
|
||||
d_Fout(Fout3_count,1) = scratch[11][1] - scratch[12][1];
|
||||
|
||||
++Fout0_count;++Fout1_count;++Fout2_count;++Fout3_count;++Fout4_count;
|
||||
}
|
||||
}
|
||||
|
||||
/* perform the butterfly for one stage of a mixed radix FFT */
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
|
||||
const kiss_fft_state_kokkos<DeviceType> &st, int m, int p, int Fout_count)
|
||||
{
|
||||
int u,k,q1,q;
|
||||
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
|
||||
FFT_SCALAR t[2];
|
||||
int Norig = st.nfft;
|
||||
|
||||
typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch;
|
||||
for ( u=0; u<m; ++u ) {
|
||||
k=u;
|
||||
for ( q1=0 ; q1<p ; ++q1 ) {
|
||||
//C_EQ(d_scratch[q1],d_Fout[k]);
|
||||
d_scratch(q1,0) = d_Fout(Fout_count + k,0);
|
||||
d_scratch(q1,1) = d_Fout(Fout_count + k,1);
|
||||
//C_FIXDIV(d_scratch[q1],p);
|
||||
k += m;
|
||||
}
|
||||
|
||||
k=u;
|
||||
for ( q1=0 ; q1<p ; ++q1 ) {
|
||||
int twidx=0;
|
||||
//C_EQ(d_Fout[k],d_scratch[0]);
|
||||
d_Fout(Fout_count + k,0) = d_scratch(0,0);
|
||||
d_Fout(Fout_count + k,1) = d_scratch(0,1);
|
||||
for (q=1;q<p;++q ) {
|
||||
twidx += fstride * k;
|
||||
if (twidx>=Norig) twidx-=Norig;
|
||||
C_MUL(t,d_scratch,q,d_twiddles,twidx);
|
||||
//C_ADDTO(d_Fout[k],t);
|
||||
d_Fout(Fout_count + k,0) += t[0];
|
||||
d_Fout(Fout_count + k,1) += t[1];
|
||||
}
|
||||
k += m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f,
|
||||
const size_t fstride, int in_stride,
|
||||
const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos<DeviceType> &st, int Fout_count, int f_count, int factors_count)
|
||||
{
|
||||
const int beg = Fout_count;
|
||||
const int p = d_factors[factors_count++]; /* the radix */
|
||||
const int m = d_factors[factors_count++]; /* stage's fft length/p */
|
||||
const int end = Fout_count + p*m;
|
||||
|
||||
if (m == 1) {
|
||||
do {
|
||||
//C_EQ(d_Fout[Fout_count],d_f[f_count]);
|
||||
d_Fout(Fout_count,0) = d_f(f_count,0);
|
||||
d_Fout(Fout_count,1) = d_f(f_count,1);
|
||||
f_count += fstride*in_stride;
|
||||
} while (++Fout_count != end);
|
||||
} else {
|
||||
do {
|
||||
/* recursive call:
|
||||
DFT of size m*p performed by doing
|
||||
p instances of smaller DFTs of size m,
|
||||
each one takes a decimated version of the input */
|
||||
kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count);
|
||||
f_count += fstride*in_stride;
|
||||
} while( (Fout_count += m) != end);
|
||||
}
|
||||
|
||||
Fout_count=beg;
|
||||
|
||||
/* recombine the p smaller DFTs */
|
||||
switch (p) {
|
||||
case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break;
|
||||
case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break;
|
||||
case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break;
|
||||
case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break;
|
||||
default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break;
|
||||
}
|
||||
}
|
||||
|
||||
/* facbuf is populated by p1,m1,p2,m2, ...
|
||||
where
|
||||
p[i] * m[i] = m[i-1]
|
||||
m0 = n */
|
||||
|
||||
static int kf_factor(int n, HAT::t_int_64 h_facbuf)
|
||||
{
|
||||
int p=4, nf=0;
|
||||
double floor_sqrt;
|
||||
floor_sqrt = floor( sqrt((double)n) );
|
||||
int facbuf_count = 0;
|
||||
int p_max = 0;
|
||||
|
||||
/* factor out the remaining powers of 4, powers of 2,
|
||||
and then any other remaining primes */
|
||||
do {
|
||||
if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */
|
||||
while (n % p) {
|
||||
switch (p) {
|
||||
case 4: p = 2; break;
|
||||
case 2: p = 3; break;
|
||||
default: p += 2; break;
|
||||
}
|
||||
if (p > floor_sqrt)
|
||||
p = n; /* no more factors, skip to end */
|
||||
}
|
||||
n /= p;
|
||||
h_facbuf[facbuf_count++] = p;
|
||||
h_facbuf[facbuf_count++] = n;
|
||||
p_max = MAX(p,p_max);
|
||||
++nf;
|
||||
} while (n > 1);
|
||||
return p_max;
|
||||
}
|
||||
|
||||
/*
|
||||
* User-callable function to allocate all necessary storage space for the fft.
|
||||
*
|
||||
* The return value is a contiguous block of memory, allocated with malloc. As such,
|
||||
* It can be freed with free(), rather than a kiss_fft-specific function.
|
||||
*/
|
||||
|
||||
static kiss_fft_state_kokkos<DeviceType> kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem)
|
||||
{
|
||||
kiss_fft_state_kokkos<DeviceType> st;
|
||||
int i;
|
||||
st.nfft = nfft;
|
||||
st.inverse = inverse_fft;
|
||||
|
||||
typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64();
|
||||
typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d();
|
||||
|
||||
if (nfft > 0) {
|
||||
k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2);
|
||||
k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft);
|
||||
|
||||
for (i=0;i<nfft;++i) {
|
||||
const double phase = (st.inverse ? 2.0*M_PI:-2.0*M_PI)*i / nfft;
|
||||
kf_cexp(k_twiddles.h_view,i,phase );
|
||||
}
|
||||
|
||||
int p_max = kf_factor(nfft,k_factors.h_view);
|
||||
st.d_scratch = typename AT::t_FFT_DATA_1d("kissfft:scratch",p_max);
|
||||
}
|
||||
|
||||
k_factors.template modify<LMPHostType>();
|
||||
k_factors.template sync<LMPDeviceType>();
|
||||
st.d_factors = k_factors.template view<DeviceType>();
|
||||
|
||||
k_twiddles.template modify<LMPHostType>();
|
||||
k_twiddles.template sync<LMPDeviceType>();
|
||||
st.d_twiddles = k_twiddles.template view<DeviceType>();
|
||||
|
||||
return st;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kiss_fft_stride(const kiss_fft_state_kokkos<DeviceType> &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset)
|
||||
{
|
||||
//if (d_fin.data() == d_fout.data()) {
|
||||
// // NOTE: this is not really an in-place FFT algorithm.
|
||||
// // It just performs an out-of-place FFT into a temp buffer
|
||||
// typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1));
|
||||
// kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
|
||||
// Kokkos::deep_copy(d_fout,d_tmpbuf);
|
||||
//} else {
|
||||
kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
|
||||
//}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void kiss_fft_kokkos(const kiss_fft_state_kokkos<DeviceType> &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset)
|
||||
{
|
||||
kiss_fft_stride(cfg,d_fin,d_fout,1,offset);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
||||
@ -174,20 +174,6 @@ t_scalar3<Scalar> operator *
|
||||
return t_scalar3<Scalar>(a.x*b,a.y*b,a.z*b);
|
||||
}
|
||||
|
||||
#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__)
|
||||
struct double2 {
|
||||
double x, y;
|
||||
};
|
||||
struct float2 {
|
||||
float x, y;
|
||||
};
|
||||
struct float4 {
|
||||
float x, y, z, w;
|
||||
};
|
||||
struct double4 {
|
||||
double x, y, z, w;
|
||||
};
|
||||
#endif
|
||||
// set LMPHostype and LMPDeviceType from Kokkos Default Types
|
||||
typedef Kokkos::DefaultExecutionSpace LMPDeviceType;
|
||||
typedef Kokkos::HostSpace::execution_space LMPHostType;
|
||||
@ -310,14 +296,8 @@ public:
|
||||
#endif
|
||||
#if PRECISION==1
|
||||
typedef float LMP_FLOAT;
|
||||
typedef float2 LMP_FLOAT2;
|
||||
typedef lmp_float3 LMP_FLOAT3;
|
||||
typedef float4 LMP_FLOAT4;
|
||||
#else
|
||||
typedef double LMP_FLOAT;
|
||||
typedef double2 LMP_FLOAT2;
|
||||
typedef lmp_double3 LMP_FLOAT3;
|
||||
typedef double4 LMP_FLOAT4;
|
||||
#endif
|
||||
|
||||
#ifndef PREC_FORCE
|
||||
@ -326,14 +306,8 @@ typedef double4 LMP_FLOAT4;
|
||||
|
||||
#if PREC_FORCE==1
|
||||
typedef float F_FLOAT;
|
||||
typedef float2 F_FLOAT2;
|
||||
typedef lmp_float3 F_FLOAT3;
|
||||
typedef float4 F_FLOAT4;
|
||||
#else
|
||||
typedef double F_FLOAT;
|
||||
typedef double2 F_FLOAT2;
|
||||
typedef lmp_double3 F_FLOAT3;
|
||||
typedef double4 F_FLOAT4;
|
||||
#endif
|
||||
|
||||
#ifndef PREC_ENERGY
|
||||
@ -342,12 +316,8 @@ typedef double4 F_FLOAT4;
|
||||
|
||||
#if PREC_ENERGY==1
|
||||
typedef float E_FLOAT;
|
||||
typedef float2 E_FLOAT2;
|
||||
typedef float4 E_FLOAT4;
|
||||
#else
|
||||
typedef double E_FLOAT;
|
||||
typedef double2 E_FLOAT2;
|
||||
typedef double4 E_FLOAT4;
|
||||
#endif
|
||||
|
||||
struct s_EV_FLOAT {
|
||||
@ -500,12 +470,8 @@ typedef struct s_FEV_FLOAT FEV_FLOAT;
|
||||
|
||||
#if PREC_POS==1
|
||||
typedef float X_FLOAT;
|
||||
typedef float2 X_FLOAT2;
|
||||
typedef float4 X_FLOAT4;
|
||||
#else
|
||||
typedef double X_FLOAT;
|
||||
typedef double2 X_FLOAT2;
|
||||
typedef double4 X_FLOAT4;
|
||||
#endif
|
||||
|
||||
#ifndef PREC_VELOCITIES
|
||||
@ -514,22 +480,14 @@ typedef double4 X_FLOAT4;
|
||||
|
||||
#if PREC_VELOCITIES==1
|
||||
typedef float V_FLOAT;
|
||||
typedef float2 V_FLOAT2;
|
||||
typedef float4 V_FLOAT4;
|
||||
#else
|
||||
typedef double V_FLOAT;
|
||||
typedef double2 V_FLOAT2;
|
||||
typedef double4 V_FLOAT4;
|
||||
#endif
|
||||
|
||||
#if PREC_KSPACE==1
|
||||
typedef float K_FLOAT;
|
||||
typedef float2 K_FLOAT2;
|
||||
typedef float4 K_FLOAT4;
|
||||
#else
|
||||
typedef double K_FLOAT;
|
||||
typedef double2 K_FLOAT2;
|
||||
typedef double4 K_FLOAT4;
|
||||
#endif
|
||||
|
||||
typedef int T_INT;
|
||||
|
||||
474
src/KOKKOS/pack_kokkos.h
Normal file
474
src/KOKKOS/pack_kokkos.h
Normal file
@ -0,0 +1,474 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
SPARTA - Stochastic PArallel Rarefied-gas Time-accurate Analyzer
|
||||
http://sparta.sandia.gov
|
||||
Steve Plimpton, sjplimp@sandia.gov, Michael Gallis, magalli@sandia.gov
|
||||
Sandia National Laboratories
|
||||
|
||||
Copyright (2014) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level SPARTA directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
// loop counters for doing a pack/unpack
|
||||
|
||||
#include "pack.h"
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Pack and unpack functions:
|
||||
|
||||
pack routines copy strided values from data into contiguous locs in buf
|
||||
unpack routines copy contiguous values from buf into strided locs in data
|
||||
different versions of unpack depending on permutation
|
||||
and # of values/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
pack from data -> buf
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "kokkos_type.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
template<class DeviceType>
|
||||
class PackKokkos {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
|
||||
struct pack_3d_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int in_orig = slow*nmid*nfast;
|
||||
const int plane = slow*nstride_plane;
|
||||
const int out_orig = plane + mid*nstride_line;
|
||||
const int in = in_orig + mid*nfast + fast;
|
||||
const int out = out_orig + fast;
|
||||
d_buf[buf_offset + in] = d_data[data_offset + out];
|
||||
}
|
||||
};
|
||||
|
||||
static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast;
|
||||
const int plane = slow*nstride_plane;
|
||||
const int in_orig = plane + mid*nstride_line;
|
||||
const int in = in_orig + fast;
|
||||
const int out = out_orig + mid*nfast + fast;
|
||||
d_data[data_offset + in] = d_buf[buf_offset + out];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, one axis permutation, 1 value/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
|
||||
struct unpack_3d_permute1_1_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast;
|
||||
const int plane = slow*nstride_line;
|
||||
const int in_orig = plane + mid;
|
||||
const int in = in_orig + fast*nstride_plane;
|
||||
const int out = out_orig + mid*nfast + fast;
|
||||
d_data[data_offset + in] = d_buf[buf_offset + out];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, one axis permutation, 2 values/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_permute1_2_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast*2;
|
||||
const int plane = slow*nstride_line;
|
||||
const int in_orig = plane + 2*mid;
|
||||
const int in = in_orig + fast*nstride_plane;
|
||||
const int out = out_orig + 2*mid*nfast + 2*fast;
|
||||
d_data[data_offset + in] = d_buf[buf_offset + out];
|
||||
d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, one axis permutation, nqty values/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_permute1_n_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
int nqty; // # of values/element
|
||||
|
||||
unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
nqty = plan->nqty ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast*nqty;
|
||||
const int plane = slow*nstride_line;
|
||||
const int instart = plane + nqty*mid;
|
||||
int in = instart + nqty*fast*nstride_plane;
|
||||
int out = out_orig + nqty*mid*nfast + nqty*fast;
|
||||
for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, two axis permutation, 1 value/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_permute2_1_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast;
|
||||
const int in_orig = slow + mid*nstride_plane;
|
||||
const int in = in_orig + fast*nstride_line;
|
||||
const int out = out_orig + mid*nfast + fast;
|
||||
d_data[data_offset + in] = d_buf[buf_offset + out];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, two axis permutation, 2 values/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_permute2_2_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
|
||||
unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast*2;
|
||||
const int in_orig = 2*slow + mid*nstride_plane;
|
||||
const int in = in_orig + fast*nstride_line;
|
||||
const int out = out_orig + 2*mid*nfast + 2*fast;
|
||||
d_data[data_offset + in] = d_buf[buf_offset + out];
|
||||
d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
/* ----------------------------------------------------------------------
|
||||
unpack from buf -> data, two axis permutation, nqty values/element
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
struct unpack_3d_permute2_n_functor {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
|
||||
int buf_offset,data_offset;
|
||||
int nfast; // # of elements in fast index
|
||||
int nmid; // # of elements in mid index
|
||||
int nslow; // # of elements in slow index
|
||||
int nstride_line; // stride between successive mid indices
|
||||
int nstride_plane; // stride between successive slow indices
|
||||
int nqty; // # of values/element
|
||||
|
||||
unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
|
||||
d_buf(d_buf_),
|
||||
d_data(d_data_)
|
||||
{
|
||||
buf_offset = buf_offset_ ;
|
||||
data_offset = data_offset_ ;
|
||||
nfast = plan->nfast ;
|
||||
nmid = plan->nmid ;
|
||||
nslow = plan->nslow ;
|
||||
nstride_line = plan->nstride_line ;
|
||||
nstride_plane = plan->nstride_plane;
|
||||
nqty = plan->nqty ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int &index) const {
|
||||
const int fast = index / (nslow * nmid);
|
||||
const int index_new = index - (fast * nslow * nmid);
|
||||
const int mid = index_new / nslow;
|
||||
const int slow = index_new % nslow;
|
||||
const int out_orig = slow*nmid*nfast*nqty;
|
||||
const int instart = nqty*slow + mid*nstride_plane;
|
||||
int in = instart + nqty*fast*nstride_line;
|
||||
int out = out_orig + nqty*mid*nfast + nqty*fast;
|
||||
for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
|
||||
}
|
||||
};
|
||||
|
||||
static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
|
||||
{
|
||||
const int nslow = plan->nslow;
|
||||
const int nmid = plan->nmid;
|
||||
const int nfast = plan->nfast;
|
||||
unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
|
||||
Kokkos::parallel_for(nslow*nmid*nfast,f);
|
||||
DeviceType::fence();
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
@ -25,8 +25,8 @@
|
||||
#include "force.h"
|
||||
#include "pair.h"
|
||||
#include "domain.h"
|
||||
#include "fft3d_wrap.h"
|
||||
#include "remap_wrap.h"
|
||||
#include "fft3d_kokkos.h"
|
||||
#include "remap_kokkos.h"
|
||||
#include "memory_kokkos.h"
|
||||
#include "error.h"
|
||||
#include "atom_masks.h"
|
||||
@ -842,19 +842,19 @@ void PPPMKokkos<DeviceType>::allocate()
|
||||
|
||||
int tmp;
|
||||
|
||||
fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
0,0,&tmp,collective_flag);
|
||||
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
0,0,&tmp,collective_flag);
|
||||
|
||||
fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
0,0,&tmp,collective_flag);
|
||||
remap = new Remap(lmp,world,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
1,0,0,FFT_PRECISION,collective_flag);
|
||||
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
0,0,&tmp,collective_flag);
|
||||
remap = new RemapKokkos<DeviceType>(lmp,world,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
1,0,0,FFT_PRECISION,collective_flag);
|
||||
|
||||
// create ghost grid object for rho and electric field communication
|
||||
|
||||
@ -1783,12 +1783,7 @@ void PPPMKokkos<DeviceType>::brick2fft()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_brick2fft>(0,inum_inout),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_density_fft.template modify<DeviceType>();
|
||||
k_density_fft.template sync<LMPHostType>();
|
||||
remap->perform(density_fft,density_fft,work1);
|
||||
k_density_fft.template modify<LMPHostType>();
|
||||
k_density_fft.template sync<DeviceType>();
|
||||
|
||||
remap->perform(d_density_fft,d_density_fft,d_work1);
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
@ -1830,11 +1825,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik1>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work1.template modify<DeviceType>();
|
||||
k_work1.template sync<LMPHostType>();
|
||||
fft1->compute(work1,work1,1);
|
||||
k_work1.template modify<LMPHostType>();
|
||||
k_work1.template sync<DeviceType>();
|
||||
fft1->compute(d_work1,d_work1,1);
|
||||
|
||||
|
||||
// global energy and virial contribution
|
||||
@ -1898,11 +1889,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik5>(0,inum_fft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -1916,11 +1903,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik7>(0,inum_fft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -1933,11 +1916,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik9>(0,inum_fft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2193,11 +2172,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom1>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2214,11 +2189,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom3>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2230,11 +2201,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom5>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2246,11 +2213,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom7>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2261,11 +2224,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom9>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2277,11 +2236,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom11>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -2293,11 +2248,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
|
||||
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom13>(0,nfft),*this);
|
||||
copymode = 0;
|
||||
|
||||
k_work2.template modify<DeviceType>();
|
||||
k_work2.template sync<LMPHostType>();
|
||||
fft2->compute(work2,work2,-1);
|
||||
k_work2.template modify<LMPHostType>();
|
||||
k_work2.template sync<DeviceType>();
|
||||
fft2->compute(d_work2,d_work2,-1);
|
||||
|
||||
|
||||
copymode = 1;
|
||||
@ -3037,10 +2988,10 @@ int PPPMKokkos<DeviceType>::timing_1d(int n, double &time1d)
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fft1->timing1d(work1,nfft_both,1);
|
||||
fft2->timing1d(work1,nfft_both,-1);
|
||||
fft2->timing1d(work1,nfft_both,-1);
|
||||
fft2->timing1d(work1,nfft_both,-1);
|
||||
fft1->timing1d(d_work1,nfft_both,1);
|
||||
fft2->timing1d(d_work1,nfft_both,-1);
|
||||
fft2->timing1d(d_work1,nfft_both,-1);
|
||||
fft2->timing1d(d_work1,nfft_both,-1);
|
||||
}
|
||||
|
||||
MPI_Barrier(world);
|
||||
@ -3074,10 +3025,10 @@ int PPPMKokkos<DeviceType>::timing_3d(int n, double &time3d)
|
||||
time1 = MPI_Wtime();
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
fft1->compute(work1,work1,1);
|
||||
fft2->compute(work1,work1,-1);
|
||||
fft2->compute(work1,work1,-1);
|
||||
fft2->compute(work1,work1,-1);
|
||||
fft1->compute(d_work1,d_work1,1);
|
||||
fft2->compute(d_work1,d_work1,-1);
|
||||
fft2->compute(d_work1,d_work1,-1);
|
||||
fft2->compute(d_work1,d_work1,-1);
|
||||
}
|
||||
|
||||
MPI_Barrier(world);
|
||||
|
||||
@ -22,11 +22,31 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
|
||||
#ifndef LMP_PPPM_KOKKOS_H
|
||||
#define LMP_PPPM_KOKKOS_H
|
||||
|
||||
#include "pppm.h"
|
||||
#include "gridcomm_kokkos.h"
|
||||
#include "remap_kokkos.h"
|
||||
#include "fft3d_kokkos.h"
|
||||
#include "kokkos_base.h"
|
||||
#include "kokkos_type.h"
|
||||
|
||||
// fix up FFT defines for KOKKOS with CUDA
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
# if defined(FFT_FFTW)
|
||||
# undef FFT_FFTW
|
||||
# endif
|
||||
# if defined(FFT_FFTW3)
|
||||
# undef FFT_FFTW3
|
||||
# endif
|
||||
# if defined(FFT_MKL)
|
||||
# undef FFT_MKL
|
||||
# endif
|
||||
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
|
||||
# define FFT_KISSFFT
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include "pppm.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
struct TagPPPM_setup1{};
|
||||
@ -331,8 +351,8 @@ class PPPMKokkos : public PPPM, public KokkosBase {
|
||||
//double **acons;
|
||||
typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;
|
||||
|
||||
class FFT3d *fft1,*fft2;
|
||||
class Remap *remap;
|
||||
FFT3dKokkos<DeviceType> *fft1,*fft2;
|
||||
RemapKokkos<DeviceType> *remap;
|
||||
GridCommKokkos<DeviceType> *cg;
|
||||
GridCommKokkos<DeviceType> *cg_peratom;
|
||||
|
||||
|
||||
511
src/KOKKOS/remap_kokkos.cpp
Normal file
511
src/KOKKOS/remap_kokkos.cpp
Normal file
@ -0,0 +1,511 @@
|
||||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <mpi.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "remap_kokkos.h"
|
||||
#include "pack_kokkos.h"
|
||||
#include "error.h"
|
||||
|
||||
using namespace LAMMPS_NS;
|
||||
|
||||
#define MIN(A,B) ((A) < (B) ? (A) : (B))
|
||||
#define MAX(A,B) ((A) > (B) ? (A) : (B))
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp) : Pointers(lmp)
|
||||
{
|
||||
plan = NULL;
|
||||
}
|
||||
|
||||
template<class DeviceType>
|
||||
RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
|
||||
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
|
||||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int nqty, int permute, int memory,
|
||||
int precision, int usecollective) : Pointers(lmp)
|
||||
{
|
||||
plan = remap_3d_create_plan_kokkos(comm,
|
||||
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
|
||||
nqty,permute,memory,precision,usecollective);
|
||||
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
RemapKokkos<DeviceType>::~RemapKokkos()
|
||||
{
|
||||
remap_3d_destroy_plan_kokkos(plan);
|
||||
}
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void RemapKokkos<DeviceType>::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf)
|
||||
{
|
||||
remap_3d_kokkos(d_in,d_out,d_buf,plan);
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Data layout for 3d remaps:
|
||||
|
||||
data set of Nfast x Nmid x Nslow elements is owned by P procs
|
||||
each element = nqty contiguous datums
|
||||
on input, each proc owns a subsection of the elements
|
||||
on output, each proc will own a (presumably different) subsection
|
||||
my subsection must not overlap with any other proc's subsection,
|
||||
i.e. the union of all proc's input (or output) subsections must
|
||||
exactly tile the global Nfast x Nmid x Nslow data set
|
||||
when called from C, all subsection indices are
|
||||
C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
|
||||
when called from F77, all subsection indices are
|
||||
F77-style from 1 to N where N = Nfast or Nmid or Nslow
|
||||
a proc can own 0 elements on input or output
|
||||
by specifying hi index < lo index
|
||||
on both input and output, data is stored contiguously on a processor
|
||||
with a fast-varying, mid-varying, and slow-varying index
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Perform 3d remap
|
||||
|
||||
Arguments:
|
||||
in starting address of input data on this proc
|
||||
out starting address of where output data for this proc
|
||||
will be placed (can be same as in)
|
||||
buf extra memory required for remap
|
||||
if memory=0 was used in call to remap_3d_create_plan
|
||||
then buf must be big enough to hold output result
|
||||
i.e. nqty * (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
|
||||
(out_khi-out_klo+1)
|
||||
if memory=1 was used in call to remap_3d_create_plan
|
||||
then buf is not used, can just be a dummy pointer
|
||||
plan plan returned by previous call to remap_3d_create_plan
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void RemapKokkos<DeviceType>::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf,
|
||||
struct remap_plan_3d_kokkos<DeviceType> *plan)
|
||||
{
|
||||
// collective flag not yet supported
|
||||
|
||||
// use point-to-point communication
|
||||
|
||||
int i,isend,irecv;
|
||||
typename AT::t_FFT_SCALAR_1d d_scratch;
|
||||
|
||||
if (plan->memory == 0)
|
||||
d_scratch = d_buf;
|
||||
else
|
||||
d_scratch = plan->d_scratch;
|
||||
|
||||
// post all recvs into scratch space
|
||||
|
||||
for (irecv = 0; irecv < plan->nrecv; irecv++) {
|
||||
FFT_SCALAR* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv];
|
||||
MPI_Irecv(scratch,plan->recv_size[irecv],
|
||||
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
|
||||
plan->comm,&plan->request[irecv]);
|
||||
}
|
||||
|
||||
// send all messages to other procs
|
||||
|
||||
for (isend = 0; isend < plan->nsend; isend++) {
|
||||
int in_offset = plan->send_offset[isend];
|
||||
plan->pack(d_in,in_offset,
|
||||
plan->d_sendbuf,0,&plan->packplan[isend]);
|
||||
MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR,
|
||||
plan->send_proc[isend],0,plan->comm);
|
||||
}
|
||||
|
||||
// copy in -> scratch -> out for self data
|
||||
|
||||
if (plan->self) {
|
||||
isend = plan->nsend;
|
||||
irecv = plan->nrecv;
|
||||
|
||||
int in_offset = plan->send_offset[isend];
|
||||
int scratch_offset = plan->recv_bufloc[irecv];
|
||||
int out_offset = plan->recv_offset[irecv];
|
||||
|
||||
plan->pack(d_in,in_offset,
|
||||
d_scratch,scratch_offset,
|
||||
&plan->packplan[isend]);
|
||||
plan->unpack(d_scratch,scratch_offset,
|
||||
d_out,out_offset,&plan->unpackplan[irecv]);
|
||||
}
|
||||
|
||||
// unpack all messages from scratch -> out
|
||||
|
||||
for (i = 0; i < plan->nrecv; i++) {
|
||||
MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
|
||||
|
||||
int scratch_offset = plan->recv_bufloc[irecv];
|
||||
int out_offset = plan->recv_offset[irecv];
|
||||
|
||||
plan->unpack(d_scratch,scratch_offset,
|
||||
d_out,out_offset,&plan->unpackplan[irecv]);
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Create plan for performing a 3d remap
|
||||
|
||||
Arguments:
|
||||
comm MPI communicator for the P procs which own the data
|
||||
in_ilo,in_ihi input bounds of data I own in fast index
|
||||
in_jlo,in_jhi input bounds of data I own in mid index
|
||||
in_klo,in_khi input bounds of data I own in slow index
|
||||
out_ilo,out_ihi output bounds of data I own in fast index
|
||||
out_jlo,out_jhi output bounds of data I own in mid index
|
||||
out_klo,out_khi output bounds of data I own in slow index
|
||||
nqty # of datums per element
|
||||
permute permutation in storage order of indices on output
|
||||
0 = no permutation
|
||||
1 = permute once = mid->fast, slow->mid, fast->slow
|
||||
2 = permute twice = slow->fast, fast->mid, mid->slow
|
||||
memory user provides buffer memory for remap or system does
|
||||
0 = user provides memory
|
||||
1 = system provides memory
|
||||
precision precision of data
|
||||
1 = single precision (4 bytes per datum)
|
||||
2 = double precision (8 bytes per datum)
|
||||
usecollective whether to use collective MPI or point-to-point
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_create_plan_kokkos(
|
||||
MPI_Comm comm,
|
||||
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
|
||||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int nqty, int permute, int memory, int precision, int usecollective)
|
||||
{
|
||||
|
||||
struct remap_plan_3d_kokkos<DeviceType> *plan;
|
||||
struct extent_3d *inarray, *outarray;
|
||||
struct extent_3d in,out,overlap;
|
||||
int i,iproc,nsend,nrecv,ibuf,size,me,nprocs;
|
||||
|
||||
// query MPI info
|
||||
|
||||
MPI_Comm_rank(comm,&me);
|
||||
MPI_Comm_size(comm,&nprocs);
|
||||
|
||||
// allocate memory for plan data struct
|
||||
|
||||
plan = new struct remap_plan_3d_kokkos<DeviceType>;
|
||||
if (plan == NULL) return NULL;
|
||||
plan->usecollective = usecollective;
|
||||
|
||||
// store parameters in local data structs
|
||||
|
||||
in.ilo = in_ilo;
|
||||
in.ihi = in_ihi;
|
||||
in.isize = in.ihi - in.ilo + 1;
|
||||
|
||||
in.jlo = in_jlo;
|
||||
in.jhi = in_jhi;
|
||||
in.jsize = in.jhi - in.jlo + 1;
|
||||
|
||||
in.klo = in_klo;
|
||||
in.khi = in_khi;
|
||||
in.ksize = in.khi - in.klo + 1;
|
||||
|
||||
out.ilo = out_ilo;
|
||||
out.ihi = out_ihi;
|
||||
out.isize = out.ihi - out.ilo + 1;
|
||||
|
||||
out.jlo = out_jlo;
|
||||
out.jhi = out_jhi;
|
||||
out.jsize = out.jhi - out.jlo + 1;
|
||||
|
||||
out.klo = out_klo;
|
||||
out.khi = out_khi;
|
||||
out.ksize = out.khi - out.klo + 1;
|
||||
|
||||
// combine output extents across all procs
|
||||
|
||||
inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
|
||||
if (inarray == NULL) return NULL;
|
||||
|
||||
outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
|
||||
if (outarray == NULL) return NULL;
|
||||
|
||||
MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
|
||||
outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
|
||||
|
||||
// count send collides, including self
|
||||
|
||||
nsend = 0;
|
||||
iproc = me;
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
iproc++;
|
||||
if (iproc == nprocs) iproc = 0;
|
||||
nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
|
||||
}
|
||||
|
||||
// malloc space for send info
|
||||
|
||||
if (nsend) {
|
||||
plan->pack = PackKokkos<DeviceType>::pack_3d;
|
||||
|
||||
plan->send_offset = (int *) malloc(nsend*sizeof(int));
|
||||
plan->send_size = (int *) malloc(nsend*sizeof(int));
|
||||
plan->send_proc = (int *) malloc(nsend*sizeof(int));
|
||||
plan->packplan = (struct pack_plan_3d *)
|
||||
malloc(nsend*sizeof(struct pack_plan_3d));
|
||||
|
||||
if (plan->send_offset == NULL || plan->send_size == NULL ||
|
||||
plan->send_proc == NULL || plan->packplan == NULL) return NULL;
|
||||
}
|
||||
|
||||
// store send info, with self as last entry
|
||||
|
||||
nsend = 0;
|
||||
iproc = me;
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
iproc++;
|
||||
if (iproc == nprocs) iproc = 0;
|
||||
if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
|
||||
plan->send_proc[nsend] = iproc;
|
||||
plan->send_offset[nsend] = nqty *
|
||||
((overlap.klo-in.klo)*in.jsize*in.isize +
|
||||
((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
|
||||
plan->packplan[nsend].nfast = nqty*overlap.isize;
|
||||
plan->packplan[nsend].nmid = overlap.jsize;
|
||||
plan->packplan[nsend].nslow = overlap.ksize;
|
||||
plan->packplan[nsend].nstride_line = nqty*in.isize;
|
||||
plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
|
||||
plan->packplan[nsend].nqty = nqty;
|
||||
plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
|
||||
nsend++;
|
||||
}
|
||||
}
|
||||
|
||||
// plan->nsend = # of sends not including self
|
||||
|
||||
if (nsend && plan->send_proc[nsend-1] == me) {
|
||||
if (plan->usecollective) // for collectives include self in nsend list
|
||||
plan->nsend = nsend;
|
||||
else
|
||||
plan->nsend = nsend - 1;
|
||||
} else
|
||||
plan->nsend = nsend;
|
||||
|
||||
// combine input extents across all procs
|
||||
|
||||
MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
|
||||
inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
|
||||
|
||||
// count recv collides, including self
|
||||
|
||||
nrecv = 0;
|
||||
iproc = me;
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
iproc++;
|
||||
if (iproc == nprocs) iproc = 0;
|
||||
nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
|
||||
}
|
||||
|
||||
// malloc space for recv info
|
||||
|
||||
if (nrecv) {
|
||||
if (permute == 0)
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d;
|
||||
else if (permute == 1) {
|
||||
if (nqty == 1)
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
|
||||
else if (nqty == 2)
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
|
||||
else
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
|
||||
}
|
||||
else if (permute == 2) {
|
||||
if (nqty == 1)
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
|
||||
else if (nqty == 2)
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
|
||||
else
|
||||
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
|
||||
}
|
||||
|
||||
plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
|
||||
plan->recv_size = (int *) malloc(nrecv*sizeof(int));
|
||||
plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
|
||||
plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
|
||||
plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
|
||||
plan->unpackplan = (struct pack_plan_3d *)
|
||||
malloc(nrecv*sizeof(struct pack_plan_3d));
|
||||
|
||||
if (plan->recv_offset == NULL || plan->recv_size == NULL ||
|
||||
plan->recv_proc == NULL || plan->recv_bufloc == NULL ||
|
||||
plan->request == NULL || plan->unpackplan == NULL) return NULL;
|
||||
}
|
||||
|
||||
// store recv info, with self as last entry
|
||||
|
||||
ibuf = 0;
|
||||
nrecv = 0;
|
||||
iproc = me;
|
||||
|
||||
for (i = 0; i < nprocs; i++) {
|
||||
iproc++;
|
||||
if (iproc == nprocs) iproc = 0;
|
||||
if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
|
||||
plan->recv_proc[nrecv] = iproc;
|
||||
plan->recv_bufloc[nrecv] = ibuf;
|
||||
|
||||
if (permute == 0) {
|
||||
plan->recv_offset[nrecv] = nqty *
|
||||
((overlap.klo-out.klo)*out.jsize*out.isize +
|
||||
(overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
|
||||
plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
|
||||
plan->unpackplan[nrecv].nmid = overlap.jsize;
|
||||
plan->unpackplan[nrecv].nslow = overlap.ksize;
|
||||
plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
|
||||
plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
|
||||
plan->unpackplan[nrecv].nqty = nqty;
|
||||
}
|
||||
else if (permute == 1) {
|
||||
plan->recv_offset[nrecv] = nqty *
|
||||
((overlap.ilo-out.ilo)*out.ksize*out.jsize +
|
||||
(overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
|
||||
plan->unpackplan[nrecv].nfast = overlap.isize;
|
||||
plan->unpackplan[nrecv].nmid = overlap.jsize;
|
||||
plan->unpackplan[nrecv].nslow = overlap.ksize;
|
||||
plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
|
||||
plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
|
||||
plan->unpackplan[nrecv].nqty = nqty;
|
||||
}
|
||||
else {
|
||||
plan->recv_offset[nrecv] = nqty *
|
||||
((overlap.jlo-out.jlo)*out.isize*out.ksize +
|
||||
(overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
|
||||
plan->unpackplan[nrecv].nfast = overlap.isize;
|
||||
plan->unpackplan[nrecv].nmid = overlap.jsize;
|
||||
plan->unpackplan[nrecv].nslow = overlap.ksize;
|
||||
plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
|
||||
plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
|
||||
plan->unpackplan[nrecv].nqty = nqty;
|
||||
}
|
||||
|
||||
plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
|
||||
ibuf += plan->recv_size[nrecv];
|
||||
nrecv++;
|
||||
}
|
||||
}
|
||||
|
||||
// plan->nrecv = # of recvs not including self
|
||||
// for collectives include self in the nsend list
|
||||
|
||||
if (nrecv && plan->recv_proc[nrecv-1] == me) {
|
||||
if (plan->usecollective) plan->nrecv = nrecv;
|
||||
else plan->nrecv = nrecv - 1;
|
||||
} else plan->nrecv = nrecv;
|
||||
|
||||
// init remaining fields in remap plan
|
||||
|
||||
plan->memory = memory;
|
||||
|
||||
if (nrecv == plan->nrecv) plan->self = 0;
|
||||
else plan->self = 1;
|
||||
|
||||
// free locally malloced space
|
||||
|
||||
free(inarray);
|
||||
free(outarray);
|
||||
|
||||
// find biggest send message (not including self) and malloc space for it
|
||||
|
||||
size = 0;
|
||||
for (nsend = 0; nsend < plan->nsend; nsend++)
|
||||
size = MAX(size,plan->send_size[nsend]);
|
||||
|
||||
if (size) {
|
||||
plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
|
||||
if (!plan->d_sendbuf.data()) return NULL;
|
||||
}
|
||||
|
||||
// if requested, allocate internal scratch space for recvs,
|
||||
// only need it if I will receive any data (including self)
|
||||
|
||||
if (memory == 1) {
|
||||
if (nrecv > 0) {
|
||||
plan->d_scratch =
|
||||
typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
|
||||
if (!plan->d_scratch.data()) return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// not using collective - dup comm
|
||||
|
||||
MPI_Comm_dup(comm,&plan->comm);
|
||||
|
||||
// return pointer to plan
|
||||
|
||||
return plan;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Destroy a 3d remap plan
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *plan)
|
||||
{
|
||||
if (plan == NULL) return;
|
||||
|
||||
// free MPI communicator
|
||||
|
||||
if (!((plan->usecollective) && (plan->commringlen == 0)))
|
||||
MPI_Comm_free(&plan->comm);
|
||||
|
||||
// free internal arrays
|
||||
|
||||
if (plan->nsend || plan->self) {
|
||||
free(plan->send_offset);
|
||||
free(plan->send_size);
|
||||
free(plan->send_proc);
|
||||
free(plan->packplan);
|
||||
}
|
||||
|
||||
if (plan->nrecv || plan->self) {
|
||||
free(plan->recv_offset);
|
||||
free(plan->recv_size);
|
||||
free(plan->recv_proc);
|
||||
free(plan->recv_bufloc);
|
||||
free(plan->request);
|
||||
free(plan->unpackplan);
|
||||
}
|
||||
|
||||
// free plan itself
|
||||
|
||||
delete plan;
|
||||
}
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
template class RemapKokkos<LMPDeviceType>;
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
template class RemapKokkos<LMPHostType>;
|
||||
#endif
|
||||
}
|
||||
88
src/KOKKOS/remap_kokkos.h
Normal file
88
src/KOKKOS/remap_kokkos.h
Normal file
@ -0,0 +1,88 @@
|
||||
/* -*- c++ -*- ----------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LMP_REMAP_KOKKOS_H
|
||||
#define LMP_REMAP_KOKKOS_H
|
||||
|
||||
#include "pointers.h"
|
||||
#include <mpi.h>
|
||||
#include "fftdata_kokkos.h"
|
||||
#include "remap.h"
|
||||
#include "kokkos_type.h"
|
||||
|
||||
namespace LAMMPS_NS {
|
||||
|
||||
// details of how to do a 3d remap
|
||||
|
||||
template<class DeviceType>
|
||||
struct remap_plan_3d_kokkos {
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
typename AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
|
||||
typename AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
|
||||
void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
|
||||
// which pack function to use
|
||||
void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
|
||||
// which unpack function to use
|
||||
int *send_offset; // extraction loc for each send
|
||||
int *send_size; // size of each send message
|
||||
int *send_proc; // proc to send each message to
|
||||
struct pack_plan_3d *packplan; // pack plan for each send message
|
||||
int *recv_offset; // insertion loc for each recv
|
||||
int *recv_size; // size of each recv message
|
||||
int *recv_proc; // proc to recv each message from
|
||||
int *recv_bufloc; // offset in scratch buf for each recv
|
||||
MPI_Request *request; // MPI request for each posted recv
|
||||
struct pack_plan_3d *unpackplan; // unpack plan for each recv message
|
||||
int nrecv; // # of recvs from other procs
|
||||
int nsend; // # of sends to other procs
|
||||
int self; // whether I send/recv with myself
|
||||
int memory; // user provides scratch space or not
|
||||
MPI_Comm comm; // group of procs performing remap
|
||||
int usecollective; // use collective or point-to-point MPI
|
||||
int commringlen; // length of commringlist
|
||||
int *commringlist; // ranks on communication ring of this plan
|
||||
};
|
||||
|
||||
template<class DeviceType>
|
||||
class RemapKokkos : protected Pointers {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
typedef ArrayTypes<DeviceType> AT;
|
||||
RemapKokkos(class LAMMPS *);
|
||||
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
|
||||
int,int,int,int,int,int,int,int,int,int,int);
|
||||
~RemapKokkos();
|
||||
void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d);
|
||||
|
||||
struct remap_plan_3d_kokkos<DeviceType> *plan;
|
||||
|
||||
void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos<DeviceType> *);
|
||||
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
|
||||
int, int, int, int, int, int,
|
||||
int, int, int, int, int, int,
|
||||
int, int, int, int, int);
|
||||
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ERROR/WARNING messages:
|
||||
|
||||
E: Could not create 3d remap plan
|
||||
|
||||
The FFT setup in pppm failed.
|
||||
|
||||
*/
|
||||
@ -25,6 +25,9 @@
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include "remap.h"
|
||||
#if defined(_OPENMP)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#ifdef FFT_KISS
|
||||
/* include kissfft implementation */
|
||||
@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan(
|
||||
int scaled, int permute, int *nbuf, int usecollective)
|
||||
{
|
||||
struct fft_plan_3d *plan;
|
||||
int me,nprocs;
|
||||
int me,nprocs,nthreads;
|
||||
int flag,remapflag;
|
||||
int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
|
||||
int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
|
||||
@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan(
|
||||
MPI_Comm_rank(comm,&me);
|
||||
MPI_Comm_size(comm,&nprocs);
|
||||
|
||||
#if defined(_OPENMP)
|
||||
// query OpenMP info.
|
||||
// should have been initialized systemwide in Comm class constructor
|
||||
nthreads = omp_get_max_threads();
|
||||
#else
|
||||
nthreads = 1;
|
||||
#endif
|
||||
|
||||
// compute division of procs in 2 dimensions not on-processor
|
||||
|
||||
bifactor(nprocs,&np1,&np2);
|
||||
@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan(
|
||||
*/
|
||||
|
||||
#elif defined(FFT_FFTW3)
|
||||
#if defined(FFT_FFTW_THREADS)
|
||||
if (nthreads > 1) {
|
||||
FFTW_API(init_threads)();
|
||||
FFTW_API(plan_with_nthreads)(nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
plan->plan_fast_forward =
|
||||
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
|
||||
NULL,&nfast,1,plan->length1,
|
||||
@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan)
|
||||
FFTW_API(destroy_plan)(plan->plan_mid_backward);
|
||||
FFTW_API(destroy_plan)(plan->plan_fast_forward);
|
||||
FFTW_API(destroy_plan)(plan->plan_fast_backward);
|
||||
#if defined(FFT_FFTW_THREADS)
|
||||
FFTW_API(cleanup_threads)();
|
||||
#endif
|
||||
#else
|
||||
if (plan->cfg_slow_forward != plan->cfg_fast_forward &&
|
||||
plan->cfg_slow_forward != plan->cfg_mid_forward) {
|
||||
|
||||
@ -26,6 +26,8 @@ KSpaceStyle(pppm,PPPM)
|
||||
#define LMP_FFT_LIB "FFTW3"
|
||||
#elif defined(FFT_MKL)
|
||||
#define LMP_FFT_LIB "MKL FFT"
|
||||
#elif defined(FFT_CUFFT)
|
||||
#define LMP_FFT_LIB "cuFFT"
|
||||
#else
|
||||
#define LMP_FFT_LIB "KISS FFT"
|
||||
#endif
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1
|
||||
# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc
|
||||
|
||||
SHELL = /bin/sh
|
||||
|
||||
@ -44,9 +44,12 @@ LMP_INC = -DLAMMPS_GZIP
|
||||
# PATH = path for MPI library
|
||||
# LIB = name of MPI library
|
||||
|
||||
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include
|
||||
MY_MPI_EXE = $(shell which mpicxx)
|
||||
MY_MPI_PATH = $(dir ${MY_MPI_EXE})
|
||||
|
||||
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I${MY_MPI_PATH}../include
|
||||
MPI_PATH =
|
||||
MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm
|
||||
MPI_LIB = -L${MY_MPI_PATH}../lib -lmpi_ibm
|
||||
|
||||
# FFT library
|
||||
# see discussion in Section 2.2 (step 6) of manaul
|
||||
@ -55,9 +58,9 @@ MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.
|
||||
# PATH = path for FFT library
|
||||
# LIB = name of FFT library
|
||||
|
||||
FFT_INC =
|
||||
FFT_INC = -DFFT_CUFFT
|
||||
FFT_PATH =
|
||||
FFT_LIB =
|
||||
FFT_LIB = -lcufft
|
||||
|
||||
# JPEG and/or PNG library
|
||||
# see discussion in Section 2.2 (step 7) of manual
|
||||
|
||||
@ -55,9 +55,9 @@ MPI_LIB =
|
||||
# PATH = path for FFT library
|
||||
# LIB = name of FFT library
|
||||
|
||||
FFT_INC =
|
||||
FFT_INC = -DFFT_CUFFT
|
||||
FFT_PATH =
|
||||
FFT_LIB =
|
||||
FFT_LIB = -lcufft
|
||||
|
||||
# JPEG and/or PNG library
|
||||
# see discussion in Section 2.2 (step 7) of manual
|
||||
|
||||
@ -1 +1 @@
|
||||
#define LAMMPS_VERSION "09 Jan 2020"
|
||||
#define LAMMPS_VERSION "24 Jan 2020"
|
||||
|
||||
Reference in New Issue
Block a user