Merge branch 'master' into remove-txt-files

# Conflicts:
#	doc/src/Build_settings.rst
This commit is contained in:
Axel Kohlmeyer
2020-01-24 21:32:24 -05:00
26 changed files with 2980 additions and 167 deletions

View File

@ -739,4 +739,20 @@ if(PKG_KSPACE)
else()
message(STATUS "Using double precision FFTs")
endif()
if(FFT_FFTW_THREADS)
message(STATUS "Using threaded FFTs")
else()
message(STATUS "Using non-threaded FFTs")
endif()
if(PKG_KOKKOS)
if(KOKKOS_ENABLE_CUDA)
if (${FFT} STREQUAL "KISS")
message(STATUS "Kokkos FFT: KISS")
else()
message(STATUS "Kokkos FFT: cuFFT")
endif()
else()
message(STATUS "Kokkos FFT: ${FFT}")
endif()
endif()
endif()

View File

@ -1,20 +1,22 @@
# - Find fftw3
# Find the native FFTW3 headers and libraries.
# Find the native double precision FFTW3 headers and libraries.
#
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
# FFTW3_LIBRARIES - List of libraries when using fftw3.
# FFTW3_FOUND - True if fftw3 found.
# FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
# FFTW3_LIBRARIES - List of libraries when using fftw3.
# FFTW3_OMP_LIBRARIES - List of libraries when using fftw3.
# FFTW3_FOUND - True if fftw3 found.
#
find_package(PkgConfig)
pkg_check_modules(PC_FFTW3 fftw3)
find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS})
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
@ -22,4 +24,4 @@ include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY)

View File

@ -1,8 +1,8 @@
# - Find fftw3f
# Find the native FFTW3F headers and libraries.
# Find the native single precision FFTW3 headers and libraries.
#
# FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc.
# FFTW3F_LIBRARIES - List of libraries when using fftw3f.
# FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3.
# FFTW3F_FOUND - True if fftw3f found.
#
@ -10,11 +10,12 @@ find_package(PkgConfig)
pkg_check_modules(PC_FFTW3F fftw3f)
find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS})
find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS})
find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS})
set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR})
set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY})
include(FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE
@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR)
mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY )
mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY)

View File

@ -10,7 +10,7 @@ if(PKG_KOKKOS)
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
@ -39,7 +39,15 @@ if(PKG_KOKKOS)
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
if(PKG_KSPACE)
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp)
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp)
if(KOKKOS_ENABLE_CUDA)
if(NOT ${FFT} STREQUAL "KISS")
add_definitions(-DFFT_CUFFT)
list(APPEND LAMMPS_LINK_LIBS cufft)
endif()
endif()
endif()
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")

View File

@ -21,6 +21,20 @@ if(PKG_KSPACE)
add_definitions(-DFFT_FFTW3)
include_directories(${${FFTW}_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES})
if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
option(FFT_FFTW_THREADS "Use threaded FFT library" ON)
else()
option(FFT_FFTW_THREADS "Use threaded FFT library" OFF)
endif()
if(FFT_FFTW_THREADS)
if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
add_definitions(-DFFT_FFTW_THREADS)
list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES})
else()
message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS")
endif()
endif()
elseif(FFT STREQUAL "MKL")
find_package(MKL REQUIRED)
add_definitions(-DFFT_MKL)

View File

@ -1,4 +1,4 @@
.TH LAMMPS "9 January 2020" "2020-01-09"
.TH LAMMPS "24 January 2020" "2020-01-24"
.SH NAME
.B LAMMPS
\- Molecular Dynamics Simulator.

View File

@ -350,10 +350,12 @@ For NVIDIA GPUs using CUDA:
KOKKOS_DEVICES = Cuda
KOKKOS_ARCH = archCPU,archGPU # archCPU = CPU from list above that is hosting the GPU
# archGPU = GPU from list above
FFT_INC = -DFFT_CUFFT # enable use of cuFFT (optional)
FFT_LIB = -lcufft # link to cuFFT library
For GPUs, you also need these 2 lines in your Makefile.machine before
the CC line is defined, in this case for use with OpenMPI mpicxx. The
2 lines define a nvcc wrapper compiler, which will use nvcc for
For GPUs, you also need the following 2 lines in your Makefile.machine
before the CC line is defined, in this case for use with OpenMPI mpicxx.
The 2 lines define a nvcc wrapper compiler, which will use nvcc for
compiling CUDA files and use a C++ compiler for non-Kokkos, non-CUDA
files.

View File

@ -49,13 +49,19 @@ through the CMAKE\_CXX\_FLAGS variable. Example for CentOS 7:
-D CMAKE_CXX_FLAGS="-O3 -g -fopenmp -DNDEBUG -std=c++11"
**Makefile.machine setting**\ :
**Makefile.machine setting**\ to bypass the C++11 test and compile in C++98 mode:
.. parsed-literal::
LMP_INC = -DLAMMPS_CXX98
**Makefile.machine setting**\ to enable the C++11 with older (but not too old) GNU c++ (e.g. on CentOS 7):
.. parsed-literal::
CCFLAGS = -g -O3 -std=c++11
----------
@ -86,14 +92,19 @@ LAMMPS can use them if they are available on your system.
an exception to the rule that all CMake variables can be specified
with lower-case values.
Usually these settings are all that is needed. If CMake cannot find
the FFT library, you can set these variables:
Usually these settings are all that is needed. If FFTW3 is selected,
then CMake will try to detect, if threaded FFTW libraries are available
and enable them by default. This setting is independent of whether
OpenMP threads are enabled and a packages like KOKKOS or USER-OMP is
used. If CMake cannot detect the FFT library, you can set these variables
to assist:
.. parsed-literal::
-D FFTW3_INCLUDE_DIRS=path # path to FFTW3 include files
-D FFTW3_LIBRARIES=path # path to FFTW3 libraries
-D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries
-D MKL_INCLUDE_DIRS=path # ditto for Intel MKL library
-D MKL_LIBRARIES=path
@ -105,6 +116,7 @@ the FFT library, you can set these variables:
FFT_INC = -DFFT_FFTW3 # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS
# default is KISS if not specified
FFT_INC = -DFFT_SINGLE # do not specify for double precision
FFT_INC = -DFFT_FFTW_THREADS # enable using threaded FFTW3 libraries
FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY
# default is FFT\_PACK\_ARRAY if not specified
@ -115,6 +127,7 @@ the FFT library, you can set these variables:
FFT_INC = -I/usr/local/include
FFT_PATH = -L/usr/local/lib
FFT_LIB = -lfftw3 # FFTW3 double precision
FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS)
FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision
FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler
FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier
@ -126,16 +139,19 @@ FFT\_LIB with the appropriate FFT libraries to include in the link.
**CMake and make info**\ :
The `KISS FFT library <http://kissfft.sf.net>`_ is included in the LAMMPS
distribution. It is portable across all platforms. Depending on the
size of the FFTs and the number of processors used, the other
libraries listed here can be faster.
distribution. It is portable across all platforms. Depending on the size
of the FFTs and the number of processors used, the other libraries listed
here can be faster.
However, note that long-range Coulombics are only a portion of the
per-timestep CPU cost, FFTs are only a portion of long-range
Coulombics, and 1d FFTs are only a portion of the FFT cost (parallel
communication can be costly). A breakdown of these timings is printed
to the screen at the end of a run using the :doc:`kspace_style pppm <kspace_style>` command. The :doc:`Run output <Run_output>`
doc page gives more details.
to the screen at the end of a run when using the
:doc:`kspace_style pppm <kspace_style>` command. The :doc:`Run output <Run_output>`
doc page gives more details. A more detailed (and time consuming)
report of the FFT performance is generated with the
:doc:`kspace_modify fftbench yes <kspace_modify>` command.
FFTW is a fast, portable FFT library that should also work on any
platform and can be faster than the KISS FFT library. You can
@ -166,7 +182,7 @@ When using -DFFT\_SINGLE with FFTW3 you may need to build the FFTW
library a second time with support for single-precision.
For FFTW3, do the following, which should produce the additional
library libfftw3f.a
library libfftw3f.a or libfftw3f.so.
.. parsed-literal::

View File

@ -404,9 +404,9 @@ calculation can be performed concurrently on the GPU while other
calculations for non-bonded and bonded force calculation are performed
on the CPU.
The *pppm/kk* style also performs charge assignment and force
interpolation calculations on the GPU while the FFTs themselves are
calculated on the CPU in non-threaded mode.
The *pppm/kk* style performs charge assignment and force interpolation
calculations, along with the FFTs themselves, on the GPU or (optionally) threaded
on the CPU when using OpenMP and FFTW3.
These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
USER-OMP, and OPT packages respectively. They are only enabled if

View File

@ -97,6 +97,9 @@ action dihedral_opls_kokkos.cpp dihedral_opls.cpp
action dihedral_opls_kokkos.h dihedral_opls.h
action domain_kokkos.cpp
action domain_kokkos.h
action fftdata_kokkos.h fft3d.h
action fft3d_kokkos.cpp fft3d.cpp
action fft3d_kokkos.h fft3d.h
action fix_deform_kokkos.cpp
action fix_deform_kokkos.h
action fix_enforce2d_kokkos.cpp
@ -153,6 +156,7 @@ action improper_class2_kokkos.cpp improper_class2.cpp
action improper_class2_kokkos.h improper_class2.h
action improper_harmonic_kokkos.cpp improper_harmonic.cpp
action improper_harmonic_kokkos.h improper_harmonic.h
action kissfft_kokkos.h kissfft.h
action kokkos.cpp
action kokkos.h
action kokkos_base.h
@ -189,6 +193,7 @@ action min_kokkos.cpp
action min_kokkos.h
action min_linesearch_kokkos.cpp
action min_linesearch_kokkos.h
action pack_kokkos.h pack.h
action pair_buck_coul_cut_kokkos.cpp
action pair_buck_coul_cut_kokkos.h
action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp
@ -285,6 +290,8 @@ action rand_pool_wrap_kokkos.cpp
action rand_pool_wrap_kokkos.h
action region_block_kokkos.cpp
action region_block_kokkos.h
action remap_kokkos.cpp remap.cpp
action remap_kokkos.h remap.h
action sna_kokkos.h sna.h
action sna_kokkos_impl.h sna.cpp
action verlet_kokkos.cpp

905
src/KOKKOS/fft3d_kokkos.cpp Normal file
View File

@ -0,0 +1,905 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Stan Moore (SNL), Sam Mish (U.C. Davis)
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "fft3d_kokkos.h"
#include "remap_kokkos.h"
#include "kokkos_type.h"
#include "error.h"
#include "kokkos.h"
using namespace LAMMPS_NS;
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MAX(A,B) ((A) > (B) ? (A) : (B))
/* ---------------------------------------------------------------------- */
template<class DeviceType>
FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective) :
Pointers(lmp)
{
int nthreads = lmp->kokkos->nthreads;
int ngpus = lmp->kokkos->ngpus;
ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
#if defined(FFT_MKL)
if (ngpus > 0 && execution_space == Device)
lmp->error->all(FLERR,"Cannot use the MKL library with Kokkos CUDA on GPUs");
#elif defined(FFT_FFTW3)
if (ngpus > 0 && execution_space == Device)
lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs");
#elif defined(FFT_CUFFT)
if (ngpus > 0 && execution_space == Host)
lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs");
#elif defined(FFT_KISSFFT)
// The compiler can't statically determine the stack size needed for
// recursive function calls in KISS FFT and the default per-thread
// stack size on GPUs needs to be increased to prevent stack overflows
// for reasonably sized FFTs
#if defined (KOKKOS_ENABLE_CUDA)
size_t stack_size;
cudaDeviceGetLimit(&stack_size,cudaLimitStackSize);
if (stack_size < 2048)
cudaDeviceSetLimit(cudaLimitStackSize,2048);
#endif
#endif
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
scaled,permute,nbuf,usecollective,nthreads);
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
FFT3dKokkos<DeviceType>::~FFT3dKokkos()
{
fft_3d_destroy_plan_kokkos(plan);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag)
{
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2);
typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2);
fft_3d_kokkos(d_in_data,d_out_data,flag,plan);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag)
{
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size());
fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan);
}
/* ----------------------------------------------------------------------
Data layout for 3d FFTs:
data set of Nfast x Nmid x Nslow elements is owned by P procs
on input, each proc owns a subsection of the elements
on output, each proc will own a (possibly different) subsection
my subsection must not overlap with any other proc's subsection,
i.e. the union of all proc's input (or output) subsections must
exactly tile the global Nfast x Nmid x Nslow data set
when called from C, all subsection indices are
C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
when called from F77, all subsection indices are
F77-style from 1 to N where N = Nfast or Nmid or Nslow
a proc can own 0 elements on input or output
by specifying hi index < lo index
on both input and output, data is stored contiguously on a processor
with a fast-varying, mid-varying, and slow-varying index
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Perform 3d FFT
Arguments:
in starting address of input data on this proc
out starting address of where output data for this proc
will be placed (can be same as in)
flag 1 for forward FFT, -1 for inverse FFT
plan plan returned by previous call to fft_3d_create_plan
------------------------------------------------------------------------- */
template<class DeviceType>
struct norm_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_DATA_1d_um d_out;
int norm;
norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_):
d_out(d_out_),norm(norm_) {}
KOKKOS_INLINE_FUNCTION
void operator() (const int &i) const {
#if defined(FFT_FFTW3) || defined(FFT_CUFFT)
FFT_SCALAR* out_ptr = (FFT_SCALAR *)(d_out.data()+i);
*(out_ptr++) *= norm;
*(out_ptr++) *= norm;
#elif defined(FFT_MKL)
d_out(i) *= norm;
#else
d_out(i,0) *= norm;
d_out(i,1) *= norm;
#endif
}
};
#ifdef FFT_KISSFFT
template<class DeviceType>
struct kiss_fft_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_DATA_1d_um d_data,d_tmp;
kiss_fft_state_kokkos<DeviceType> st;
int length;
kiss_fft_functor() {}
kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos<DeviceType> &st_, int length_):
d_data(d_data_),
d_tmp(d_tmp_),
st(st_)
{
length = length_;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &i) const {
const int offset = i*length;
KissFFTKokkos<DeviceType>::kiss_fft_kokkos(st,d_data,d_tmp,offset);
}
};
#endif
template<class DeviceType>
void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos<DeviceType> *plan)
{
int total,length;
typename AT::t_FFT_DATA_1d d_data,d_copy;
typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar;
// pre-remap to prepare for 1st FFTs if needed
// copy = loc for remap result
if (plan->pre_plan) {
if (plan->pre_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size());
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar,
d_scratch_scalar, plan->pre_plan);
d_data = d_copy;
} else d_data = d_in;
// 1d FFTs along fast axis
total = plan->total1;
length = plan->length1;
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data());
else
DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
#else
typename AT::t_FFT_DATA_1d d_tmp =
typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
kiss_fft_functor<DeviceType> f;
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length);
else
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length);
Kokkos::parallel_for(total/length,f);
d_data = d_tmp;
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
#endif
// 1st mid-remap to prepare for 2nd FFTs
// copy = loc for remap result
if (plan->mid1_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2);
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2);
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2);
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
d_scratch_scalar, plan->mid1_plan);
d_data = d_copy;
// 1d FFTs along mid axis
total = plan->total2;
length = plan->length2;
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data());
else
DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
#else
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length);
else
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length);
Kokkos::parallel_for(total/length,f);
d_data = d_tmp;
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
#endif
// 2nd mid-remap to prepare for 3rd FFTs
// copy = loc for remap result
if (plan->mid2_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
d_scratch_scalar, plan->mid2_plan);
d_data = d_copy;
// 1d FFTs along slow axis
total = plan->total3;
length = plan->length3;
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data());
else
DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
#else
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length);
else
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length);
Kokkos::parallel_for(total/length,f);
d_data = d_tmp;
#endif
// post-remap to put data in output format if needed
// destination is always out
if (plan->post_plan) {
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar,
d_scratch_scalar, plan->post_plan);
}
// scaling if required
if (flag == 1 && plan->scaled) {
FFT_SCALAR norm = plan->norm;
int num = plan->normnum;
norm_functor<DeviceType> f(d_out,norm);
Kokkos::parallel_for(num,f);
}
}
/* ----------------------------------------------------------------------
Create plan for performing a 3d FFT
Arguments:
comm MPI communicator for the P procs which own the d_data
nfast,nmid,nslow size of global 3d matrix
in_ilo,in_ihi input bounds of data I own in fast index
in_jlo,in_jhi input bounds of data I own in mid index
in_klo,in_khi input bounds of data I own in slow index
out_ilo,out_ihi output bounds of data I own in fast index
out_jlo,out_jhi output bounds of data I own in mid index
out_klo,out_khi output bounds of data I own in slow index
scaled 0 = no scaling of result, 1 = scaling
permute permutation in storage order of indices on output
0 = no permutation
1 = permute once = mid->fast, slow->mid, fast->slow
2 = permute twice = slow->fast, fast->mid, mid->slow
nbuf returns size of internal storage buffers used by FFT
usecollective use collective MPI operations for remapping data
------------------------------------------------------------------------- */
template<class DeviceType>
struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_plan_kokkos (
MPI_Comm comm, int nfast, int nmid, int nslow,
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective,
int nthreads)
{
struct fft_plan_3d_kokkos<DeviceType> *plan;
int me,nprocs;
int flag,remapflag;
int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
int out_size,first_size,second_size,third_size,copy_size,scratch_size;
int np1,np2,ip1,ip2;
// query MPI info
MPI_Comm_rank(comm,&me);
MPI_Comm_size(comm,&nprocs);
// compute division of procs in 2 dimensions not on-processor
bifactor(nprocs,&np1,&np2);
ip1 = me % np1;
ip2 = me/np1;
// allocate memory for plan data struct
plan = new struct fft_plan_3d_kokkos<DeviceType>;
remapKK = new RemapKokkos<DeviceType>(lmp);
if (plan == NULL) return NULL;
// remap from initial distribution to layout needed for 1st set of 1d FFTs
// not needed if all procs own entire fast axis initially
// first indices = distribution after 1st set of FFTs
if (in_ilo == 0 && in_ihi == nfast-1) flag = 0;
else flag = 1;
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
if (remapflag == 0) {
first_ilo = in_ilo;
first_ihi = in_ihi;
first_jlo = in_jlo;
first_jhi = in_jhi;
first_klo = in_klo;
first_khi = in_khi;
plan->pre_plan = NULL;
}
else {
first_ilo = 0;
first_ihi = nfast - 1;
first_jlo = ip1*nmid/np1;
first_jhi = (ip1+1)*nmid/np1 - 1;
first_klo = ip2*nslow/np2;
first_khi = (ip2+1)*nslow/np2 - 1;
plan->pre_plan =
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
first_ilo,first_ihi,first_jlo,first_jhi,
first_klo,first_khi,2,0,0,FFT_PRECISION,0);
if (plan->pre_plan == NULL) return NULL;
}
// 1d FFTs along fast axis
plan->length1 = nfast;
plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);
// remap from 1st to 2nd FFT
// choose which axis is split over np1 vs np2 to minimize communication
// second indices = distribution after 2nd set of FFTs
second_ilo = ip1*nfast/np1;
second_ihi = (ip1+1)*nfast/np1 - 1;
second_jlo = 0;
second_jhi = nmid - 1;
second_klo = ip2*nslow/np2;
second_khi = (ip2+1)*nslow/np2 - 1;
plan->mid1_plan =
remapKK->remap_3d_create_plan_kokkos(comm,
first_ilo,first_ihi,first_jlo,first_jhi,
first_klo,first_khi,
second_ilo,second_ihi,second_jlo,second_jhi,
second_klo,second_khi,2,1,0,FFT_PRECISION,
usecollective);
if (plan->mid1_plan == NULL) return NULL;
// 1d FFTs along mid axis
plan->length2 = nmid;
plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);
// remap from 2nd to 3rd FFT
// if final distribution is permute=2 with all procs owning entire slow axis
// then this remapping goes directly to final distribution
// third indices = distribution after 3rd set of FFTs
if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0;
else flag = 1;
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
if (remapflag == 0) {
third_ilo = out_ilo;
third_ihi = out_ihi;
third_jlo = out_jlo;
third_jhi = out_jhi;
third_klo = out_klo;
third_khi = out_khi;
}
else {
third_ilo = ip1*nfast/np1;
third_ihi = (ip1+1)*nfast/np1 - 1;
third_jlo = ip2*nmid/np2;
third_jhi = (ip2+1)*nmid/np2 - 1;
third_klo = 0;
third_khi = nslow - 1;
}
plan->mid2_plan =
remapKK->remap_3d_create_plan_kokkos(comm,
second_jlo,second_jhi,second_klo,second_khi,
second_ilo,second_ihi,
third_jlo,third_jhi,third_klo,third_khi,
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
if (plan->mid2_plan == NULL) return NULL;
// 1d FFTs along slow axis
plan->length3 = nslow;
plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;
// remap from 3rd FFT to final distribution
// not needed if permute = 2 and third indices = out indices on all procs
if (permute == 2 &&
out_ilo == third_ilo && out_ihi == third_ihi &&
out_jlo == third_jlo && out_jhi == third_jhi &&
out_klo == third_klo && out_khi == third_khi) flag = 0;
else flag = 1;
MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
if (remapflag == 0)
plan->post_plan = NULL;
else {
plan->post_plan =
remapKK->remap_3d_create_plan_kokkos(comm,
third_klo,third_khi,third_ilo,third_ihi,
third_jlo,third_jhi,
out_klo,out_khi,out_ilo,out_ihi,
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
if (plan->post_plan == NULL) return NULL;
}
// configure plan memory pointers and allocate work space
// out_size = amount of memory given to FFT by user
// first/second/third_size = amount of memory needed after
// pre,mid1,mid2 remaps
// copy_size = amount needed internally for extra copy of data
// scratch_size = amount needed internally for remap scratch space
// for each remap:
// out space used for result if big enough, else require copy buffer
// accumulate largest required remap scratch space
out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
(first_khi-first_klo+1);
second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
(second_khi-second_klo+1);
third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
(third_khi-third_klo+1);
copy_size = 0;
scratch_size = 0;
if (plan->pre_plan) {
if (first_size <= out_size)
plan->pre_target = 0;
else {
plan->pre_target = 1;
copy_size = MAX(copy_size,first_size);
}
scratch_size = MAX(scratch_size,first_size);
}
if (plan->mid1_plan) {
if (second_size <= out_size)
plan->mid1_target = 0;
else {
plan->mid1_target = 1;
copy_size = MAX(copy_size,second_size);
}
scratch_size = MAX(scratch_size,second_size);
}
if (plan->mid2_plan) {
if (third_size <= out_size)
plan->mid2_target = 0;
else {
plan->mid2_target = 1;
copy_size = MAX(copy_size,third_size);
}
scratch_size = MAX(scratch_size,third_size);
}
if (plan->post_plan)
scratch_size = MAX(scratch_size,out_size);
*nbuf = copy_size + scratch_size;
if (copy_size) {
plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size);
}
if (scratch_size) {
plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size);
}
// system specific pre-computation of 1d FFT coeffs
// and scaling normalization
#if defined(FFT_MKL)
DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1,
(MKL_LONG)nfast);
DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS,
(MKL_LONG)plan->total1/nfast);
DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
//DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
DftiCommitDescriptor(plan->handle_fast);
DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
(MKL_LONG)nmid);
DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS,
(MKL_LONG)plan->total2/nmid);
DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
//DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
DftiCommitDescriptor(plan->handle_mid);
DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
(MKL_LONG)nslow);
DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS,
(MKL_LONG)plan->total3/nslow);
DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
//DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
DftiCommitDescriptor(plan->handle_slow);
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
#elif defined(FFT_FFTW3)
#if defined (FFT_FFTW_THREADS)
if (nthreads > 1) {
FFTW_API(init_threads)();
FFTW_API(plan_with_nthreads)(nthreads);
}
#endif
plan->plan_fast_forward =
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
NULL,&nfast,1,plan->length1,
NULL,&nfast,1,plan->length1,
FFTW_FORWARD,FFTW_ESTIMATE);
plan->plan_fast_backward =
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
NULL,&nfast,1,plan->length1,
NULL,&nfast,1,plan->length1,
FFTW_BACKWARD,FFTW_ESTIMATE);
plan->plan_mid_forward =
FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
NULL,&nmid,1,plan->length2,
NULL,&nmid,1,plan->length2,
FFTW_FORWARD,FFTW_ESTIMATE);
plan->plan_mid_backward =
FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
NULL,&nmid,1,plan->length2,
NULL,&nmid,1,plan->length2,
FFTW_BACKWARD,FFTW_ESTIMATE);
plan->plan_slow_forward =
FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
NULL,&nslow,1,plan->length3,
NULL,&nslow,1,plan->length3,
FFTW_FORWARD,FFTW_ESTIMATE);
plan->plan_slow_backward =
FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
NULL,&nslow,1,plan->length3,
NULL,&nslow,1,plan->length3,
FFTW_BACKWARD,FFTW_ESTIMATE);
#elif defined(FFT_CUFFT)
cufftPlanMany(&(plan->plan_fast), 1, &nfast,
&nfast,1,plan->length1,
&nfast,1,plan->length1,
CUFFT_TYPE,plan->total1/plan->length1);
cufftPlanMany(&(plan->plan_mid), 1, &nmid,
&nmid,1,plan->length2,
&nmid,1,plan->length2,
CUFFT_TYPE,plan->total2/plan->length2);
cufftPlanMany(&(plan->plan_slow), 1, &nslow,
&nslow,1,plan->length3,
&nslow,1,plan->length3,
CUFFT_TYPE,plan->total3/plan->length3);
#else
kissfftKK = new KissFFTKokkos<DeviceType>();
plan->cfg_fast_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL);
plan->cfg_fast_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL);
if (nmid == nfast) {
plan->cfg_mid_forward = plan->cfg_fast_forward;
plan->cfg_mid_backward = plan->cfg_fast_backward;
}
else {
plan->cfg_mid_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,0,NULL,NULL);
plan->cfg_mid_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,1,NULL,NULL);
}
if (nslow == nfast) {
plan->cfg_slow_forward = plan->cfg_fast_forward;
plan->cfg_slow_backward = plan->cfg_fast_backward;
}
else if (nslow == nmid) {
plan->cfg_slow_forward = plan->cfg_mid_forward;
plan->cfg_slow_backward = plan->cfg_mid_backward;
}
else {
plan->cfg_slow_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL);
plan->cfg_slow_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL);
}
#endif
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
return plan;
}
/* ----------------------------------------------------------------------
Destroy a 3d fft plan
------------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *plan)
{
if (plan->pre_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->pre_plan);
if (plan->mid1_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid1_plan);
if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan);
if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan);
#if defined(FFT_MKL)
DftiFreeDescriptor(&(plan->handle_fast));
DftiFreeDescriptor(&(plan->handle_mid));
DftiFreeDescriptor(&(plan->handle_slow));
#elif defined(FFT_FFTW3)
FFTW_API(destroy_plan)(plan->plan_slow_forward);
FFTW_API(destroy_plan)(plan->plan_slow_backward);
FFTW_API(destroy_plan)(plan->plan_mid_forward);
FFTW_API(destroy_plan)(plan->plan_mid_backward);
FFTW_API(destroy_plan)(plan->plan_fast_forward);
FFTW_API(destroy_plan)(plan->plan_fast_backward);
#if defined (FFT_FFTW_THREADS)
FFTW_API(cleanup_threads)();
#endif
#elif defined (FFT_KISSFFT)
delete kissfftKK;
#endif
delete plan;
delete remapKK;
}
/* ----------------------------------------------------------------------
divide n into 2 factors of as equal size as possible
------------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::bifactor(int n, int *factor1, int *factor2)
{
int n1,n2,facmax;
facmax = static_cast<int> (sqrt((double) n));
for (n1 = facmax; n1 > 0; n1--) {
n2 = n/n1;
if (n1*n2 == n) {
*factor1 = n1;
*factor2 = n2;
return;
}
}
}
/* ----------------------------------------------------------------------
perform just the 1d FFTs needed by a 3d FFT, no data movement
used for timing purposes
Arguments:
in starting address of input data on this proc, all set to 0.0
nsize size of in
flag 1 for forward FFT, -1 for inverse FFT
plan plan returned by previous call to fft_3d_create_plan
------------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag,
struct fft_plan_3d_kokkos<DeviceType> *plan)
{
// total = size of data needed in each dim
// length = length of 1d FFT in each dim
// total/length = # of 1d FFTs in each dim
// if total > nsize, limit # of 1d FFTs to available size of data
int total1 = plan->total1;
int length1 = plan->length1;
int total2 = plan->total2;
int length2 = plan->length2;
int total3 = plan->total3;
int length3 = plan->length3;
// fftw3 and Dfti in MKL encode the number of transforms
// into the plan, so we cannot operate on a smaller data set
#if defined(FFT_MKL) || defined(FFT_FFTW3)
if ((total1 > nsize) || (total2 > nsize) || (total3 > nsize))
return;
#endif
if (total1 > nsize) total1 = (nsize/length1) * length1;
if (total2 > nsize) total2 = (nsize/length2) * length2;
if (total3 > nsize) total3 = (nsize/length3) * length3;
// perform 1d FFTs in each of 3 dimensions
// data is just an array of 0.0
#if defined(FFT_MKL)
if (flag == -1) {
DftiComputeForward(plan->handle_fast,data);
DftiComputeForward(plan->handle_mid,data);
DftiComputeForward(plan->handle_slow,data);
} else {
DftiComputeBackward(plan->handle_fast,data);
DftiComputeBackward(plan->handle_mid,data);
DftiComputeBackward(plan->handle_slow,data);
}
#elif defined(FFT_FFTW3)
if (flag == -1) {
FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
} else {
FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
}
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
#else
kiss_fft_functor<DeviceType> f;
typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0());
if (flag == -1) {
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length1);
Kokkos::parallel_for(total1/length1,f);
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length2);
Kokkos::parallel_for(total2/length2,f);
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length3);
Kokkos::parallel_for(total3/length3,f);
} else {
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length1);
Kokkos::parallel_for(total1/length1,f);
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length2);
Kokkos::parallel_for(total2/length2,f);
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length3);
Kokkos::parallel_for(total3/length3,f);
}
#endif
// scaling if required
// limit num to size of data
if (flag == 1 && plan->scaled) {
FFT_SCALAR norm = plan->norm;
int num = MIN(plan->normnum,nsize);
norm_functor<DeviceType> f(d_data,norm);
Kokkos::parallel_for(num,f);
}
}
namespace LAMMPS_NS {
template class FFT3dKokkos<LMPDeviceType>;
#ifdef KOKKOS_HAVE_CUDA
template class FFT3dKokkos<LMPHostType>;
#endif
}

198
src/KOKKOS/fft3d_kokkos.h Normal file
View File

@ -0,0 +1,198 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_FFT3D_KOKKOS_H
#define LMP_FFT3D_KOKKOS_H
#include "pointers.h"
#include "kokkos_type.h"
#include "remap_kokkos.h"
#include "fftdata_kokkos.h"
// with KOKKOS in CUDA mode we can only have
// CUFFT or KISSFFT, thus undefine all other
// FFTs here, since they may be valid in fft3d.cpp
#if defined(KOKKOS_ENABLE_CUDA)
# if defined(FFT_FFTW)
# undef FFT_FFTW
# endif
# if defined(FFT_FFTW3)
# undef FFT_FFTW3
# endif
# if defined(FFT_MKL)
# undef FFT_MKL
# endif
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
# define FFT_KISSFFT
# endif
#else
# if defined(FFT_CUFFT)
# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT"
# endif
// if user set FFTW, it means FFTW3
# ifdef FFT_FFTW
# define FFT_FFTW3
# endif
# ifdef FFT_FFTW_THREADS
# if !defined(FFT_FFTW3)
# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS"
# endif
# endif
#endif
#if defined(FFT_MKL)
#include "mkl_dfti.h"
#if defined(FFT_SINGLE)
typedef float _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_SINGLE
#else
typedef double _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_DOUBLE
#endif
#elif defined(FFT_FFTW3)
#include "fftw3.h"
#if defined(FFT_SINGLE)
typedef fftwf_complex FFT_DATA;
#define FFTW_API(function) fftwf_ ## function
#else
typedef fftw_complex FFT_DATA;
#define FFTW_API(function) fftw_ ## function
#endif
#elif defined(FFT_CUFFT)
#include "cufft.h"
#if defined(FFT_SINGLE)
#define cufftExec cufftExecC2C
#define CUFFT_TYPE CUFFT_C2C
typedef cufftComplex FFT_DATA;
#else
#define cufftExec cufftExecZ2Z
#define CUFFT_TYPE CUFFT_Z2Z
typedef cufftDoubleComplex FFT_DATA;
#endif
#else
#include "kissfft_kokkos.h"
#ifndef FFT_KISSFFT
#define FFT_KISSFFT
#endif
#endif
namespace LAMMPS_NS {
// -------------------------------------------------------------------------
// plan for how to perform a 3d FFT
template<class DeviceType>
struct fft_plan_3d_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
struct remap_plan_3d_kokkos<DeviceType> *pre_plan; // remap from input -> 1st FFTs
struct remap_plan_3d_kokkos<DeviceType> *mid1_plan; // remap from 1st -> 2nd FFTs
struct remap_plan_3d_kokkos<DeviceType> *mid2_plan; // remap from 2nd -> 3rd FFTs
struct remap_plan_3d_kokkos<DeviceType> *post_plan; // remap from 3rd FFTs -> output
typename AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed)
typename AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps
int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length)
int length1,length2,length3; // length of 1st,2nd,3rd FFTs
int pre_target; // where to put remap results
int mid1_target,mid2_target;
int scaled; // whether to scale FFT results
int normnum; // # of values to rescale
double norm; // normalization factor for rescaling
// system specific 1d FFT info
#if defined(FFT_MKL)
DFTI_DESCRIPTOR *handle_fast;
DFTI_DESCRIPTOR *handle_mid;
DFTI_DESCRIPTOR *handle_slow;
#elif defined(FFT_FFTW3)
FFTW_API(plan) plan_fast_forward;
FFTW_API(plan) plan_fast_backward;
FFTW_API(plan) plan_mid_forward;
FFTW_API(plan) plan_mid_backward;
FFTW_API(plan) plan_slow_forward;
FFTW_API(plan) plan_slow_backward;
#elif defined(FFT_CUFFT)
cufftHandle plan_fast;
cufftHandle plan_mid;
cufftHandle plan_slow;
#else
kiss_fft_state_kokkos<DeviceType> cfg_fast_forward;
kiss_fft_state_kokkos<DeviceType> cfg_fast_backward;
kiss_fft_state_kokkos<DeviceType> cfg_mid_forward;
kiss_fft_state_kokkos<DeviceType> cfg_mid_backward;
kiss_fft_state_kokkos<DeviceType> cfg_slow_forward;
kiss_fft_state_kokkos<DeviceType> cfg_slow_backward;
#endif
};
template<class DeviceType>
class FFT3dKokkos : protected Pointers {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
FFT3dKokkos(class LAMMPS *, MPI_Comm,
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
int,int,int *,int);
~FFT3dKokkos();
void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int);
void timing1d(typename AT::t_FFT_SCALAR_1d, int, int);
private:
struct fft_plan_3d_kokkos<DeviceType> *plan;
RemapKokkos<DeviceType> *remapKK;
#ifdef FFT_KISSFFT
KissFFTKokkos<DeviceType> *kissfftKK;
#endif
void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos<DeviceType> *);
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
int, int, int, int, int,
int, int, int, int, int, int, int,
int, int, int *, int, int);
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos<DeviceType> *);
void bifactor(int, int *, int *);
};
}
#endif
/* ERROR/WARNING messages:
E: Could not create 3d FFT plan
The FFT setup for the PPPM solver failed, typically due
to lack of memory. This is an unusual error. Check the
size of the FFT grid you are requesting.
E: Cannot use the FFTW library with Kokkos CUDA on GPUs
Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for
PPPM on GPUs.
E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs
Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs
for PPPM on the host CPUs, use KISS FFT instead.
*/

View File

@ -0,0 +1,46 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#define MAX(A,B) ((A) > (B) ? (A) : (B))
// data types for 2d/3d FFTs
#ifndef FFT_DATA_KOKKOS_H
#define FFT_DATA_KOKKOS_H
// User-settable FFT precision
// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
#ifdef FFT_SINGLE
#define FFT_PRECISION 1
#define MPI_FFT_SCALAR MPI_FLOAT
typedef float FFT_SCALAR;
#else
#define FFT_PRECISION 2
#define MPI_FFT_SCALAR MPI_DOUBLE
typedef double FFT_SCALAR;
#endif
// -------------------------------------------------------------------------
// Data types for single-precision complex
#if FFT_PRECISION == 1
#elif FFT_PRECISION == 2
#else
#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)"
#endif
#endif

View File

@ -0,0 +1,20 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
namespace LAMMPS_NS {
template class KissFFTKokkos<LMPDeviceType>;
#ifdef KOKKOS_HAVE_CUDA
template class KissFFTKokkos<LMPHostType>;
#endif
}

550
src/KOKKOS/kissfft_kokkos.h Normal file
View File

@ -0,0 +1,550 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/*
we use a stripped down KISS FFT as default FFT for LAMMPS
this code is adapted from kiss_fft_v1_2_9
homepage: http://kissfft.sf.net/
changes 2008-2011 by Axel Kohlmeyer <akohlmey@gmail.com>
KISS FFT ported to Kokkos by Stan Moore (SNL)
*/
/*
Copyright (c) 2003-2010, Mark Borgerding
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the author nor the names of any contributors may be used
to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LMP_KISSFFT_KOKKOS_H
#define LMP_KISSFFT_KOKKOS_H
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "kissfft_kokkos.h"
#include "fftdata_kokkos.h"
#include "kokkos_type.h"
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884197169399375105820974944
#endif
/*
Explanation of macros dealing with complex math:
C_MUL(m,a,b) : m = a*b
C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
C_SUB( res, a,b) : res = a - b
C_SUBFROM( res , a) : res -= a
C_ADDTO( res , a) : res += a
C_EQ( res , a) : res = a
*/
#define S_MUL(a,b) ( (a)*(b) )
#define C_MUL(m,a,a_index,b,b_index) \
do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\
(m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0)
/*
#define C_FIXDIV(c,div) // NOOP
#define C_MULBYSCALAR( c, s ) \
do{ (c)[0] *= (s);\
(c)[1] *= (s); }while(0)
#define C_ADD( res, a,b)\
do { \
(res)[0]=(a)[0]+(b)[0]; (res)[1]=(a)[1]+(b)[1]; \
}while(0)
#define C_SUB( res, a,b)\
do { \
(res)[0]=(a)[0]-(b)[0]; (res)[1]=(a)[1]-(b)[1]; \
}while(0)
#define C_ADDTO( res , a)\
do { \
(res)[0] += (a)[0]; (res)[1] += (a)[1];\
}while(0)
#define C_SUBFROM( res , a)\
do {\
(res)[0] -= (a)[0]; (res)[1] -= (a)[1]; \
}while(0)
#define C_EQ(res, a)\
do {\
(res)[0] = (a)[0]; (res)[1] = (a)[1]; \
}while(0)
*/
#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase)
#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase)
#define HALF_OF(x) ((x)*.5)
#define kf_cexp(x,x_index,phase) \
do{ \
(x)(x_index,0) = KISS_FFT_COS(phase);\
(x)(x_index,1) = KISS_FFT_SIN(phase);\
}while(0)
namespace LAMMPS_NS {
#define MAXFACTORS 32
/* e.g. an fft of length 128 has 4 factors
as far as kissfft is concerned: 4*4*4*2 */
template<class DeviceType>
struct kiss_fft_state_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
int nfft;
int inverse;
typename AT::t_int_64 d_factors;
typename AT::t_FFT_DATA_1d d_twiddles;
typename AT::t_FFT_DATA_1d d_scratch;
};
template<class DeviceType>
class KissFFTKokkos {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
KOKKOS_INLINE_FUNCTION
static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
{
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR t[2];
int Fout2_count;
int tw1_count = 0;
Fout2_count = Fout_count + m;
do {
//C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2);
C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count);
tw1_count += fstride;
//C_SUB(*Fout2,*Fout,t);
d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0];
d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1];
//C_ADDTO(d_Fout[Fout_count],t);
d_Fout(Fout_count,0) += t[0];
d_Fout(Fout_count,1) += t[1];
++Fout2_count;
++Fout_count;
} while(--m);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, const size_t m, int Fout_count)
{
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR scratch[6][2];
size_t k=m;
const size_t m2=2*m;
const size_t m3=3*m;
int tw3_count,tw2_count,tw1_count;
tw3_count = tw2_count = tw1_count = 0;
do {
//C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4);
C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count);
C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count);
//C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] );
scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0];
scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1];
//C_ADDTO(d_Fout[Fout_count], scratch[1]);
d_Fout(Fout_count,0) += scratch[1][0];
d_Fout(Fout_count,1) += scratch[1][1];
//C_ADD(scratch[3],scratch[0],scratch[2]);
scratch[3][0] = scratch[0][0] + scratch[2][0];
scratch[3][1] = scratch[0][1] + scratch[2][1];
//C_SUB( scratch[4] , scratch[0] , scratch[2] );
scratch[4][0] = scratch[0][0] - scratch[2][0];
scratch[4][1] = scratch[0][1] - scratch[2][1];
//C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0];
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1];
tw1_count += fstride;
tw2_count += fstride*2;
tw3_count += fstride*3;
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count,0) += scratch[3][0];
d_Fout(Fout_count,1) += scratch[3][1];
if (st.inverse) {
d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0];
d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0];
} else{
d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0];
d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0];
}
++Fout_count;
} while(--k);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, size_t m, int Fout_count)
{
size_t k=m;
const size_t m2 = 2*m;
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR scratch[5][2];
FFT_SCALAR epi3[2];
//C_EQ(epi3,d_twiddles[fstride*m]);
epi3[0] = d_twiddles(fstride*m,0);
epi3[1] = d_twiddles(fstride*m,1);
int tw1_count,tw2_count;
tw1_count = tw2_count = 0;
do {
//C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3);
C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count);
C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
//C_ADD(scratch[3],scratch[1],scratch[2]);
scratch[3][0] = scratch[1][0] + scratch[2][0];
scratch[3][1] = scratch[1][1] + scratch[2][1];
//C_SUB(scratch[0],scratch[1],scratch[2]);
scratch[0][0] = scratch[1][0] - scratch[2][0];
scratch[0][1] = scratch[1][1] - scratch[2][1];
tw1_count += fstride;
tw2_count += fstride*2;
d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]);
d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]);
//C_MULBYSCALAR(scratch[0],epi3[1]);
scratch[0][0] *= epi3[1];
scratch[0][1] *= epi3[1];
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count,0) += scratch[3][0];
d_Fout(Fout_count,1) += scratch[3][1];
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1];
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0];
d_Fout(Fout_count + m,0) -= scratch[0][1];
d_Fout(Fout_count + m,1) += scratch[0][0];
++Fout_count;
} while(--k);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
{
int u;
FFT_SCALAR scratch[13][2];
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR ya[2],yb[2];
//C_EQ(ya,d_twiddles[fstride*m]);
ya[1] = d_twiddles(fstride*m,1);
ya[0] = d_twiddles(fstride*m,0);
//C_EQ(yb,d_twiddles[fstride*2*m]);
yb[1] = d_twiddles(fstride*2*m,1);
yb[0] = d_twiddles(fstride*2*m,0);
int Fout0_count=Fout_count;
int Fout1_count=Fout0_count+m;
int Fout2_count=Fout0_count+2*m;
int Fout3_count=Fout0_count+3*m;
int Fout4_count=Fout0_count+4*m;
for ( u=0; u<m; ++u ) {
//C_FIXDIV( d_Fout[Fout0_count],5); C_FIXDIV( d_Fout[Fout1_count],5); C_FIXDIV( d_Fout[Fout2_count],5);
//C_FIXDIV( d_Fout[Fout3_count],5); C_FIXDIV( d_Fout[Fout4_count],5);
//C_EQ(scratch[0],d_Fout[Fout0_count]);
scratch[0][0] = d_Fout(Fout0_count,0);
scratch[0][1] = d_Fout(Fout0_count,1);
C_MUL(scratch[1],d_Fout,Fout1_count,d_twiddles,u*fstride );
C_MUL(scratch[2],d_Fout,Fout2_count,d_twiddles,2*u*fstride);
C_MUL(scratch[3],d_Fout,Fout3_count,d_twiddles,3*u*fstride);
C_MUL(scratch[4],d_Fout,Fout4_count,d_twiddles,4*u*fstride);
//C_ADD(scratch[7],scratch[1],scratch[4]);
scratch[7][0] = scratch[1][0] + scratch[4][0];
scratch[7][1] = scratch[1][1] + scratch[4][1];
//C_SUB(scratch[10],scratch[1],scratch[4]);
scratch[10][0] = scratch[1][0] - scratch[4][0];
scratch[10][1] = scratch[1][1] - scratch[4][1];
//C_ADD(scratch[8],scratch[2],scratch[3]);
scratch[8][0] = scratch[2][0] + scratch[3][0];
scratch[8][1] = scratch[2][1] + scratch[3][1];
//C_SUB(scratch[9],scratch[2],scratch[3]);
scratch[9][0] = scratch[2][0] - scratch[3][0];
scratch[9][1] = scratch[2][1] - scratch[3][1];
d_Fout(Fout0_count,0) += scratch[7][0] + scratch[8][0];
d_Fout(Fout0_count,1) += scratch[7][1] + scratch[8][1];
scratch[5][0] = scratch[0][0] + S_MUL(scratch[7][0],ya[0]) + S_MUL(scratch[8][0],yb[0]);
scratch[5][1] = scratch[0][1] + S_MUL(scratch[7][1],ya[0]) + S_MUL(scratch[8][1],yb[0]);
scratch[6][0] = S_MUL(scratch[10][1],ya[1]) + S_MUL(scratch[9][1],yb[1]);
scratch[6][1] = -S_MUL(scratch[10][0],ya[1]) - S_MUL(scratch[9][0],yb[1]);
//C_SUB(d_Fout[Fout1_count],scratch[5],scratch[6]);
d_Fout(Fout1_count,0) = scratch[5][0] - scratch[6][0];
d_Fout(Fout1_count,1) = scratch[5][1] - scratch[6][1];
//C_ADD(d_Fout[Fout4_count],scratch[5],scratch[6]);
d_Fout(Fout4_count,0) = scratch[5][0] + scratch[6][0];
d_Fout(Fout4_count,1) = scratch[5][1] + scratch[6][1];
scratch[11][0] = scratch[0][0] + S_MUL(scratch[7][0],yb[0]) + S_MUL(scratch[8][0],ya[0]);
scratch[11][1] = scratch[0][1] + S_MUL(scratch[7][1],yb[0]) + S_MUL(scratch[8][1],ya[0]);
scratch[12][0] = - S_MUL(scratch[10][1],yb[1]) + S_MUL(scratch[9][1],ya[1]);
scratch[12][1] = S_MUL(scratch[10][0],yb[1]) - S_MUL(scratch[9][0],ya[1]);
//C_ADD(d_Fout[Fout2_count],scratch[11],scratch[12]);
d_Fout(Fout2_count,0) = scratch[11][0] + scratch[12][0];
d_Fout(Fout2_count,1) = scratch[11][1] + scratch[12][1];
//C_SUB(d_Fout3[Fout3_count],scratch[11],scratch[12]);
d_Fout(Fout3_count,0) = scratch[11][0] - scratch[12][0];
d_Fout(Fout3_count,1) = scratch[11][1] - scratch[12][1];
++Fout0_count;++Fout1_count;++Fout2_count;++Fout3_count;++Fout4_count;
}
}
/* perform the butterfly for one stage of a mixed radix FFT */
KOKKOS_INLINE_FUNCTION
static void kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int p, int Fout_count)
{
int u,k,q1,q;
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR t[2];
int Norig = st.nfft;
typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch;
for ( u=0; u<m; ++u ) {
k=u;
for ( q1=0 ; q1<p ; ++q1 ) {
//C_EQ(d_scratch[q1],d_Fout[k]);
d_scratch(q1,0) = d_Fout(Fout_count + k,0);
d_scratch(q1,1) = d_Fout(Fout_count + k,1);
//C_FIXDIV(d_scratch[q1],p);
k += m;
}
k=u;
for ( q1=0 ; q1<p ; ++q1 ) {
int twidx=0;
//C_EQ(d_Fout[k],d_scratch[0]);
d_Fout(Fout_count + k,0) = d_scratch(0,0);
d_Fout(Fout_count + k,1) = d_scratch(0,1);
for (q=1;q<p;++q ) {
twidx += fstride * k;
if (twidx>=Norig) twidx-=Norig;
C_MUL(t,d_scratch,q,d_twiddles,twidx);
//C_ADDTO(d_Fout[k],t);
d_Fout(Fout_count + k,0) += t[0];
d_Fout(Fout_count + k,1) += t[1];
}
k += m;
}
}
}
KOKKOS_INLINE_FUNCTION
static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f,
const size_t fstride, int in_stride,
const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos<DeviceType> &st, int Fout_count, int f_count, int factors_count)
{
const int beg = Fout_count;
const int p = d_factors[factors_count++]; /* the radix */
const int m = d_factors[factors_count++]; /* stage's fft length/p */
const int end = Fout_count + p*m;
if (m == 1) {
do {
//C_EQ(d_Fout[Fout_count],d_f[f_count]);
d_Fout(Fout_count,0) = d_f(f_count,0);
d_Fout(Fout_count,1) = d_f(f_count,1);
f_count += fstride*in_stride;
} while (++Fout_count != end);
} else {
do {
/* recursive call:
DFT of size m*p performed by doing
p instances of smaller DFTs of size m,
each one takes a decimated version of the input */
kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count);
f_count += fstride*in_stride;
} while( (Fout_count += m) != end);
}
Fout_count=beg;
/* recombine the p smaller DFTs */
switch (p) {
case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break;
case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break;
case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break;
case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break;
default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break;
}
}
/* facbuf is populated by p1,m1,p2,m2, ...
where
p[i] * m[i] = m[i-1]
m0 = n */
static int kf_factor(int n, HAT::t_int_64 h_facbuf)
{
int p=4, nf=0;
double floor_sqrt;
floor_sqrt = floor( sqrt((double)n) );
int facbuf_count = 0;
int p_max = 0;
/* factor out the remaining powers of 4, powers of 2,
and then any other remaining primes */
do {
if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */
while (n % p) {
switch (p) {
case 4: p = 2; break;
case 2: p = 3; break;
default: p += 2; break;
}
if (p > floor_sqrt)
p = n; /* no more factors, skip to end */
}
n /= p;
h_facbuf[facbuf_count++] = p;
h_facbuf[facbuf_count++] = n;
p_max = MAX(p,p_max);
++nf;
} while (n > 1);
return p_max;
}
/*
* User-callable function to allocate all necessary storage space for the fft.
*
* The return value is a contiguous block of memory, allocated with malloc. As such,
* It can be freed with free(), rather than a kiss_fft-specific function.
*/
static kiss_fft_state_kokkos<DeviceType> kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem)
{
kiss_fft_state_kokkos<DeviceType> st;
int i;
st.nfft = nfft;
st.inverse = inverse_fft;
typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64();
typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d();
if (nfft > 0) {
k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2);
k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft);
for (i=0;i<nfft;++i) {
const double phase = (st.inverse ? 2.0*M_PI:-2.0*M_PI)*i / nfft;
kf_cexp(k_twiddles.h_view,i,phase );
}
int p_max = kf_factor(nfft,k_factors.h_view);
st.d_scratch = typename AT::t_FFT_DATA_1d("kissfft:scratch",p_max);
}
k_factors.template modify<LMPHostType>();
k_factors.template sync<LMPDeviceType>();
st.d_factors = k_factors.template view<DeviceType>();
k_twiddles.template modify<LMPHostType>();
k_twiddles.template sync<LMPDeviceType>();
st.d_twiddles = k_twiddles.template view<DeviceType>();
return st;
}
KOKKOS_INLINE_FUNCTION
static void kiss_fft_stride(const kiss_fft_state_kokkos<DeviceType> &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset)
{
//if (d_fin.data() == d_fout.data()) {
// // NOTE: this is not really an in-place FFT algorithm.
// // It just performs an out-of-place FFT into a temp buffer
// typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1));
// kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
// Kokkos::deep_copy(d_fout,d_tmpbuf);
//} else {
kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
//}
}
KOKKOS_INLINE_FUNCTION
static void kiss_fft_kokkos(const kiss_fft_state_kokkos<DeviceType> &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset)
{
kiss_fft_stride(cfg,d_fin,d_fout,1,offset);
}
};
}
#endif

View File

@ -174,20 +174,6 @@ t_scalar3<Scalar> operator *
return t_scalar3<Scalar>(a.x*b,a.y*b,a.z*b);
}
#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__)
struct double2 {
double x, y;
};
struct float2 {
float x, y;
};
struct float4 {
float x, y, z, w;
};
struct double4 {
double x, y, z, w;
};
#endif
// set LMPHostype and LMPDeviceType from Kokkos Default Types
typedef Kokkos::DefaultExecutionSpace LMPDeviceType;
typedef Kokkos::HostSpace::execution_space LMPHostType;
@ -310,14 +296,8 @@ public:
#endif
#if PRECISION==1
typedef float LMP_FLOAT;
typedef float2 LMP_FLOAT2;
typedef lmp_float3 LMP_FLOAT3;
typedef float4 LMP_FLOAT4;
#else
typedef double LMP_FLOAT;
typedef double2 LMP_FLOAT2;
typedef lmp_double3 LMP_FLOAT3;
typedef double4 LMP_FLOAT4;
#endif
#ifndef PREC_FORCE
@ -326,14 +306,8 @@ typedef double4 LMP_FLOAT4;
#if PREC_FORCE==1
typedef float F_FLOAT;
typedef float2 F_FLOAT2;
typedef lmp_float3 F_FLOAT3;
typedef float4 F_FLOAT4;
#else
typedef double F_FLOAT;
typedef double2 F_FLOAT2;
typedef lmp_double3 F_FLOAT3;
typedef double4 F_FLOAT4;
#endif
#ifndef PREC_ENERGY
@ -342,12 +316,8 @@ typedef double4 F_FLOAT4;
#if PREC_ENERGY==1
typedef float E_FLOAT;
typedef float2 E_FLOAT2;
typedef float4 E_FLOAT4;
#else
typedef double E_FLOAT;
typedef double2 E_FLOAT2;
typedef double4 E_FLOAT4;
#endif
struct s_EV_FLOAT {
@ -500,12 +470,8 @@ typedef struct s_FEV_FLOAT FEV_FLOAT;
#if PREC_POS==1
typedef float X_FLOAT;
typedef float2 X_FLOAT2;
typedef float4 X_FLOAT4;
#else
typedef double X_FLOAT;
typedef double2 X_FLOAT2;
typedef double4 X_FLOAT4;
#endif
#ifndef PREC_VELOCITIES
@ -514,22 +480,14 @@ typedef double4 X_FLOAT4;
#if PREC_VELOCITIES==1
typedef float V_FLOAT;
typedef float2 V_FLOAT2;
typedef float4 V_FLOAT4;
#else
typedef double V_FLOAT;
typedef double2 V_FLOAT2;
typedef double4 V_FLOAT4;
#endif
#if PREC_KSPACE==1
typedef float K_FLOAT;
typedef float2 K_FLOAT2;
typedef float4 K_FLOAT4;
#else
typedef double K_FLOAT;
typedef double2 K_FLOAT2;
typedef double4 K_FLOAT4;
#endif
typedef int T_INT;

474
src/KOKKOS/pack_kokkos.h Normal file
View File

@ -0,0 +1,474 @@
/* ----------------------------------------------------------------------
SPARTA - Stochastic PArallel Rarefied-gas Time-accurate Analyzer
http://sparta.sandia.gov
Steve Plimpton, sjplimp@sandia.gov, Michael Gallis, magalli@sandia.gov
Sandia National Laboratories
Copyright (2014) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level SPARTA directory.
------------------------------------------------------------------------- */
// loop counters for doing a pack/unpack
#include "pack.h"
/* ----------------------------------------------------------------------
Pack and unpack functions:
pack routines copy strided values from data into contiguous locs in buf
unpack routines copy contiguous values from buf into strided locs in data
different versions of unpack depending on permutation
and # of values/element
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
pack from data -> buf
------------------------------------------------------------------------- */
#include "kokkos_type.h"
namespace LAMMPS_NS {
template<class DeviceType>
class PackKokkos {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
struct pack_3d_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int in_orig = slow*nmid*nfast;
const int plane = slow*nstride_plane;
const int out_orig = plane + mid*nstride_line;
const int in = in_orig + mid*nfast + fast;
const int out = out_orig + fast;
d_buf[buf_offset + in] = d_data[data_offset + out];
}
};
static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data
------------------------------------------------------------------------- */
struct unpack_3d_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast;
const int plane = slow*nstride_plane;
const int in_orig = plane + mid*nstride_line;
const int in = in_orig + fast;
const int out = out_orig + mid*nfast + fast;
d_data[data_offset + in] = d_buf[buf_offset + out];
}
};
static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, one axis permutation, 1 value/element
------------------------------------------------------------------------- */
struct unpack_3d_permute1_1_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast;
const int plane = slow*nstride_line;
const int in_orig = plane + mid;
const int in = in_orig + fast*nstride_plane;
const int out = out_orig + mid*nfast + fast;
d_data[data_offset + in] = d_buf[buf_offset + out];
}
};
static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, one axis permutation, 2 values/element
------------------------------------------------------------------------- */
struct unpack_3d_permute1_2_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast*2;
const int plane = slow*nstride_line;
const int in_orig = plane + 2*mid;
const int in = in_orig + fast*nstride_plane;
const int out = out_orig + 2*mid*nfast + 2*fast;
d_data[data_offset + in] = d_buf[buf_offset + out];
d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
}
};
static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, one axis permutation, nqty values/element
------------------------------------------------------------------------- */
struct unpack_3d_permute1_n_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
int nqty; // # of values/element
unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
nqty = plan->nqty ;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast*nqty;
const int plane = slow*nstride_line;
const int instart = plane + nqty*mid;
int in = instart + nqty*fast*nstride_plane;
int out = out_orig + nqty*mid*nfast + nqty*fast;
for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
}
};
static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, two axis permutation, 1 value/element
------------------------------------------------------------------------- */
struct unpack_3d_permute2_1_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast;
const int in_orig = slow + mid*nstride_plane;
const int in = in_orig + fast*nstride_line;
const int out = out_orig + mid*nfast + fast;
d_data[data_offset + in] = d_buf[buf_offset + out];
}
};
static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, two axis permutation, 2 values/element
------------------------------------------------------------------------- */
struct unpack_3d_permute2_2_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast*2;
const int in_orig = 2*slow + mid*nstride_plane;
const int in = in_orig + fast*nstride_line;
const int out = out_orig + 2*mid*nfast + 2*fast;
d_data[data_offset + in] = d_buf[buf_offset + out];
d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
}
};
static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
/* ----------------------------------------------------------------------
unpack from buf -> data, two axis permutation, nqty values/element
------------------------------------------------------------------------- */
struct unpack_3d_permute2_n_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
int nslow; // # of elements in slow index
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
int nqty; // # of values/element
unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
buf_offset = buf_offset_ ;
data_offset = data_offset_ ;
nfast = plan->nfast ;
nmid = plan->nmid ;
nslow = plan->nslow ;
nstride_line = plan->nstride_line ;
nstride_plane = plan->nstride_plane;
nqty = plan->nqty ;
}
KOKKOS_INLINE_FUNCTION
void operator() (const int &index) const {
const int fast = index / (nslow * nmid);
const int index_new = index - (fast * nslow * nmid);
const int mid = index_new / nslow;
const int slow = index_new % nslow;
const int out_orig = slow*nmid*nfast*nqty;
const int instart = nqty*slow + mid*nstride_plane;
int in = instart + nqty*fast*nstride_line;
int out = out_orig + nqty*mid*nfast + nqty*fast;
for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
}
};
static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
const int nfast = plan->nfast;
unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
Kokkos::parallel_for(nslow*nmid*nfast,f);
DeviceType::fence();
}
};
}

View File

@ -25,8 +25,8 @@
#include "force.h"
#include "pair.h"
#include "domain.h"
#include "fft3d_wrap.h"
#include "remap_wrap.h"
#include "fft3d_kokkos.h"
#include "remap_kokkos.h"
#include "memory_kokkos.h"
#include "error.h"
#include "atom_masks.h"
@ -842,19 +842,19 @@ void PPPMKokkos<DeviceType>::allocate()
int tmp;
fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp,collective_flag);
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp,collective_flag);
fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp,collective_flag);
remap = new Remap(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION,collective_flag);
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp,collective_flag);
remap = new RemapKokkos<DeviceType>(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION,collective_flag);
// create ghost grid object for rho and electric field communication
@ -1783,12 +1783,7 @@ void PPPMKokkos<DeviceType>::brick2fft()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_brick2fft>(0,inum_inout),*this);
copymode = 0;
k_density_fft.template modify<DeviceType>();
k_density_fft.template sync<LMPHostType>();
remap->perform(density_fft,density_fft,work1);
k_density_fft.template modify<LMPHostType>();
k_density_fft.template sync<DeviceType>();
remap->perform(d_density_fft,d_density_fft,d_work1);
}
template<class DeviceType>
@ -1830,11 +1825,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik1>(0,nfft),*this);
copymode = 0;
k_work1.template modify<DeviceType>();
k_work1.template sync<LMPHostType>();
fft1->compute(work1,work1,1);
k_work1.template modify<LMPHostType>();
k_work1.template sync<DeviceType>();
fft1->compute(d_work1,d_work1,1);
// global energy and virial contribution
@ -1898,11 +1889,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik5>(0,inum_fft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -1916,11 +1903,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik7>(0,inum_fft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -1933,11 +1916,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik9>(0,inum_fft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2193,11 +2172,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom1>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2214,11 +2189,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom3>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2230,11 +2201,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom5>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2246,11 +2213,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom7>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2261,11 +2224,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom9>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2277,11 +2236,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom11>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -2293,11 +2248,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom13>(0,nfft),*this);
copymode = 0;
k_work2.template modify<DeviceType>();
k_work2.template sync<LMPHostType>();
fft2->compute(work2,work2,-1);
k_work2.template modify<LMPHostType>();
k_work2.template sync<DeviceType>();
fft2->compute(d_work2,d_work2,-1);
copymode = 1;
@ -3037,10 +2988,10 @@ int PPPMKokkos<DeviceType>::timing_1d(int n, double &time1d)
time1 = MPI_Wtime();
for (int i = 0; i < n; i++) {
fft1->timing1d(work1,nfft_both,1);
fft2->timing1d(work1,nfft_both,-1);
fft2->timing1d(work1,nfft_both,-1);
fft2->timing1d(work1,nfft_both,-1);
fft1->timing1d(d_work1,nfft_both,1);
fft2->timing1d(d_work1,nfft_both,-1);
fft2->timing1d(d_work1,nfft_both,-1);
fft2->timing1d(d_work1,nfft_both,-1);
}
MPI_Barrier(world);
@ -3074,10 +3025,10 @@ int PPPMKokkos<DeviceType>::timing_3d(int n, double &time3d)
time1 = MPI_Wtime();
for (int i = 0; i < n; i++) {
fft1->compute(work1,work1,1);
fft2->compute(work1,work1,-1);
fft2->compute(work1,work1,-1);
fft2->compute(work1,work1,-1);
fft1->compute(d_work1,d_work1,1);
fft2->compute(d_work1,d_work1,-1);
fft2->compute(d_work1,d_work1,-1);
fft2->compute(d_work1,d_work1,-1);
}
MPI_Barrier(world);

View File

@ -22,11 +22,31 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
#ifndef LMP_PPPM_KOKKOS_H
#define LMP_PPPM_KOKKOS_H
#include "pppm.h"
#include "gridcomm_kokkos.h"
#include "remap_kokkos.h"
#include "fft3d_kokkos.h"
#include "kokkos_base.h"
#include "kokkos_type.h"
// fix up FFT defines for KOKKOS with CUDA
#if defined(KOKKOS_ENABLE_CUDA)
# if defined(FFT_FFTW)
# undef FFT_FFTW
# endif
# if defined(FFT_FFTW3)
# undef FFT_FFTW3
# endif
# if defined(FFT_MKL)
# undef FFT_MKL
# endif
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
# define FFT_KISSFFT
# endif
#endif
#include "pppm.h"
namespace LAMMPS_NS {
struct TagPPPM_setup1{};
@ -331,8 +351,8 @@ class PPPMKokkos : public PPPM, public KokkosBase {
//double **acons;
typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;
class FFT3d *fft1,*fft2;
class Remap *remap;
FFT3dKokkos<DeviceType> *fft1,*fft2;
RemapKokkos<DeviceType> *remap;
GridCommKokkos<DeviceType> *cg;
GridCommKokkos<DeviceType> *cg_peratom;

511
src/KOKKOS/remap_kokkos.cpp Normal file
View File

@ -0,0 +1,511 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include "remap_kokkos.h"
#include "pack_kokkos.h"
#include "error.h"
using namespace LAMMPS_NS;
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#define MAX(A,B) ((A) > (B) ? (A) : (B))
/* ---------------------------------------------------------------------- */
template<class DeviceType>
RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp) : Pointers(lmp)
{
plan = NULL;
}
template<class DeviceType>
RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory,
int precision, int usecollective) : Pointers(lmp)
{
plan = remap_3d_create_plan_kokkos(comm,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
nqty,permute,memory,precision,usecollective);
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
RemapKokkos<DeviceType>::~RemapKokkos()
{
remap_3d_destroy_plan_kokkos(plan);
}
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void RemapKokkos<DeviceType>::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf)
{
remap_3d_kokkos(d_in,d_out,d_buf,plan);
}
/* ----------------------------------------------------------------------
Data layout for 3d remaps:
data set of Nfast x Nmid x Nslow elements is owned by P procs
each element = nqty contiguous datums
on input, each proc owns a subsection of the elements
on output, each proc will own a (presumably different) subsection
my subsection must not overlap with any other proc's subsection,
i.e. the union of all proc's input (or output) subsections must
exactly tile the global Nfast x Nmid x Nslow data set
when called from C, all subsection indices are
C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
when called from F77, all subsection indices are
F77-style from 1 to N where N = Nfast or Nmid or Nslow
a proc can own 0 elements on input or output
by specifying hi index < lo index
on both input and output, data is stored contiguously on a processor
with a fast-varying, mid-varying, and slow-varying index
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Perform 3d remap
Arguments:
in starting address of input data on this proc
out starting address of where output data for this proc
will be placed (can be same as in)
buf extra memory required for remap
if memory=0 was used in call to remap_3d_create_plan
then buf must be big enough to hold output result
i.e. nqty * (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1)
if memory=1 was used in call to remap_3d_create_plan
then buf is not used, can just be a dummy pointer
plan plan returned by previous call to remap_3d_create_plan
------------------------------------------------------------------------- */
template<class DeviceType>
void RemapKokkos<DeviceType>::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf,
struct remap_plan_3d_kokkos<DeviceType> *plan)
{
// collective flag not yet supported
// use point-to-point communication
int i,isend,irecv;
typename AT::t_FFT_SCALAR_1d d_scratch;
if (plan->memory == 0)
d_scratch = d_buf;
else
d_scratch = plan->d_scratch;
// post all recvs into scratch space
for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]);
}
// send all messages to other procs
for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]);
MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm);
}
// copy in -> scratch -> out for self data
if (plan->self) {
isend = plan->nsend;
irecv = plan->nrecv;
int in_offset = plan->send_offset[isend];
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
plan->pack(d_in,in_offset,
d_scratch,scratch_offset,
&plan->packplan[isend]);
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
// unpack all messages from scratch -> out
for (i = 0; i < plan->nrecv; i++) {
MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
}
/* ----------------------------------------------------------------------
Create plan for performing a 3d remap
Arguments:
comm MPI communicator for the P procs which own the data
in_ilo,in_ihi input bounds of data I own in fast index
in_jlo,in_jhi input bounds of data I own in mid index
in_klo,in_khi input bounds of data I own in slow index
out_ilo,out_ihi output bounds of data I own in fast index
out_jlo,out_jhi output bounds of data I own in mid index
out_klo,out_khi output bounds of data I own in slow index
nqty # of datums per element
permute permutation in storage order of indices on output
0 = no permutation
1 = permute once = mid->fast, slow->mid, fast->slow
2 = permute twice = slow->fast, fast->mid, mid->slow
memory user provides buffer memory for remap or system does
0 = user provides memory
1 = system provides memory
precision precision of data
1 = single precision (4 bytes per datum)
2 = double precision (8 bytes per datum)
usecollective whether to use collective MPI or point-to-point
------------------------------------------------------------------------- */
template<class DeviceType>
struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_create_plan_kokkos(
MPI_Comm comm,
int in_ilo, int in_ihi, int in_jlo, int in_jhi,
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory, int precision, int usecollective)
{
struct remap_plan_3d_kokkos<DeviceType> *plan;
struct extent_3d *inarray, *outarray;
struct extent_3d in,out,overlap;
int i,iproc,nsend,nrecv,ibuf,size,me,nprocs;
// query MPI info
MPI_Comm_rank(comm,&me);
MPI_Comm_size(comm,&nprocs);
// allocate memory for plan data struct
plan = new struct remap_plan_3d_kokkos<DeviceType>;
if (plan == NULL) return NULL;
plan->usecollective = usecollective;
// store parameters in local data structs
in.ilo = in_ilo;
in.ihi = in_ihi;
in.isize = in.ihi - in.ilo + 1;
in.jlo = in_jlo;
in.jhi = in_jhi;
in.jsize = in.jhi - in.jlo + 1;
in.klo = in_klo;
in.khi = in_khi;
in.ksize = in.khi - in.klo + 1;
out.ilo = out_ilo;
out.ihi = out_ihi;
out.isize = out.ihi - out.ilo + 1;
out.jlo = out_jlo;
out.jhi = out_jhi;
out.jsize = out.jhi - out.jlo + 1;
out.klo = out_klo;
out.khi = out_khi;
out.ksize = out.khi - out.klo + 1;
// combine output extents across all procs
inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
if (inarray == NULL) return NULL;
outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
if (outarray == NULL) return NULL;
MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
// count send collides, including self
nsend = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
}
// malloc space for send info
if (nsend) {
plan->pack = PackKokkos<DeviceType>::pack_3d;
plan->send_offset = (int *) malloc(nsend*sizeof(int));
plan->send_size = (int *) malloc(nsend*sizeof(int));
plan->send_proc = (int *) malloc(nsend*sizeof(int));
plan->packplan = (struct pack_plan_3d *)
malloc(nsend*sizeof(struct pack_plan_3d));
if (plan->send_offset == NULL || plan->send_size == NULL ||
plan->send_proc == NULL || plan->packplan == NULL) return NULL;
}
// store send info, with self as last entry
nsend = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
plan->send_proc[nsend] = iproc;
plan->send_offset[nsend] = nqty *
((overlap.klo-in.klo)*in.jsize*in.isize +
((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
plan->packplan[nsend].nfast = nqty*overlap.isize;
plan->packplan[nsend].nmid = overlap.jsize;
plan->packplan[nsend].nslow = overlap.ksize;
plan->packplan[nsend].nstride_line = nqty*in.isize;
plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
plan->packplan[nsend].nqty = nqty;
plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
nsend++;
}
}
// plan->nsend = # of sends not including self
if (nsend && plan->send_proc[nsend-1] == me) {
if (plan->usecollective) // for collectives include self in nsend list
plan->nsend = nsend;
else
plan->nsend = nsend - 1;
} else
plan->nsend = nsend;
// combine input extents across all procs
MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
// count recv collides, including self
nrecv = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
}
// malloc space for recv info
if (nrecv) {
if (permute == 0)
plan->unpack = PackKokkos<DeviceType>::unpack_3d;
else if (permute == 1) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
}
else if (permute == 2) {
if (nqty == 1)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
else if (nqty == 2)
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
else
plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
}
plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
plan->recv_size = (int *) malloc(nrecv*sizeof(int));
plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
plan->unpackplan = (struct pack_plan_3d *)
malloc(nrecv*sizeof(struct pack_plan_3d));
if (plan->recv_offset == NULL || plan->recv_size == NULL ||
plan->recv_proc == NULL || plan->recv_bufloc == NULL ||
plan->request == NULL || plan->unpackplan == NULL) return NULL;
}
// store recv info, with self as last entry
ibuf = 0;
nrecv = 0;
iproc = me;
for (i = 0; i < nprocs; i++) {
iproc++;
if (iproc == nprocs) iproc = 0;
if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
plan->recv_proc[nrecv] = iproc;
plan->recv_bufloc[nrecv] = ibuf;
if (permute == 0) {
plan->recv_offset[nrecv] = nqty *
((overlap.klo-out.klo)*out.jsize*out.isize +
(overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
plan->unpackplan[nrecv].nqty = nqty;
}
else if (permute == 1) {
plan->recv_offset[nrecv] = nqty *
((overlap.ilo-out.ilo)*out.ksize*out.jsize +
(overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
plan->unpackplan[nrecv].nqty = nqty;
}
else {
plan->recv_offset[nrecv] = nqty *
((overlap.jlo-out.jlo)*out.isize*out.ksize +
(overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
plan->unpackplan[nrecv].nfast = overlap.isize;
plan->unpackplan[nrecv].nmid = overlap.jsize;
plan->unpackplan[nrecv].nslow = overlap.ksize;
plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
plan->unpackplan[nrecv].nqty = nqty;
}
plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
ibuf += plan->recv_size[nrecv];
nrecv++;
}
}
// plan->nrecv = # of recvs not including self
// for collectives include self in the nsend list
if (nrecv && plan->recv_proc[nrecv-1] == me) {
if (plan->usecollective) plan->nrecv = nrecv;
else plan->nrecv = nrecv - 1;
} else plan->nrecv = nrecv;
// init remaining fields in remap plan
plan->memory = memory;
if (nrecv == plan->nrecv) plan->self = 0;
else plan->self = 1;
// free locally malloced space
free(inarray);
free(outarray);
// find biggest send message (not including self) and malloc space for it
size = 0;
for (nsend = 0; nsend < plan->nsend; nsend++)
size = MAX(size,plan->send_size[nsend]);
if (size) {
plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
if (!plan->d_sendbuf.data()) return NULL;
}
// if requested, allocate internal scratch space for recvs,
// only need it if I will receive any data (including self)
if (memory == 1) {
if (nrecv > 0) {
plan->d_scratch =
typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
if (!plan->d_scratch.data()) return NULL;
}
}
// not using collective - dup comm
MPI_Comm_dup(comm,&plan->comm);
// return pointer to plan
return plan;
}
/* ----------------------------------------------------------------------
Destroy a 3d remap plan
------------------------------------------------------------------------- */
template<class DeviceType>
void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *plan)
{
if (plan == NULL) return;
// free MPI communicator
if (!((plan->usecollective) && (plan->commringlen == 0)))
MPI_Comm_free(&plan->comm);
// free internal arrays
if (plan->nsend || plan->self) {
free(plan->send_offset);
free(plan->send_size);
free(plan->send_proc);
free(plan->packplan);
}
if (plan->nrecv || plan->self) {
free(plan->recv_offset);
free(plan->recv_size);
free(plan->recv_proc);
free(plan->recv_bufloc);
free(plan->request);
free(plan->unpackplan);
}
// free plan itself
delete plan;
}
namespace LAMMPS_NS {
template class RemapKokkos<LMPDeviceType>;
#ifdef KOKKOS_HAVE_CUDA
template class RemapKokkos<LMPHostType>;
#endif
}

88
src/KOKKOS/remap_kokkos.h Normal file
View File

@ -0,0 +1,88 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_REMAP_KOKKOS_H
#define LMP_REMAP_KOKKOS_H
#include "pointers.h"
#include <mpi.h>
#include "fftdata_kokkos.h"
#include "remap.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
// details of how to do a 3d remap
template<class DeviceType>
struct remap_plan_3d_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
typename AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which pack function to use
void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which unpack function to use
int *send_offset; // extraction loc for each send
int *send_size; // size of each send message
int *send_proc; // proc to send each message to
struct pack_plan_3d *packplan; // pack plan for each send message
int *recv_offset; // insertion loc for each recv
int *recv_size; // size of each recv message
int *recv_proc; // proc to recv each message from
int *recv_bufloc; // offset in scratch buf for each recv
MPI_Request *request; // MPI request for each posted recv
struct pack_plan_3d *unpackplan; // unpack plan for each recv message
int nrecv; // # of recvs from other procs
int nsend; // # of sends to other procs
int self; // whether I send/recv with myself
int memory; // user provides scratch space or not
MPI_Comm comm; // group of procs performing remap
int usecollective; // use collective or point-to-point MPI
int commringlen; // length of commringlist
int *commringlist; // ranks on communication ring of this plan
};
template<class DeviceType>
class RemapKokkos : protected Pointers {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
RemapKokkos(class LAMMPS *);
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
int,int,int,int,int,int,int,int,int,int,int);
~RemapKokkos();
void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d);
struct remap_plan_3d_kokkos<DeviceType> *plan;
void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos<DeviceType> *);
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int);
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
};
}
#endif
/* ERROR/WARNING messages:
E: Could not create 3d remap plan
The FFT setup in pppm failed.
*/

View File

@ -25,6 +25,9 @@
#include <cstdlib>
#include <cmath>
#include "remap.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
#ifdef FFT_KISS
/* include kissfft implementation */
@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan(
int scaled, int permute, int *nbuf, int usecollective)
{
struct fft_plan_3d *plan;
int me,nprocs;
int me,nprocs,nthreads;
int flag,remapflag;
int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan(
MPI_Comm_rank(comm,&me);
MPI_Comm_size(comm,&nprocs);
#if defined(_OPENMP)
// query OpenMP info.
// should have been initialized systemwide in Comm class constructor
nthreads = omp_get_max_threads();
#else
nthreads = 1;
#endif
// compute division of procs in 2 dimensions not on-processor
bifactor(nprocs,&np1,&np2);
@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan(
*/
#elif defined(FFT_FFTW3)
#if defined(FFT_FFTW_THREADS)
if (nthreads > 1) {
FFTW_API(init_threads)();
FFTW_API(plan_with_nthreads)(nthreads);
}
#endif
plan->plan_fast_forward =
FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
NULL,&nfast,1,plan->length1,
@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan)
FFTW_API(destroy_plan)(plan->plan_mid_backward);
FFTW_API(destroy_plan)(plan->plan_fast_forward);
FFTW_API(destroy_plan)(plan->plan_fast_backward);
#if defined(FFT_FFTW_THREADS)
FFTW_API(cleanup_threads)();
#endif
#else
if (plan->cfg_slow_forward != plan->cfg_fast_forward &&
plan->cfg_slow_forward != plan->cfg_mid_forward) {

View File

@ -26,6 +26,8 @@ KSpaceStyle(pppm,PPPM)
#define LMP_FFT_LIB "FFTW3"
#elif defined(FFT_MKL)
#define LMP_FFT_LIB "MKL FFT"
#elif defined(FFT_CUFFT)
#define LMP_FFT_LIB "cuFFT"
#else
#define LMP_FFT_LIB "KISS FFT"
#endif

View File

@ -1,4 +1,4 @@
# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1
# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc
SHELL = /bin/sh
@ -44,9 +44,12 @@ LMP_INC = -DLAMMPS_GZIP
# PATH = path for MPI library
# LIB = name of MPI library
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include
MY_MPI_EXE = $(shell which mpicxx)
MY_MPI_PATH = $(dir ${MY_MPI_EXE})
MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I${MY_MPI_PATH}../include
MPI_PATH =
MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm
MPI_LIB = -L${MY_MPI_PATH}../lib -lmpi_ibm
# FFT library
# see discussion in Section 2.2 (step 6) of manaul
@ -55,9 +58,9 @@ MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.
# PATH = path for FFT library
# LIB = name of FFT library
FFT_INC =
FFT_INC = -DFFT_CUFFT
FFT_PATH =
FFT_LIB =
FFT_LIB = -lcufft
# JPEG and/or PNG library
# see discussion in Section 2.2 (step 7) of manual

View File

@ -55,9 +55,9 @@ MPI_LIB =
# PATH = path for FFT library
# LIB = name of FFT library
FFT_INC =
FFT_INC = -DFFT_CUFFT
FFT_PATH =
FFT_LIB =
FFT_LIB = -lcufft
# JPEG and/or PNG library
# see discussion in Section 2.2 (step 7) of manual

View File

@ -1 +1 @@
#define LAMMPS_VERSION "09 Jan 2020"
#define LAMMPS_VERSION "24 Jan 2020"