Merge pull request #1858 from akohlmey/fft-consistency

Improved consistency and thread support in FFTs
This commit is contained in:
Axel Kohlmeyer
2020-02-03 11:14:16 -05:00
committed by GitHub
20 changed files with 486 additions and 537 deletions

View File

@ -739,7 +739,7 @@ if(PKG_KSPACE)
else()
message(STATUS "Using double precision FFTs")
endif()
if(FFT_FFTW_THREADS)
if(FFT_FFTW_THREADS OR FFT_MKL_THREADS)
message(STATUS "Using threaded FFTs")
else()
message(STATUS "Using non-threaded FFTs")

View File

@ -22,7 +22,7 @@ if(PKG_KSPACE)
include_directories(${${FFTW}_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES})
if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
option(FFT_FFTW_THREADS "Use threaded FFT library" ON)
option(FFT_FFTW_THREADS "Use threaded FFTW library" ON)
else()
option(FFT_FFTW_THREADS "Use threaded FFT library" OFF)
endif()
@ -38,6 +38,10 @@ if(PKG_KSPACE)
elseif(FFT STREQUAL "MKL")
find_package(MKL REQUIRED)
add_definitions(-DFFT_MKL)
option(FFT_MKL_THREADS "Use threaded MKL FFT" ON)
if(FFT_MKL_THREADS)
add_definitions(-DFFT_MKL_THREADS)
endif()
include_directories(${MKL_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES})
else()

View File

@ -106,6 +106,7 @@ to assist:
-D FFTW3_LIBRARIES=path # path to FFTW3 libraries
-D FFT_FFTW_THREADS=on # enable using threaded FFTW3 libraries
-D MKL_INCLUDE_DIRS=path # ditto for Intel MKL library
-D FFT_MKL_THREADS=on # enable using threaded FFTs with MKL libraries
-D MKL_LIBRARIES=path
**Makefile.machine settings**\ :
@ -117,6 +118,7 @@ to assist:
# default is KISS if not specified
FFT_INC = -DFFT_SINGLE # do not specify for double precision
FFT_INC = -DFFT_FFTW_THREADS # enable using threaded FFTW3 libraries
FFT_INC = -DFFT_MKL_THREADS # enable using threaded FFTs with MKL libraries
FFT_INC = -DFFT_PACK_ARRAY # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY
# default is FFT\_PACK\_ARRAY if not specified
@ -129,12 +131,15 @@ to assist:
FFT_LIB = -lfftw3 # FFTW3 double precision
FFT_LIB = -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS)
FFT_LIB = -lfftw3 -lfftw3f # FFTW3 single precision
FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler
FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier
FFT_LIB = -lmkl_intel_lp64 -lmkl_sequential -lmkl_core # MKL with Intel compiler, serial interface
FFT_LIB = -lmkl_gf_lp64 -lmkl_sequential -lmkl_core # MKL with GNU compier, serial interface
FFT_LIB = -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core # MKL with Intel compiler, threaded interface
FFT_LIB = -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core # MKL with GNU compiler, threaded interface
FFT_LIB = -lmkl_rt # MKL with automatic runtime selection of interface libs
As with CMake, you do not need to set paths in FFT\_INC or FFT\_PATH, if
make can find the FFT header and library files. You must specify
FFT\_LIB with the appropriate FFT libraries to include in the link.
the compiler can find the FFT header and library files in its default search path.
You must specify FFT\_LIB with the appropriate FFT libraries to include in the link.
**CMake and make info**\ :

View File

@ -160,6 +160,7 @@ action kissfft_kokkos.h kissfft.h
action kokkos.cpp
action kokkos.h
action kokkos_base.h
action kokkos_base_fft.h fft3d.h
action kokkos_few.h
action kokkos_type.h
action memory_kokkos.h

View File

@ -21,7 +21,6 @@
#include <math.h>
#include "fft3d_kokkos.h"
#include "remap_kokkos.h"
#include "kokkos_type.h"
#include "error.h"
#include "kokkos.h"
@ -86,10 +85,10 @@ FFT3dKokkos<DeviceType>::~FFT3dKokkos()
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag)
void FFT3dKokkos<DeviceType>::compute(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, int flag)
{
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2);
typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2);
typename FFT_AT::t_FFT_DATA_1d d_in_data((FFT_DATA_POINTER)d_in.data(),d_in.size()/2);
typename FFT_AT::t_FFT_DATA_1d d_out_data((FFT_DATA_POINTER)d_out.data(),d_out.size()/2);
fft_3d_kokkos(d_in_data,d_out_data,flag,plan);
}
@ -97,9 +96,9 @@ void FFT3dKokkos<DeviceType>::compute(typename AT::t_FFT_SCALAR_1d d_in, typenam
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag)
void FFT3dKokkos<DeviceType>::timing1d(typename FFT_AT::t_FFT_SCALAR_1d d_in, int nsize, int flag)
{
typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size());
typename FFT_AT::t_FFT_DATA_1d d_in_data((FFT_DATA_POINTER)d_in.data(),d_in.size()/2);
fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan);
}
@ -138,11 +137,11 @@ template<class DeviceType>
struct norm_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_DATA_1d_um d_out;
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_DATA_1d_um d_out;
int norm;
norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_):
norm_functor(typename FFT_AT::t_FFT_DATA_1d &d_out_, int norm_):
d_out(d_out_),norm(norm_) {}
KOKKOS_INLINE_FUNCTION
@ -153,9 +152,9 @@ public:
*(out_ptr++) *= norm;
#elif defined(FFT_MKL)
d_out(i) *= norm;
#else
d_out(i,0) *= norm;
d_out(i,1) *= norm;
#else // FFT_KISS
d_out(i).re *= norm;
d_out(i).im *= norm;
#endif
}
};
@ -165,14 +164,14 @@ template<class DeviceType>
struct kiss_fft_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_DATA_1d_um d_data,d_tmp;
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_DATA_1d_um d_data,d_tmp;
kiss_fft_state_kokkos<DeviceType> st;
int length;
kiss_fft_functor() {}
kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos<DeviceType> &st_, int length_):
kiss_fft_functor(typename FFT_AT::t_FFT_DATA_1d &d_data_,typename FFT_AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos<DeviceType> &st_, int length_):
d_data(d_data_),
d_tmp(d_tmp_),
st(st_)
@ -189,11 +188,11 @@ public:
#endif
template<class DeviceType>
void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos<DeviceType> *plan)
void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in, typename FFT_AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos<DeviceType> *plan)
{
int total,length;
typename AT::t_FFT_DATA_1d d_data,d_copy;
typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar;
typename FFT_AT::t_FFT_DATA_1d d_data,d_copy;
typename FFT_AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar;
// pre-remap to prepare for 1st FFTs if needed
// copy = loc for remap result
@ -202,9 +201,9 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
if (plan->pre_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size());
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
d_in_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_in.data(),d_in.size()*2);
d_copy_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_copy.data(),d_copy.size()*2);
d_scratch_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)plan->d_scratch.data(),plan->d_scratch.size()*2);
remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar,
d_scratch_scalar, plan->pre_plan);
@ -219,19 +218,19 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data());
DftiComputeForward(plan->handle_fast,d_data.data());
else
DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data());
DftiComputeBackward(plan->handle_fast,d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
cufftExec(plan->plan_fast,d_data.data(),d_data.data(),flag);
#else
typename AT::t_FFT_DATA_1d d_tmp =
typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
typename FFT_AT::t_FFT_DATA_1d d_tmp =
typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
kiss_fft_functor<DeviceType> f;
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length);
@ -239,7 +238,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length);
Kokkos::parallel_for(total/length,f);
d_data = d_tmp;
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
#endif
@ -249,9 +248,9 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
if (plan->mid1_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2);
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2);
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2);
d_data_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_data.data(),d_data.size()*2);
d_copy_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_copy.data(),d_copy.size()*2);
d_scratch_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)plan->d_scratch.data(),plan->d_scratch.size()*2);
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
d_scratch_scalar, plan->mid1_plan);
@ -265,16 +264,16 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data());
DftiComputeForward(plan->handle_mid,d_data.data());
else
DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data());
DftiComputeBackward(plan->handle_mid,d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
cufftExec(plan->plan_mid,d_data.data(),d_data.data(),flag);
#else
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length);
@ -282,7 +281,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length);
Kokkos::parallel_for(total/length,f);
d_data = d_tmp;
d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
#endif
// 2nd mid-remap to prepare for 3rd FFTs
@ -291,9 +290,9 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
if (plan->mid2_target == 0) d_copy = d_out;
else d_copy = plan->d_copy;
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
d_data_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_data.data(),d_data.size()*2);
d_copy_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_copy.data(),d_copy.size()*2);
d_scratch_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)plan->d_scratch.data(),plan->d_scratch.size()*2);
remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
d_scratch_scalar, plan->mid2_plan);
@ -307,16 +306,16 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
#if defined(FFT_MKL)
if (flag == -1)
DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data());
DftiComputeForward(plan->handle_slow,d_data.data());
else
DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data());
DftiComputeBackward(plan->handle_slow,d_data.data());
#elif defined(FFT_FFTW3)
if (flag == -1)
FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
else
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
cufftExec(plan->plan_slow,d_data.data(),d_data.data(),flag);
#else
if (flag == -1)
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length);
@ -331,9 +330,9 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typ
// destination is always out
if (plan->post_plan) {
d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size());
d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
d_data_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_data.data(),d_data.size()*2);
d_out_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)d_out.data(),d_out.size()*2);
d_scratch_scalar = typename FFT_AT::t_FFT_SCALAR_1d((FFT_SCALAR*)plan->d_scratch.data(),plan->d_scratch.size()*2);
remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar,
d_scratch_scalar, plan->post_plan);
@ -589,11 +588,11 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
*nbuf = copy_size + scratch_size;
if (copy_size) {
plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size);
plan->d_copy = typename FFT_AT::t_FFT_DATA_1d("fft3d:copy",copy_size);
}
if (scratch_size) {
plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size);
plan->d_scratch = typename FFT_AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size);
}
// system specific pre-computation of 1d FFT coeffs
@ -607,7 +606,9 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
//DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_fast);
DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
@ -617,7 +618,9 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
//DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_mid);
DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
@ -627,18 +630,11 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
//DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_slow);
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
#elif defined(FFT_FFTW3)
#if defined (FFT_FFTW_THREADS)
@ -684,7 +680,9 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
NULL,&nslow,1,plan->length3,
NULL,&nslow,1,plan->length3,
FFTW_BACKWARD,FFTW_ESTIMATE);
#elif defined(FFT_CUFFT)
cufftPlanMany(&(plan->plan_fast), 1, &nfast,
&nfast,1,plan->length1,
&nfast,1,plan->length1,
@ -699,7 +697,9 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
&nslow,1,plan->length3,
&nslow,1,plan->length3,
CUFFT_TYPE,plan->total3/plan->length3);
#else
#else /* FFT_KISS */
kissfftKK = new KissFFTKokkos<DeviceType>();
plan->cfg_fast_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL);
@ -726,6 +726,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
plan->cfg_slow_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL);
plan->cfg_slow_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL);
}
#endif
if (scaled == 0)
@ -809,7 +810,7 @@ void FFT3dKokkos<DeviceType>::bifactor(int n, int *factor1, int *factor2)
------------------------------------------------------------------------- */
template<class DeviceType>
void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag,
void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename FFT_AT::t_FFT_DATA_1d d_data, int nsize, int flag,
struct fft_plan_3d_kokkos<DeviceType> *plan)
{
// total = size of data needed in each dim
@ -839,13 +840,13 @@ void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d
#if defined(FFT_MKL)
if (flag == -1) {
DftiComputeForward(plan->handle_fast,data);
DftiComputeForward(plan->handle_mid,data);
DftiComputeForward(plan->handle_slow,data);
DftiComputeForward(plan->handle_fast,d_data.data());
DftiComputeForward(plan->handle_mid,d_data.data());
DftiComputeForward(plan->handle_slow,d_data.data());
} else {
DftiComputeBackward(plan->handle_fast,data);
DftiComputeBackward(plan->handle_mid,data);
DftiComputeBackward(plan->handle_slow,data);
DftiComputeBackward(plan->handle_fast,d_data.data());
DftiComputeBackward(plan->handle_mid,d_data.data());
DftiComputeBackward(plan->handle_slow,d_data.data());
}
#elif defined(FFT_FFTW3)
if (flag == -1) {
@ -858,12 +859,12 @@ void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d
FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
}
#elif defined(FFT_CUFFT)
cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
cufftExec(plan->plan_fast,d_data.data(),d_data.data(),flag);
cufftExec(plan->plan_mid,d_data.data(),d_data.data(),flag);
cufftExec(plan->plan_slow,d_data.data(),d_data.data(),flag);
#else
kiss_fft_functor<DeviceType> f;
typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0());
typename FFT_AT::t_FFT_DATA_1d d_tmp = typename FFT_AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0());
if (flag == -1) {
f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length1);
Kokkos::parallel_for(total1/length1,f);

View File

@ -15,78 +15,9 @@
#define LMP_FFT3D_KOKKOS_H
#include "pointers.h"
#include "kokkos_type.h"
#include "remap_kokkos.h"
#include "fftdata_kokkos.h"
// with KOKKOS in CUDA mode we can only have
// CUFFT or KISSFFT, thus undefine all other
// FFTs here, since they may be valid in fft3d.cpp
#if defined(KOKKOS_ENABLE_CUDA)
# if defined(FFT_FFTW)
# undef FFT_FFTW
# endif
# if defined(FFT_FFTW3)
# undef FFT_FFTW3
# endif
# if defined(FFT_MKL)
# undef FFT_MKL
# endif
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
# define FFT_KISSFFT
# endif
#else
# if defined(FFT_CUFFT)
# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT"
# endif
// if user set FFTW, it means FFTW3
# ifdef FFT_FFTW
# define FFT_FFTW3
# endif
# ifdef FFT_FFTW_THREADS
# if !defined(FFT_FFTW3)
# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS"
# endif
# endif
#endif
#if defined(FFT_MKL)
#include "mkl_dfti.h"
#if defined(FFT_SINGLE)
typedef float _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_SINGLE
#else
typedef double _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_DOUBLE
#endif
#elif defined(FFT_FFTW3)
#include "fftw3.h"
#if defined(FFT_SINGLE)
typedef fftwf_complex FFT_DATA;
#define FFTW_API(function) fftwf_ ## function
#else
typedef fftw_complex FFT_DATA;
#define FFTW_API(function) fftw_ ## function
#endif
#elif defined(FFT_CUFFT)
#include "cufft.h"
#if defined(FFT_SINGLE)
#define cufftExec cufftExecC2C
#define CUFFT_TYPE CUFFT_C2C
typedef cufftComplex FFT_DATA;
#else
#define cufftExec cufftExecZ2Z
#define CUFFT_TYPE CUFFT_Z2Z
typedef cufftDoubleComplex FFT_DATA;
#endif
#else
#include "kissfft_kokkos.h"
#ifndef FFT_KISSFFT
#define FFT_KISSFFT
#endif
#endif
namespace LAMMPS_NS {
// -------------------------------------------------------------------------
@ -96,14 +27,14 @@ namespace LAMMPS_NS {
template<class DeviceType>
struct fft_plan_3d_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
struct remap_plan_3d_kokkos<DeviceType> *pre_plan; // remap from input -> 1st FFTs
struct remap_plan_3d_kokkos<DeviceType> *mid1_plan; // remap from 1st -> 2nd FFTs
struct remap_plan_3d_kokkos<DeviceType> *mid2_plan; // remap from 2nd -> 3rd FFTs
struct remap_plan_3d_kokkos<DeviceType> *post_plan; // remap from 3rd FFTs -> output
typename AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed)
typename AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps
typename FFT_AT::t_FFT_DATA_1d d_copy; // memory for remap results (if needed)
typename FFT_AT::t_FFT_DATA_1d d_scratch; // scratch space for remaps
int total1,total2,total3; // # of 1st,2nd,3rd FFTs (times length)
int length1,length2,length3; // length of 1st,2nd,3rd FFTs
int pre_target; // where to put remap results
@ -142,14 +73,14 @@ template<class DeviceType>
class FFT3dKokkos : protected Pointers {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
FFT3dKokkos(class LAMMPS *, MPI_Comm,
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
int,int,int *,int);
~FFT3dKokkos();
void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int);
void timing1d(typename AT::t_FFT_SCALAR_1d, int, int);
void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
private:
struct fft_plan_3d_kokkos<DeviceType> *plan;
@ -159,7 +90,7 @@ class FFT3dKokkos : protected Pointers {
KissFFTKokkos<DeviceType> *kissfftKK;
#endif
void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos<DeviceType> *);
void fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d, typename FFT_AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos<DeviceType> *);
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
int, int, int, int, int,
@ -168,7 +99,7 @@ class FFT3dKokkos : protected Pointers {
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos<DeviceType> *);
void fft_3d_1d_only_kokkos(typename FFT_AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos<DeviceType> *);
void bifactor(int, int *, int *);
};

View File

@ -11,12 +11,16 @@
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#include "kokkos_type.h"
#ifndef MAX
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
// data types for 2d/3d FFTs
#ifndef FFT_DATA_KOKKOS_H
#define FFT_DATA_KOKKOS_H
#ifndef LMP_FFT_DATA_KOKKOS_H
#define LMP_FFT_DATA_KOKKOS_H
// User-settable FFT precision
@ -43,4 +47,163 @@ typedef double FFT_SCALAR;
#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)"
#endif
// with KOKKOS in CUDA mode we can only have
// CUFFT or KISSFFT, thus undefine all other
// FFTs here, since they may be valid in fft3d.cpp
#if defined(KOKKOS_ENABLE_CUDA)
# if defined(FFT_FFTW)
# undef FFT_FFTW
# endif
# if defined(FFT_FFTW3)
# undef FFT_FFTW3
# endif
# if defined(FFT_MKL)
# undef FFT_MKL
# endif
# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
# define FFT_KISSFFT
# endif
#else
# if defined(FFT_CUFFT)
# error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT"
# endif
// if user set FFTW, it means FFTW3
# ifdef FFT_FFTW
# define FFT_FFTW3
# endif
# ifdef FFT_FFTW_THREADS
# if !defined(FFT_FFTW3)
# error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS"
# endif
# endif
#endif
#if defined(FFT_MKL)
#include "mkl_dfti.h"
#if defined(FFT_SINGLE)
typedef float _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_SINGLE
#else
typedef double _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_DOUBLE
#endif
#elif defined(FFT_FFTW3)
#include "fftw3.h"
#if defined(FFT_SINGLE)
typedef fftwf_complex FFT_DATA;
#define FFTW_API(function) fftwf_ ## function
#else
typedef fftw_complex FFT_DATA;
#define FFTW_API(function) fftw_ ## function
#endif
#elif defined(FFT_CUFFT)
#include "cufft.h"
#if defined(FFT_SINGLE)
#define cufftExec cufftExecC2C
#define CUFFT_TYPE CUFFT_C2C
typedef cufftComplex FFT_DATA;
#else
#define cufftExec cufftExecZ2Z
#define CUFFT_TYPE CUFFT_Z2Z
typedef cufftDoubleComplex FFT_DATA;
#endif
#else
#if defined(FFT_SINGLE)
#define kiss_fft_scalar float
#else
#define kiss_fft_scalar double
typedef struct {
kiss_fft_scalar re;
kiss_fft_scalar im;
} FFT_DATA;
#endif
#ifndef FFT_KISSFFT
#define FFT_KISSFFT
#endif
#endif
// (double[2]*) is not a 1D pointer
#if defined(FFT_FFTW3)
typedef FFT_SCALAR* FFT_DATA_POINTER;
#else
typedef FFT_DATA* FFT_DATA_POINTER;
#endif
template <class DeviceType>
struct FFTArrayTypes;
template <>
struct FFTArrayTypes<LMPDeviceType> {
typedef Kokkos::
DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_dev t_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_dev_um t_FFT_SCALAR_1d_um;
typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
typedef tdual_FFT_SCALAR_2d::t_dev t_FFT_SCALAR_2d;
typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
typedef tdual_FFT_SCALAR_2d_3::t_dev t_FFT_SCALAR_2d_3;
typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
typedef tdual_FFT_SCALAR_3d::t_dev t_FFT_SCALAR_3d;
typedef Kokkos::
DualView<FFT_DATA*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_dev t_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_dev_um t_FFT_DATA_1d_um;
typedef Kokkos::
DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
typedef tdual_int_64::t_dev t_int_64;
typedef tdual_int_64::t_dev_um t_int_64_um;
};
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct FFTArrayTypes<LMPHostType> {
//Kspace
typedef Kokkos::
DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_host t_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_host_um t_FFT_SCALAR_1d_um;
typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
typedef tdual_FFT_SCALAR_2d::t_host t_FFT_SCALAR_2d;
typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
typedef tdual_FFT_SCALAR_2d_3::t_host t_FFT_SCALAR_2d_3;
typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
typedef tdual_FFT_SCALAR_3d::t_host t_FFT_SCALAR_3d;
typedef Kokkos::
DualView<FFT_DATA*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_host t_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_host_um t_FFT_DATA_1d_um;
typedef Kokkos::
DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
typedef tdual_int_64::t_host t_int_64;
typedef tdual_int_64::t_host_um t_int_64_um;
};
#endif
typedef struct FFTArrayTypes<LMPDeviceType> FFT_DAT;
typedef struct FFTArrayTypes<LMPHostType> FFT_HAT;
#if defined(FFT_KISSFFT)
#include "kissfft_kokkos.h" // uses t_FFT_DATA_1d, needs to come last
#endif
#endif

View File

@ -17,7 +17,7 @@
#include "kspace.h"
#include "memory_kokkos.h"
#include "error.h"
#include "kokkos_base.h"
#include "kokkos_base_fft.h"
#include "kokkos.h"
using namespace LAMMPS_NS;
@ -502,9 +502,9 @@ void GridCommKokkos<DeviceType>::setup()
}
nbuf *= MAX(nforward,nreverse);
//memory->create(buf1,nbuf,"Commgrid:buf1");
k_buf1 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf1",nbuf);
k_buf1 = FFT_DAT::tdual_FFT_SCALAR_1d("Commgrid:buf1",nbuf);
//memory->create(buf2,nbuf,"Commgrid:buf2");
k_buf2 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf2",nbuf);
k_buf2 = FFT_DAT::tdual_FFT_SCALAR_1d("Commgrid:buf2",nbuf);
}
/* ----------------------------------------------------------------------
@ -517,7 +517,7 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
k_packlist.sync<DeviceType>();
k_unpacklist.sync<DeviceType>();
KokkosBase* kspaceKKBase = dynamic_cast<KokkosBase*>(kspace);
KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
for (int m = 0; m < nswap; m++) {
if (swap[m].sendproc == me)
@ -567,7 +567,7 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
k_packlist.sync<DeviceType>();
k_unpacklist.sync<DeviceType>();
KokkosBase* kspaceKKBase = dynamic_cast<KokkosBase*>(kspace);
KokkosBaseFFT* kspaceKKBase = dynamic_cast<KokkosBaseFFT*>(kspace);
for (int m = nswap-1; m >= 0; m--) {
if (swap[m].recvproc == me)

View File

@ -16,6 +16,7 @@
#include "pointers.h"
#include "kokkos_type.h"
#include "fftdata_kokkos.h"
#ifdef FFT_SINGLE
typedef float FFT_SCALAR;
@ -32,6 +33,7 @@ class GridCommKokkos : protected Pointers {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
GridCommKokkos(class LAMMPS *, MPI_Comm, int, int,
int, int, int, int, int, int,
@ -70,8 +72,8 @@ class GridCommKokkos : protected Pointers {
int nbuf;
//FFT_SCALAR *buf1,*buf2;
DAT::tdual_FFT_SCALAR_1d k_buf1;
DAT::tdual_FFT_SCALAR_1d k_buf2;
FFT_DAT::tdual_FFT_SCALAR_1d k_buf1;
FFT_DAT::tdual_FFT_SCALAR_1d k_buf2;
struct Swap {
int sendproc; // proc to send to for forward comm

View File

@ -62,9 +62,7 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "kissfft_kokkos.h"
#include "fftdata_kokkos.h"
#include "kokkos_type.h"
#ifndef M_PI
#define M_PI 3.141592653589793238462643383279502884197169399375105820974944
@ -84,8 +82,8 @@
#define S_MUL(a,b) ( (a)*(b) )
#define C_MUL(m,a,a_index,b,b_index) \
do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\
(m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0)
do{ (m)[0] = (a)(a_index).re*(b)(b_index).re - (a)(a_index).im*(b)(b_index).im;\
(m)[1] = (a)(a_index).re*(b)(b_index).im + (a)(a_index).im*(b)(b_index).re; }while(0)
/*
#define C_FIXDIV(c,div) // NOOP
@ -126,8 +124,8 @@
#define kf_cexp(x,x_index,phase) \
do{ \
(x)(x_index,0) = KISS_FFT_COS(phase);\
(x)(x_index,1) = KISS_FFT_SIN(phase);\
(x)(x_index).re = KISS_FFT_COS(phase);\
(x)(x_index).im = KISS_FFT_SIN(phase);\
}while(0)
@ -139,25 +137,25 @@ namespace LAMMPS_NS {
template<class DeviceType>
struct kiss_fft_state_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
int nfft;
int inverse;
typename AT::t_int_64 d_factors;
typename AT::t_FFT_DATA_1d d_twiddles;
typename AT::t_FFT_DATA_1d d_scratch;
typename FFT_AT::t_int_64 d_factors;
typename FFT_AT::t_FFT_DATA_1d d_twiddles;
typename FFT_AT::t_FFT_DATA_1d d_scratch;
};
template<class DeviceType>
class KissFFTKokkos {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
KOKKOS_INLINE_FUNCTION
static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
static void kf_bfly2(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
{
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
typename FFT_AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR t[2];
int Fout2_count;
int tw1_count = 0;
@ -169,21 +167,21 @@ class KissFFTKokkos {
C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count);
tw1_count += fstride;
//C_SUB(*Fout2,*Fout,t);
d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0];
d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1];
d_Fout(Fout2_count).re = d_Fout(Fout_count).re - t[0];
d_Fout(Fout2_count).im = d_Fout(Fout_count).im - t[1];
//C_ADDTO(d_Fout[Fout_count],t);
d_Fout(Fout_count,0) += t[0];
d_Fout(Fout_count,1) += t[1];
d_Fout(Fout_count).re += t[0];
d_Fout(Fout_count).im += t[1];
++Fout2_count;
++Fout_count;
} while(--m);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
static void kf_bfly4(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, const size_t m, int Fout_count)
{
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
typename FFT_AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR scratch[6][2];
size_t k=m;
const size_t m2=2*m;
@ -200,11 +198,11 @@ class KissFFTKokkos {
C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count);
//C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] );
scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0];
scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1];
scratch[5][0] = d_Fout(Fout_count).re - scratch[1][0];
scratch[5][1] = d_Fout(Fout_count).im - scratch[1][1];
//C_ADDTO(d_Fout[Fout_count], scratch[1]);
d_Fout(Fout_count,0) += scratch[1][0];
d_Fout(Fout_count,1) += scratch[1][1];
d_Fout(Fout_count).re += scratch[1][0];
d_Fout(Fout_count).im += scratch[1][1];
//C_ADD(scratch[3],scratch[0],scratch[2]);
scratch[3][0] = scratch[0][0] + scratch[2][0];
scratch[3][1] = scratch[0][1] + scratch[2][1];
@ -212,43 +210,43 @@ class KissFFTKokkos {
scratch[4][0] = scratch[0][0] - scratch[2][0];
scratch[4][1] = scratch[0][1] - scratch[2][1];
//C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0];
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1];
d_Fout(Fout_count + m2).re = d_Fout(Fout_count).re - scratch[3][0];
d_Fout(Fout_count + m2).im = d_Fout(Fout_count).im - scratch[3][1];
tw1_count += fstride;
tw2_count += fstride*2;
tw3_count += fstride*3;
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count,0) += scratch[3][0];
d_Fout(Fout_count,1) += scratch[3][1];
d_Fout(Fout_count).re += scratch[3][0];
d_Fout(Fout_count).im += scratch[3][1];
if (st.inverse) {
d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0];
d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0];
d_Fout(Fout_count + m).re = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m).im = scratch[5][1] + scratch[4][0];
d_Fout(Fout_count + m3).re = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m3).im = scratch[5][1] - scratch[4][0];
} else{
d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0];
d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0];
d_Fout(Fout_count + m).re = scratch[5][0] + scratch[4][1];
d_Fout(Fout_count + m).im = scratch[5][1] - scratch[4][0];
d_Fout(Fout_count + m3).re = scratch[5][0] - scratch[4][1];
d_Fout(Fout_count + m3).im = scratch[5][1] + scratch[4][0];
}
++Fout_count;
} while(--k);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
static void kf_bfly3(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, size_t m, int Fout_count)
{
size_t k=m;
const size_t m2 = 2*m;
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
typename FFT_AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR scratch[5][2];
FFT_SCALAR epi3[2];
//C_EQ(epi3,d_twiddles[fstride*m]);
epi3[0] = d_twiddles(fstride*m,0);
epi3[1] = d_twiddles(fstride*m,1);
epi3[0] = d_twiddles(fstride*m).re;
epi3[1] = d_twiddles(fstride*m).im;
int tw1_count,tw2_count;
tw1_count = tw2_count = 0;
@ -268,41 +266,41 @@ class KissFFTKokkos {
tw1_count += fstride;
tw2_count += fstride*2;
d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]);
d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]);
d_Fout(Fout_count + m).re = d_Fout(Fout_count).re - HALF_OF(scratch[3][0]);
d_Fout(Fout_count + m).im = d_Fout(Fout_count).im - HALF_OF(scratch[3][1]);
//C_MULBYSCALAR(scratch[0],epi3[1]);
scratch[0][0] *= epi3[1];
scratch[0][1] *= epi3[1];
//C_ADDTO(d_Fout[Fout_count],scratch[3]);
d_Fout(Fout_count,0) += scratch[3][0];
d_Fout(Fout_count,1) += scratch[3][1];
d_Fout(Fout_count).re += scratch[3][0];
d_Fout(Fout_count).im += scratch[3][1];
d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1];
d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0];
d_Fout(Fout_count + m2).re = d_Fout(Fout_count + m).re + scratch[0][1];
d_Fout(Fout_count + m2).im = d_Fout(Fout_count + m).im - scratch[0][0];
d_Fout(Fout_count + m,0) -= scratch[0][1];
d_Fout(Fout_count + m,1) += scratch[0][0];
d_Fout(Fout_count + m).re -= scratch[0][1];
d_Fout(Fout_count + m).im += scratch[0][0];
++Fout_count;
} while(--k);
}
KOKKOS_INLINE_FUNCTION
static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
static void kf_bfly5(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
{
int u;
FFT_SCALAR scratch[13][2];
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
typename FFT_AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR ya[2],yb[2];
//C_EQ(ya,d_twiddles[fstride*m]);
ya[1] = d_twiddles(fstride*m,1);
ya[0] = d_twiddles(fstride*m,0);
ya[1] = d_twiddles(fstride*m).im;
ya[0] = d_twiddles(fstride*m).re;
//C_EQ(yb,d_twiddles[fstride*2*m]);
yb[1] = d_twiddles(fstride*2*m,1);
yb[0] = d_twiddles(fstride*2*m,0);
yb[1] = d_twiddles(fstride*2*m).im;
yb[0] = d_twiddles(fstride*2*m).re;
int Fout0_count=Fout_count;
int Fout1_count=Fout0_count+m;
@ -314,8 +312,8 @@ class KissFFTKokkos {
//C_FIXDIV( d_Fout[Fout0_count],5); C_FIXDIV( d_Fout[Fout1_count],5); C_FIXDIV( d_Fout[Fout2_count],5);
//C_FIXDIV( d_Fout[Fout3_count],5); C_FIXDIV( d_Fout[Fout4_count],5);
//C_EQ(scratch[0],d_Fout[Fout0_count]);
scratch[0][0] = d_Fout(Fout0_count,0);
scratch[0][1] = d_Fout(Fout0_count,1);
scratch[0][0] = d_Fout(Fout0_count).re;
scratch[0][1] = d_Fout(Fout0_count).im;
C_MUL(scratch[1],d_Fout,Fout1_count,d_twiddles,u*fstride );
C_MUL(scratch[2],d_Fout,Fout2_count,d_twiddles,2*u*fstride);
@ -335,8 +333,8 @@ class KissFFTKokkos {
scratch[9][0] = scratch[2][0] - scratch[3][0];
scratch[9][1] = scratch[2][1] - scratch[3][1];
d_Fout(Fout0_count,0) += scratch[7][0] + scratch[8][0];
d_Fout(Fout0_count,1) += scratch[7][1] + scratch[8][1];
d_Fout(Fout0_count).re += scratch[7][0] + scratch[8][0];
d_Fout(Fout0_count).im += scratch[7][1] + scratch[8][1];
scratch[5][0] = scratch[0][0] + S_MUL(scratch[7][0],ya[0]) + S_MUL(scratch[8][0],yb[0]);
scratch[5][1] = scratch[0][1] + S_MUL(scratch[7][1],ya[0]) + S_MUL(scratch[8][1],yb[0]);
@ -345,11 +343,11 @@ class KissFFTKokkos {
scratch[6][1] = -S_MUL(scratch[10][0],ya[1]) - S_MUL(scratch[9][0],yb[1]);
//C_SUB(d_Fout[Fout1_count],scratch[5],scratch[6]);
d_Fout(Fout1_count,0) = scratch[5][0] - scratch[6][0];
d_Fout(Fout1_count,1) = scratch[5][1] - scratch[6][1];
d_Fout(Fout1_count).re = scratch[5][0] - scratch[6][0];
d_Fout(Fout1_count).im = scratch[5][1] - scratch[6][1];
//C_ADD(d_Fout[Fout4_count],scratch[5],scratch[6]);
d_Fout(Fout4_count,0) = scratch[5][0] + scratch[6][0];
d_Fout(Fout4_count,1) = scratch[5][1] + scratch[6][1];
d_Fout(Fout4_count).re = scratch[5][0] + scratch[6][0];
d_Fout(Fout4_count).im = scratch[5][1] + scratch[6][1];
scratch[11][0] = scratch[0][0] + S_MUL(scratch[7][0],yb[0]) + S_MUL(scratch[8][0],ya[0]);
scratch[11][1] = scratch[0][1] + S_MUL(scratch[7][1],yb[0]) + S_MUL(scratch[8][1],ya[0]);
@ -357,11 +355,11 @@ class KissFFTKokkos {
scratch[12][1] = S_MUL(scratch[10][0],yb[1]) - S_MUL(scratch[9][0],ya[1]);
//C_ADD(d_Fout[Fout2_count],scratch[11],scratch[12]);
d_Fout(Fout2_count,0) = scratch[11][0] + scratch[12][0];
d_Fout(Fout2_count,1) = scratch[11][1] + scratch[12][1];
d_Fout(Fout2_count).re = scratch[11][0] + scratch[12][0];
d_Fout(Fout2_count).im = scratch[11][1] + scratch[12][1];
//C_SUB(d_Fout3[Fout3_count],scratch[11],scratch[12]);
d_Fout(Fout3_count,0) = scratch[11][0] - scratch[12][0];
d_Fout(Fout3_count,1) = scratch[11][1] - scratch[12][1];
d_Fout(Fout3_count).re = scratch[11][0] - scratch[12][0];
d_Fout(Fout3_count).im = scratch[11][1] - scratch[12][1];
++Fout0_count;++Fout1_count;++Fout2_count;++Fout3_count;++Fout4_count;
}
@ -370,21 +368,21 @@ class KissFFTKokkos {
/* perform the butterfly for one stage of a mixed radix FFT */
KOKKOS_INLINE_FUNCTION
static void kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
static void kf_bfly_generic(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
const kiss_fft_state_kokkos<DeviceType> &st, int m, int p, int Fout_count)
{
int u,k,q1,q;
typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
typename FFT_AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
FFT_SCALAR t[2];
int Norig = st.nfft;
typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch;
typename FFT_AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch;
for ( u=0; u<m; ++u ) {
k=u;
for ( q1=0 ; q1<p ; ++q1 ) {
//C_EQ(d_scratch[q1],d_Fout[k]);
d_scratch(q1,0) = d_Fout(Fout_count + k,0);
d_scratch(q1,1) = d_Fout(Fout_count + k,1);
d_scratch(q1).re = d_Fout(Fout_count + k).re;
d_scratch(q1).im = d_Fout(Fout_count + k).im;
//C_FIXDIV(d_scratch[q1],p);
k += m;
}
@ -393,15 +391,15 @@ class KissFFTKokkos {
for ( q1=0 ; q1<p ; ++q1 ) {
int twidx=0;
//C_EQ(d_Fout[k],d_scratch[0]);
d_Fout(Fout_count + k,0) = d_scratch(0,0);
d_Fout(Fout_count + k,1) = d_scratch(0,1);
d_Fout(Fout_count + k).re = d_scratch(0).re;
d_Fout(Fout_count + k).im = d_scratch(0).im;
for (q=1;q<p;++q ) {
twidx += fstride * k;
if (twidx>=Norig) twidx-=Norig;
C_MUL(t,d_scratch,q,d_twiddles,twidx);
//C_ADDTO(d_Fout[k],t);
d_Fout(Fout_count + k,0) += t[0];
d_Fout(Fout_count + k,1) += t[1];
d_Fout(Fout_count + k).re += t[0];
d_Fout(Fout_count + k).im += t[1];
}
k += m;
}
@ -409,9 +407,9 @@ class KissFFTKokkos {
}
KOKKOS_INLINE_FUNCTION
static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f,
static void kf_work(typename FFT_AT::t_FFT_DATA_1d_um &d_Fout, const typename FFT_AT::t_FFT_DATA_1d_um &d_f,
const size_t fstride, int in_stride,
const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos<DeviceType> &st, int Fout_count, int f_count, int factors_count)
const typename FFT_AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos<DeviceType> &st, int Fout_count, int f_count, int factors_count)
{
const int beg = Fout_count;
const int p = d_factors[factors_count++]; /* the radix */
@ -421,8 +419,8 @@ class KissFFTKokkos {
if (m == 1) {
do {
//C_EQ(d_Fout[Fout_count],d_f[f_count]);
d_Fout(Fout_count,0) = d_f(f_count,0);
d_Fout(Fout_count,1) = d_f(f_count,1);
d_Fout(Fout_count).re = d_f(f_count).re;
d_Fout(Fout_count).im = d_f(f_count).im;
f_count += fstride*in_stride;
} while (++Fout_count != end);
} else {
@ -453,7 +451,7 @@ class KissFFTKokkos {
p[i] * m[i] = m[i-1]
m0 = n */
static int kf_factor(int n, HAT::t_int_64 h_facbuf)
static int kf_factor(int n, FFT_HAT::t_int_64 h_facbuf)
{
int p=4, nf=0;
double floor_sqrt;
@ -497,12 +495,12 @@ class KissFFTKokkos {
st.nfft = nfft;
st.inverse = inverse_fft;
typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64();
typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d();
typename FFT_AT::tdual_int_64 k_factors = typename FFT_AT::tdual_int_64();
typename FFT_AT::tdual_FFT_DATA_1d k_twiddles = typename FFT_AT::tdual_FFT_DATA_1d();
if (nfft > 0) {
k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2);
k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft);
k_factors = typename FFT_AT::tdual_int_64("kissfft:factors",MAXFACTORS*2);
k_twiddles = typename FFT_AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft);
for (i=0;i<nfft;++i) {
const double phase = (st.inverse ? 2.0*M_PI:-2.0*M_PI)*i / nfft;
@ -510,7 +508,7 @@ class KissFFTKokkos {
}
int p_max = kf_factor(nfft,k_factors.h_view);
st.d_scratch = typename AT::t_FFT_DATA_1d("kissfft:scratch",p_max);
st.d_scratch = typename FFT_AT::t_FFT_DATA_1d("kissfft:scratch",p_max);
}
k_factors.template modify<LMPHostType>();
@ -525,13 +523,13 @@ class KissFFTKokkos {
}
KOKKOS_INLINE_FUNCTION
static void kiss_fft_stride(const kiss_fft_state_kokkos<DeviceType> &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset)
static void kiss_fft_stride(const kiss_fft_state_kokkos<DeviceType> &st, const typename FFT_AT::t_FFT_DATA_1d_um &d_fin, typename FFT_AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset)
{
//if (d_fin.data() == d_fout.data()) {
// // NOTE: this is not really an in-place FFT algorithm.
// // It just performs an out-of-place FFT into a temp buffer
// typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1));
// kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
// typename FFT_AT::t_FFT_DATA_1d_um d_tmpbuf = typename FFT_AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1));
// kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset).re;
// Kokkos::deep_copy(d_fout,d_tmpbuf);
//} else {
kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
@ -539,7 +537,7 @@ class KissFFTKokkos {
}
KOKKOS_INLINE_FUNCTION
static void kiss_fft_kokkos(const kiss_fft_state_kokkos<DeviceType> &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset)
static void kiss_fft_kokkos(const kiss_fft_state_kokkos<DeviceType> &cfg, const typename FFT_AT::t_FFT_DATA_1d_um d_fin, typename FFT_AT::t_FFT_DATA_1d_um d_fout, int offset)
{
kiss_fft_stride(cfg,d_fin,d_fout,1,offset);
}

View File

@ -22,12 +22,6 @@ class KokkosBase {
public:
KokkosBase() {}
//Kspace
virtual void pack_forward_kspace_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_forward_kspace_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void pack_reverse_kspace_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_reverse_kspace_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
// Pair
virtual int pack_forward_comm_kokkos(int, DAT::tdual_int_2d,
int, DAT::tdual_xfloat_1d &,

View File

@ -0,0 +1,38 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifndef LMP_KOKKOS_BASE_FFT_H
#define LMP_KOKKOS_BASE_FFT_H
#include "fftdata_kokkos.h"
namespace LAMMPS_NS {
class KokkosBaseFFT {
public:
KokkosBaseFFT() {}
//Kspace
virtual void pack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_forward_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void pack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
virtual void unpack_reverse_kspace_kokkos(int, FFT_DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
};
}
#endif
/* ERROR/WARNING messages:
*/

View File

@ -29,21 +29,6 @@ enum{FULL=1u,HALFTHREAD=2u,HALF=4u,N2=8u};
#define ISFINITE(x) std::isfinite(x)
#endif
// User-settable FFT precision
// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
#ifdef FFT_SINGLE
#define FFT_PRECISION 1
#define MPI_FFT_SCALAR MPI_FLOAT
typedef float FFT_SCALAR;
#else
#define FFT_PRECISION 2
#define MPI_FFT_SCALAR MPI_DOUBLE
typedef double FFT_SCALAR;
#endif
#define MAX_TYPES_STACKPARAMS 12
#define NeighClusterSize 8
@ -752,32 +737,6 @@ typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um;
typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um;
typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread;
//Kspace
typedef Kokkos::
DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_dev t_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_dev_um t_FFT_SCALAR_1d_um;
typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
typedef tdual_FFT_SCALAR_2d::t_dev t_FFT_SCALAR_2d;
typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
typedef tdual_FFT_SCALAR_2d_3::t_dev t_FFT_SCALAR_2d_3;
typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
typedef tdual_FFT_SCALAR_3d::t_dev t_FFT_SCALAR_3d;
typedef Kokkos::
DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_dev t_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_dev_um t_FFT_DATA_1d_um;
typedef Kokkos::
DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
typedef tdual_int_64::t_dev t_int_64;
typedef tdual_int_64::t_dev_um t_int_64_um;
};
#ifdef KOKKOS_ENABLE_CUDA
@ -1012,33 +971,6 @@ typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um;
typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um;
typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread;
//Kspace
typedef Kokkos::
DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_host t_FFT_SCALAR_1d;
typedef tdual_FFT_SCALAR_1d::t_host_um t_FFT_SCALAR_1d_um;
typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
typedef tdual_FFT_SCALAR_2d::t_host t_FFT_SCALAR_2d;
typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
typedef tdual_FFT_SCALAR_2d_3::t_host t_FFT_SCALAR_2d_3;
typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
typedef tdual_FFT_SCALAR_3d::t_host t_FFT_SCALAR_3d;
typedef Kokkos::
DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_host t_FFT_DATA_1d;
typedef tdual_FFT_DATA_1d::t_host_um t_FFT_DATA_1d_um;
typedef Kokkos::
DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
typedef tdual_int_64::t_host t_int_64;
typedef tdual_int_64::t_host_um t_int_64_um;
};
#endif
//default LAMMPS Types

View File

@ -29,7 +29,7 @@
pack from data -> buf
------------------------------------------------------------------------- */
#include "kokkos_type.h"
#include "fftdata_kokkos.h"
namespace LAMMPS_NS {
@ -37,13 +37,13 @@ template<class DeviceType>
class PackKokkos {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
struct pack_3d_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -51,7 +51,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
pack_3d_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -79,7 +79,7 @@ public:
}
};
static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan)
static void pack_3d(typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -97,7 +97,7 @@ struct unpack_3d_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -105,7 +105,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -133,7 +133,7 @@ public:
}
};
static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -152,7 +152,7 @@ struct unpack_3d_permute1_1_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -160,7 +160,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute1_1_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -188,7 +188,7 @@ public:
}
};
static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute1_1(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -205,7 +205,7 @@ struct unpack_3d_permute1_2_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -213,7 +213,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute1_2_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -242,7 +242,7 @@ public:
}
};
static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute1_2(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -260,7 +260,7 @@ struct unpack_3d_permute1_n_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -269,7 +269,7 @@ public:
int nstride_plane; // stride between successive slow indices
int nqty; // # of values/element
unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute1_n_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -298,7 +298,7 @@ public:
}
};
static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute1_n(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -316,7 +316,7 @@ struct unpack_3d_permute2_1_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -324,7 +324,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute2_1_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -351,7 +351,7 @@ public:
}
};
static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute2_1(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -369,7 +369,7 @@ struct unpack_3d_permute2_2_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -377,7 +377,7 @@ public:
int nstride_line; // stride between successive mid indices
int nstride_plane; // stride between successive slow indices
unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute2_2_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -405,7 +405,7 @@ public:
}
};
static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute2_2(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;
@ -422,7 +422,7 @@ struct unpack_3d_permute2_n_functor {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf,d_data;
int buf_offset,data_offset;
int nfast; // # of elements in fast index
int nmid; // # of elements in mid index
@ -431,7 +431,7 @@ public:
int nstride_plane; // stride between successive slow indices
int nqty; // # of values/element
unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
unpack_3d_permute2_n_functor(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename FFT_AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
d_buf(d_buf_),
d_data(d_data_)
{
@ -459,7 +459,7 @@ public:
}
};
static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
static void unpack_3d_permute2_n(typename FFT_AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename FFT_AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
{
const int nslow = plan->nslow;
const int nmid = plan->nmid;

View File

@ -669,7 +669,7 @@ void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
nmax = atomKK->nmax;
//memory->create(part2grid,nmax,3,"pppm:part2grid");
d_part2grid = typename AT::t_int_1d_3("pppm:part2grid",nmax);
d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
d_rho1d = typename FFT_AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
}
// find grid points for all my particles
@ -799,7 +799,7 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_self2, const int &i) const
template<class DeviceType>
void PPPMKokkos<DeviceType>::allocate()
{
d_density_brick = typename AT::t_FFT_SCALAR_3d("pppm:density_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_density_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:density_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
memoryKK->create_kokkos(k_density_fft,density_fft,nfft_both,"pppm:d_density_fft");
d_density_fft = k_density_fft.view<DeviceType>();
@ -821,17 +821,17 @@ void PPPMKokkos<DeviceType>::allocate()
d_fkz = typename AT::t_float_1d("pppm:d_fkz",nfft_both);
}
d_vdx_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdx_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_vdy_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdy_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_vdz_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdz_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_vdx_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_vdx_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_vdy_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_vdy_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_vdz_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_vdz_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
// summation coeffs
order_allocated = order;
k_gf_b = typename DAT::tdual_float_1d("pppm:gf_b",order);
d_gf_b = k_gf_b.view<DeviceType>();
d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
k_rho_coeff = DAT::tdual_FFT_SCALAR_2d("pppm:rho_coeff",order,order/2-(1-order)/2+1);
d_rho1d = typename FFT_AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
k_rho_coeff = FFT_DAT::tdual_FFT_SCALAR_2d("pppm:rho_coeff",order,order/2-(1-order)/2+1);
d_rho_coeff = k_rho_coeff.view<DeviceType>();
h_rho_coeff = k_rho_coeff.h_view;
@ -902,14 +902,14 @@ void PPPMKokkos<DeviceType>::allocate_peratom()
{
peratom_allocate_flag = 1;
d_u_brick = typename AT::t_FFT_SCALAR_3d("pppm:u_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_u_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:u_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v0_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v0_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v1_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v1_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v2_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v2_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v3_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v3_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v4_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v4_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v5_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v0_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v0_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v1_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v1_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v2_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v2_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v3_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v3_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v4_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v4_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
d_v5_brick = typename FFT_AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
// create ghost grid object for rho and electric field communication

View File

@ -25,7 +25,8 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
#include "gridcomm_kokkos.h"
#include "remap_kokkos.h"
#include "fft3d_kokkos.h"
#include "kokkos_base.h"
#include "kokkos_base_fft.h"
#include "fftdata_kokkos.h"
#include "kokkos_type.h"
// fix up FFT defines for KOKKOS with CUDA
@ -107,10 +108,11 @@ struct TagPPPM_slabcorr4{};
struct TagPPPM_timing_zero{};
template<class DeviceType>
class PPPMKokkos : public PPPM, public KokkosBase {
class PPPMKokkos : public PPPM, public KokkosBaseFFT {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
PPPMKokkos(class LAMMPS *);
virtual ~PPPMKokkos();
@ -308,7 +310,7 @@ class PPPMKokkos : public PPPM, public KokkosBase {
int nx,ny,nz;
typename AT::t_int_1d_um d_list_index;
typename AT::t_FFT_SCALAR_1d_um d_buf;
typename FFT_AT::t_FFT_SCALAR_1d_um d_buf;
DAT::tdual_int_scalar k_flag;
@ -323,31 +325,31 @@ class PPPMKokkos : public PPPM, public KokkosBase {
int factors[3];
typename AT::t_FFT_SCALAR_3d d_density_brick;
typename AT::t_FFT_SCALAR_3d d_vdx_brick,d_vdy_brick,d_vdz_brick;
typename AT::t_FFT_SCALAR_3d d_u_brick;
typename AT::t_FFT_SCALAR_3d d_v0_brick,d_v1_brick,d_v2_brick;
typename AT::t_FFT_SCALAR_3d d_v3_brick,d_v4_brick,d_v5_brick;
typename FFT_AT::t_FFT_SCALAR_3d d_density_brick;
typename FFT_AT::t_FFT_SCALAR_3d d_vdx_brick,d_vdy_brick,d_vdz_brick;
typename FFT_AT::t_FFT_SCALAR_3d d_u_brick;
typename FFT_AT::t_FFT_SCALAR_3d d_v0_brick,d_v1_brick,d_v2_brick;
typename FFT_AT::t_FFT_SCALAR_3d d_v3_brick,d_v4_brick,d_v5_brick;
typename AT::t_float_1d d_greensfn;
typename AT::t_virial_array d_vg;
typename AT::t_float_1d d_fkx;
typename AT::t_float_1d d_fky;
typename AT::t_float_1d d_fkz;
DAT::tdual_FFT_SCALAR_1d k_density_fft;
DAT::tdual_FFT_SCALAR_1d k_work1;
DAT::tdual_FFT_SCALAR_1d k_work2;
typename AT::t_FFT_SCALAR_1d d_density_fft;
typename AT::t_FFT_SCALAR_1d d_work1;
typename AT::t_FFT_SCALAR_1d d_work2;
FFT_DAT::tdual_FFT_SCALAR_1d k_density_fft;
FFT_DAT::tdual_FFT_SCALAR_1d k_work1;
FFT_DAT::tdual_FFT_SCALAR_1d k_work2;
typename FFT_AT::t_FFT_SCALAR_1d d_density_fft;
typename FFT_AT::t_FFT_SCALAR_1d d_work1;
typename FFT_AT::t_FFT_SCALAR_1d d_work2;
DAT::tdual_float_1d k_gf_b;
typename AT::t_float_1d d_gf_b;
//FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff;
typename AT::t_FFT_SCALAR_2d_3 d_rho1d;
DAT::tdual_FFT_SCALAR_2d k_rho_coeff;
typename AT::t_FFT_SCALAR_2d d_rho_coeff;
HAT::t_FFT_SCALAR_2d h_rho_coeff;
typename FFT_AT::t_FFT_SCALAR_2d_3 d_rho1d;
FFT_DAT::tdual_FFT_SCALAR_2d k_rho_coeff;
typename FFT_AT::t_FFT_SCALAR_2d d_rho_coeff;
FFT_HAT::t_FFT_SCALAR_2d h_rho_coeff;
//double **acons;
typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;

View File

@ -58,7 +58,7 @@ RemapKokkos<DeviceType>::~RemapKokkos()
/* ---------------------------------------------------------------------- */
template<class DeviceType>
void RemapKokkos<DeviceType>::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf)
void RemapKokkos<DeviceType>::perform(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, typename FFT_AT::t_FFT_SCALAR_1d d_buf)
{
remap_3d_kokkos(d_in,d_out,d_buf,plan);
}
@ -102,7 +102,7 @@ void RemapKokkos<DeviceType>::perform(typename AT::t_FFT_SCALAR_1d d_in, typenam
------------------------------------------------------------------------- */
template<class DeviceType>
void RemapKokkos<DeviceType>::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf,
void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d_in, typename FFT_AT::t_FFT_SCALAR_1d d_out, typename FFT_AT::t_FFT_SCALAR_1d d_buf,
struct remap_plan_3d_kokkos<DeviceType> *plan)
{
// collective flag not yet supported
@ -110,7 +110,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in,
// use point-to-point communication
int i,isend,irecv;
typename AT::t_FFT_SCALAR_1d d_scratch;
typename FFT_AT::t_FFT_SCALAR_1d d_scratch;
if (plan->memory == 0)
d_scratch = d_buf;
@ -442,7 +442,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
size = MAX(size,plan->send_size[nsend]);
if (size) {
plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
plan->d_sendbuf = typename FFT_AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
if (!plan->d_sendbuf.data()) return NULL;
}
@ -452,7 +452,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
if (memory == 1) {
if (nrecv > 0) {
plan->d_scratch =
typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
typename FFT_AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
if (!plan->d_scratch.data()) return NULL;
}
}

View File

@ -18,7 +18,6 @@
#include <mpi.h>
#include "fftdata_kokkos.h"
#include "remap.h"
#include "kokkos_type.h"
namespace LAMMPS_NS {
@ -27,12 +26,12 @@ namespace LAMMPS_NS {
template<class DeviceType>
struct remap_plan_3d_kokkos {
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typename AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
typename AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which pack function to use
void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which unpack function to use
int *send_offset; // extraction loc for each send
int *send_size; // size of each send message
@ -58,16 +57,16 @@ template<class DeviceType>
class RemapKokkos : protected Pointers {
public:
typedef DeviceType device_type;
typedef ArrayTypes<DeviceType> AT;
typedef FFTArrayTypes<DeviceType> FFT_AT;
RemapKokkos(class LAMMPS *);
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
int,int,int,int,int,int,int,int,int,int,int);
~RemapKokkos();
void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d);
void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
struct remap_plan_3d_kokkos<DeviceType> *plan;
void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos<DeviceType> *);
void remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos<DeviceType> *);
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
int, int, int, int, int, int,
int, int, int, int, int, int,

View File

@ -107,13 +107,6 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
DftiComputeForward(plan->handle_fast,data);
else
DftiComputeBackward(plan->handle_fast,data);
/*
#elif defined(FFT_FFTW2)
if (flag == -1)
fftw(plan->plan_fast_forward,total/length,data,1,length,NULL,0,0);
else
fftw(plan->plan_fast_backward,total/length,data,1,length,NULL,0,0);
*/
#elif defined(FFT_FFTW3)
if (flag == -1)
theplan=plan->plan_fast_forward;
@ -148,13 +141,6 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
DftiComputeForward(plan->handle_mid,data);
else
DftiComputeBackward(plan->handle_mid,data);
/*
#elif defined(FFT_FFTW2)
if (flag == -1)
fftw(plan->plan_mid_forward,total/length,data,1,length,NULL,0,0);
else
fftw(plan->plan_mid_backward,total/length,data,1,length,NULL,0,0);
*/
#elif defined(FFT_FFTW3)
if (flag == -1)
theplan=plan->plan_mid_forward;
@ -189,13 +175,6 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
DftiComputeForward(plan->handle_slow,data);
else
DftiComputeBackward(plan->handle_slow,data);
/*
#elif defined(FFT_FFTW2)
if (flag == -1)
fftw(plan->plan_slow_forward,total/length,data,1,length,NULL,0,0);
else
fftw(plan->plan_slow_backward,total/length,data,1,length,NULL,0,0);
*/
#elif defined(FFT_FFTW3)
if (flag == -1)
theplan=plan->plan_slow_forward;
@ -231,7 +210,7 @@ void fft_3d(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
*(out_ptr++) *= norm;
#elif defined(FFT_MKL)
out[i] *= norm;
#else
#else /* FFT_KISS */
out[i].re *= norm;
out[i].im *= norm;
#endif
@ -508,6 +487,9 @@ struct fft_plan_3d *fft_3d_create_plan(
DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_fast);
DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
@ -517,6 +499,9 @@ struct fft_plan_3d *fft_3d_create_plan(
DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_mid);
DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
@ -526,61 +511,11 @@ struct fft_plan_3d *fft_3d_create_plan(
DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
#if defined(FFT_MKL_THREADS)
DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
#endif
DftiCommitDescriptor(plan->handle_slow);
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
/*
#elif defined(FFT_FFTW2)
plan->plan_fast_forward =
fftw_create_plan(nfast,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
plan->plan_fast_backward =
fftw_create_plan(nfast,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
if (nmid == nfast) {
plan->plan_mid_forward = plan->plan_fast_forward;
plan->plan_mid_backward = plan->plan_fast_backward;
}
else {
plan->plan_mid_forward =
fftw_create_plan(nmid,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
plan->plan_mid_backward =
fftw_create_plan(nmid,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
}
if (nslow == nfast) {
plan->plan_slow_forward = plan->plan_fast_forward;
plan->plan_slow_backward = plan->plan_fast_backward;
}
else if (nslow == nmid) {
plan->plan_slow_forward = plan->plan_mid_forward;
plan->plan_slow_backward = plan->plan_mid_backward;
}
else {
plan->plan_slow_forward =
fftw_create_plan(nslow,FFTW_FORWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
plan->plan_slow_backward =
fftw_create_plan(nslow,FFTW_BACKWARD,FFTW_ESTIMATE | FFTW_IN_PLACE);
}
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
*/
#elif defined(FFT_FFTW3)
#if defined(FFT_FFTW_THREADS)
if (nthreads > 1) {
@ -620,15 +555,8 @@ struct fft_plan_3d *fft_3d_create_plan(
NULL,&nslow,1,plan->length3,
FFTW_BACKWARD,FFTW_ESTIMATE);
if (scaled == 0)
plan->scaled = 0;
else {
plan->scaled = 1;
plan->norm = 1.0/(nfast*nmid*nslow);
plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
(out_khi-out_klo+1);
}
#else
#else /* FFT_KISS */
plan->cfg_fast_forward = kiss_fft_alloc(nfast,0,NULL,NULL);
plan->cfg_fast_backward = kiss_fft_alloc(nfast,1,NULL,NULL);
@ -654,6 +582,8 @@ struct fft_plan_3d *fft_3d_create_plan(
plan->cfg_slow_backward = kiss_fft_alloc(nslow,1,NULL,NULL);
}
#endif
if (scaled == 0)
plan->scaled = 0;
else {
@ -663,8 +593,6 @@ struct fft_plan_3d *fft_3d_create_plan(
(out_khi-out_klo+1);
}
#endif
return plan;
}
@ -686,20 +614,6 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan)
DftiFreeDescriptor(&(plan->handle_fast));
DftiFreeDescriptor(&(plan->handle_mid));
DftiFreeDescriptor(&(plan->handle_slow));
/*
#elif defined(FFT_FFTW2)
if (plan->plan_slow_forward != plan->plan_fast_forward &&
plan->plan_slow_forward != plan->plan_mid_forward) {
fftw_destroy_plan(plan->plan_slow_forward);
fftw_destroy_plan(plan->plan_slow_backward);
}
if (plan->plan_mid_forward != plan->plan_fast_forward) {
fftw_destroy_plan(plan->plan_mid_forward);
fftw_destroy_plan(plan->plan_mid_backward);
}
fftw_destroy_plan(plan->plan_fast_forward);
fftw_destroy_plan(plan->plan_fast_backward);
*/
#elif defined(FFT_FFTW3)
FFTW_API(destroy_plan)(plan->plan_slow_forward);
FFTW_API(destroy_plan)(plan->plan_slow_backward);
@ -840,18 +754,6 @@ void fft_1d_only(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan)
DftiComputeBackward(plan->handle_mid,data);
DftiComputeBackward(plan->handle_slow,data);
}
/*
#elif defined(FFT_FFTW2)
if (flag == -1) {
fftw(plan->plan_fast_forward,total1/length1,data,1,0,NULL,0,0);
fftw(plan->plan_mid_forward,total2/length2,data,1,0,NULL,0,0);
fftw(plan->plan_slow_forward,total3/length3,data,1,0,NULL,0,0);
} else {
fftw(plan->plan_fast_backward,total1/length1,data,1,0,NULL,0,0);
fftw(plan->plan_mid_backward,total2/length2,data,1,0,NULL,0,0);
fftw(plan->plan_slow_backward,total3/length3,data,1,0,NULL,0,0);
}
*/
#elif defined(FFT_FFTW3)
FFTW_API(plan) theplan;
if (flag == -1)

View File

@ -43,14 +43,6 @@ typedef double FFT_SCALAR;
typedef float _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_SINGLE
//#elif defined(FFT_FFTW2)
//#if defined(FFTW_SIZE)
//#include "sfftw.h"
//#else
//#include "fftw.h"
//#endif
//typedef FFTW_COMPLEX FFT_DATA;
#elif defined(FFT_FFTW3)
#include "fftw3.h"
typedef fftwf_complex FFT_DATA;
@ -84,14 +76,6 @@ typedef struct kiss_fft_state* kiss_fft_cfg;
typedef double _Complex FFT_DATA;
#define FFT_MKL_PREC DFTI_DOUBLE
//#elif defined(FFT_FFTW2)
//#if defined(FFTW_SIZE)
//#include "dfftw.h"
//#else
//#include "fftw.h"
//#endif
//typedef FFTW_COMPLEX FFT_DATA;
#elif defined(FFT_FFTW3)
#include "fftw3.h"
typedef fftw_complex FFT_DATA;
@ -141,13 +125,6 @@ struct fft_plan_3d {
DFTI_DESCRIPTOR *handle_fast;
DFTI_DESCRIPTOR *handle_mid;
DFTI_DESCRIPTOR *handle_slow;
//#elif defined(FFT_FFTW2)
// fftw_plan plan_fast_forward;
// fftw_plan plan_fast_backward;
// fftw_plan plan_mid_forward;
// fftw_plan plan_mid_backward;
//fftw_plan plan_slow_forward;
//fftw_plan plan_slow_backward;
#elif defined(FFT_FFTW3)
FFTW_API(plan) plan_fast_forward;
FFTW_API(plan) plan_fast_backward;