Merge branch 'master' into remove-txt-files

# Conflicts: # doc/src/Build_settings.rst
2020-01-24 21:32:24 -05:00
parent 0302c0ba24 4134d7fddd
commit ec381964e4
26 changed files with 2980 additions and 167 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -739,4 +739,20 @@ if(PKG_KSPACE)
  else()
    message(STATUS "Using double precision FFTs")
  endif()
+  if(FFT_FFTW_THREADS)
+    message(STATUS "Using threaded FFTs")
+  else()
+    message(STATUS "Using non-threaded FFTs")
+  endif()
+  if(PKG_KOKKOS)
+    if(KOKKOS_ENABLE_CUDA)
+      if (${FFT} STREQUAL "KISS")
+        message(STATUS "Kokkos FFT: KISS")
+      else()
+        message(STATUS "Kokkos FFT: cuFFT")
+      endif()
+    else()
+      message(STATUS "Kokkos FFT: ${FFT}")
+    endif()
+  endif()
 endif()
--- a/cmake/Modules/FindFFTW3.cmake
+++ b/cmake/Modules/FindFFTW3.cmake
@ -1,20 +1,22 @@
 # - Find fftw3
-# Find the native FFTW3 headers and libraries.
+# Find the native double precision FFTW3 headers and libraries.
 #
-#  FFTW3_INCLUDE_DIRS - where to find fftw3.h, etc.
-#  FFTW3_LIBRARIES    - List of libraries when using fftw3.
-#  FFTW3_FOUND        - True if fftw3 found.
+#  FFTW3_INCLUDE_DIRS  - where to find fftw3.h, etc.
+#  FFTW3_LIBRARIES     - List of libraries when using fftw3.
+#  FFTW3_OMP_LIBRARIES - List of libraries when using fftw3.
+#  FFTW3_FOUND         - True if fftw3 found.
 #

 find_package(PkgConfig)

 pkg_check_modules(PC_FFTW3 fftw3)
 find_path(FFTW3_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3_INCLUDE_DIRS})
-
 find_library(FFTW3_LIBRARY NAMES fftw3 HINTS ${PC_FFTW3_LIBRARY_DIRS})
+find_library(FFTW3_OMP_LIBRARY NAMES fftw3_omp HINTS ${PC_FFTW3_LIBRARY_DIRS})

-set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
 set(FFTW3_INCLUDE_DIRS ${FFTW3_INCLUDE_DIR})
+set(FFTW3_LIBRARIES ${FFTW3_LIBRARY})
+set(FFTW3_OMP_LIBRARIES ${FFTW3_OMP_LIBRARY})

 include(FindPackageHandleStandardArgs)
 # handle the QUIETLY and REQUIRED arguments and set FFTW3_FOUND to TRUE
@ -22,4 +24,4 @@ include(FindPackageHandleStandardArgs)

 find_package_handle_standard_args(FFTW3 DEFAULT_MSG FFTW3_LIBRARY FFTW3_INCLUDE_DIR)

-mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY )
+mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_LIBRARY FFTW3_OMP_LIBRARY)
--- a/cmake/Modules/FindFFTW3F.cmake
+++ b/cmake/Modules/FindFFTW3F.cmake
@ -1,8 +1,8 @@
-# - Find fftw3f
-# Find the native FFTW3F headers and libraries.
+# Find the native single precision FFTW3 headers and libraries.
 #
 #  FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc.
 #  FFTW3F_LIBRARIES    - List of libraries when using fftw3f.
+#  FFTW3F_OMP_LIBRARIES - List of libraries when using fftw3.
 #  FFTW3F_FOUND        - True if fftw3f found.
 #

@ -10,11 +10,12 @@ find_package(PkgConfig)

 pkg_check_modules(PC_FFTW3F fftw3f)
 find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS})
-
 find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS})
+find_library(FFTW3F_OMP_LIBRARY NAMES fftw3f_omp HINTS ${PC_FFTW3F_LIBRARY_DIRS})

-set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
 set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR})
+set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
+set(FFTW3F_OMP_LIBRARIES ${FFTW3F_OMP_LIBRARY})

 include(FindPackageHandleStandardArgs)
 # handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE
@ -22,4 +23,4 @@ include(FindPackageHandleStandardArgs)

 find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR)

-mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY )
+mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY)
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -10,7 +10,7 @@ if(PKG_KOKKOS)
    set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
    set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
    add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
-  
+
    set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
                            ${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
                            ${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
@ -39,7 +39,15 @@ if(PKG_KOKKOS)
                         ${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)

  if(PKG_KSPACE)
-    list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp)
+    list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/fft3d_kokkos.cpp
+                                   ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp
+                                   ${KOKKOS_PKG_SOURCES_DIR}/remap_kokkos.cpp)
+    if(KOKKOS_ENABLE_CUDA)
+      if(NOT ${FFT} STREQUAL "KISS")
+        add_definitions(-DFFT_CUFFT)
+        list(APPEND LAMMPS_LINK_LIBS cufft)
+      endif()
+    endif()
  endif()

  set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
--- a/cmake/Modules/Packages/KSPACE.cmake
+++ b/cmake/Modules/Packages/KSPACE.cmake
@ -21,6 +21,20 @@ if(PKG_KSPACE)
    add_definitions(-DFFT_FFTW3)
    include_directories(${${FFTW}_INCLUDE_DIRS})
    list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_LIBRARIES})
+    if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
+      option(FFT_FFTW_THREADS "Use threaded FFT library" ON)
+    else()
+      option(FFT_FFTW_THREADS "Use threaded FFT library" OFF)
+    endif()
+
+    if(FFT_FFTW_THREADS)
+      if(FFTW3_OMP_LIBRARY OR FFTW3F_OMP_LIBRARY)
+        add_definitions(-DFFT_FFTW_THREADS)
+        list(APPEND LAMMPS_LINK_LIBS ${${FFTW}_OMP_LIBRARIES})
+      else()
+        message(FATAL_ERROR "Need OpenMP enabled FFTW3 library for FFT_THREADS")
+      endif()
+    endif()
  elseif(FFT STREQUAL "MKL")
    find_package(MKL REQUIRED)
    add_definitions(-DFFT_MKL)
--- a/doc/lammps.1
+++ b/doc/lammps.1
@ -1,4 +1,4 @@
-.TH LAMMPS "9 January 2020" "2020-01-09"
+.TH LAMMPS "24 January 2020" "2020-01-24"
 .SH NAME
 .B LAMMPS
 \- Molecular Dynamics Simulator.
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@ -350,10 +350,12 @@ For NVIDIA GPUs using CUDA:
   KOKKOS_DEVICES = Cuda
   KOKKOS_ARCH = archCPU,archGPU    # archCPU = CPU from list above that is hosting the GPU
                                    # archGPU = GPU from list above
+   FFT_INC = -DFFT_CUFFT            # enable use of cuFFT (optional)
+   FFT_LIB = -lcufft                # link to cuFFT library

-For GPUs, you also need these 2 lines in your Makefile.machine before
-the CC line is defined, in this case for use with OpenMPI mpicxx.  The
-2 lines define a nvcc wrapper compiler, which will use nvcc for
+For GPUs, you also need the following 2 lines in your Makefile.machine
+before the CC line is defined, in this case for use with OpenMPI mpicxx.
+The 2 lines define a nvcc wrapper compiler, which will use nvcc for
 compiling CUDA files and use a C++ compiler for non-Kokkos, non-CUDA
 files.

--- a/doc/src/Build_settings.rst
+++ b/doc/src/Build_settings.rst
@ -49,13 +49,19 @@ through the CMAKE\_CXX\_FLAGS variable. Example for CentOS 7:

   -D CMAKE_CXX_FLAGS="-O3 -g -fopenmp -DNDEBUG -std=c++11"

-**Makefile.machine setting**\ :
+**Makefile.machine setting**\ to bypass the C++11 test and compile in C++98 mode:


 .. parsed-literal::

   LMP_INC = -DLAMMPS_CXX98

+**Makefile.machine setting**\ to enable the C++11 with older (but not too old) GNU c++ (e.g. on CentOS 7):
+
+
+.. parsed-literal::
+
+   CCFLAGS = -g -O3 -std=c++11

 ----------

@ -86,14 +92,19 @@ LAMMPS can use them if they are available on your system.
   an exception to the rule that all CMake variables can be specified
   with lower-case values.

-Usually these settings are all that is needed.  If CMake cannot find
-the FFT library, you can set these variables:
+Usually these settings are all that is needed.  If FFTW3 is selected,
+then CMake will try to detect, if threaded FFTW libraries are available
+and enable them by default.  This setting is independent of whether
+OpenMP threads are enabled and a packages like KOKKOS or USER-OMP is
+used.  If CMake cannot detect the FFT library, you can set these variables
+to assist:


 .. parsed-literal::

   -D FFTW3_INCLUDE_DIRS=path  # path to FFTW3 include files
   -D FFTW3_LIBRARIES=path     # path to FFTW3 libraries
+   -D FFT_FFTW_THREADS=on      # enable using threaded FFTW3 libraries
   -D MKL_INCLUDE_DIRS=path    # ditto for Intel MKL library
   -D MKL_LIBRARIES=path

@ -105,6 +116,7 @@ the FFT library, you can set these variables:
   FFT_INC = -DFFT_FFTW3         # -DFFT_FFTW3, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS
                                 # default is KISS if not specified
   FFT_INC = -DFFT_SINGLE        # do not specify for double precision
+   FFT_INC = -DFFT_FFTW_THREADS  # enable using threaded FFTW3 libraries
   FFT_INC = -DFFT_PACK_ARRAY    # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY

 # default is FFT\_PACK\_ARRAY if not specified
@ -115,6 +127,7 @@ the FFT library, you can set these variables:
   FFT_INC =       -I/usr/local/include
   FFT_PATH =      -L/usr/local/lib
   FFT_LIB =       -lfftw3             # FFTW3 double precision
+   FFT_LIB =       -lfftw3 -lfftw3_omp # FFTW3 double precision with threads (needs -DFFT_FFTW_THREADS)
   FFT_LIB =       -lfftw3 -lfftw3f    # FFTW3 single precision
   FFT_LIB =       -lmkl_intel_lp64 -lmkl_sequential -lmkl_core  # MKL with Intel compiler
   FFT_LIB =       -lmkl_gf_lp64 -lmkl_sequential -lmkl_core     # MKL with GNU compier
@ -126,16 +139,19 @@ FFT\_LIB with the appropriate FFT libraries to include in the link.
 **CMake and make info**\ :

 The `KISS FFT library <http://kissfft.sf.net>`_ is included in the LAMMPS
-distribution.  It is portable across all platforms.  Depending on the
-size of the FFTs and the number of processors used, the other
-libraries listed here can be faster.
+distribution.  It is portable across all platforms.  Depending on the size
+of the FFTs and the number of processors used, the other libraries listed
+here can be faster.

 However, note that long-range Coulombics are only a portion of the
 per-timestep CPU cost, FFTs are only a portion of long-range
 Coulombics, and 1d FFTs are only a portion of the FFT cost (parallel
 communication can be costly).  A breakdown of these timings is printed
-to the screen at the end of a run using the :doc:`kspace_style pppm <kspace_style>` command.  The :doc:`Run output <Run_output>`
-doc page gives more details.
+to the screen at the end of a run when using the
+:doc:`kspace_style pppm <kspace_style>` command. The :doc:`Run output <Run_output>`
+doc page gives more details.  A more detailed (and time consuming)
+report of the FFT performance is generated with the
+:doc:`kspace_modify fftbench yes <kspace_modify>` command.

 FFTW is a fast, portable FFT library that should also work on any
 platform and can be faster than the KISS FFT library.  You can
@ -166,7 +182,7 @@ When using -DFFT\_SINGLE with FFTW3 you may need to build the FFTW
 library a second time with support for single-precision.

 For FFTW3, do the following, which should produce the additional
-library libfftw3f.a
+library libfftw3f.a or libfftw3f.so.


 .. parsed-literal::
--- a/doc/src/kspace_style.rst
+++ b/doc/src/kspace_style.rst
@ -404,9 +404,9 @@ calculation can be performed concurrently on the GPU while other
 calculations for non-bonded and bonded force calculation are performed
 on the CPU.

-The *pppm/kk* style also performs charge assignment and force
-interpolation calculations on the GPU while the FFTs themselves are
-calculated on the CPU in non-threaded mode.
+The *pppm/kk* style performs charge assignment and force interpolation
+calculations, along with the FFTs themselves, on the GPU or (optionally) threaded
+on the CPU when using OpenMP and FFTW3.

 These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
 USER-OMP, and OPT packages respectively.  They are only enabled if
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -97,6 +97,9 @@ action dihedral_opls_kokkos.cpp dihedral_opls.cpp
 action dihedral_opls_kokkos.h dihedral_opls.h
 action domain_kokkos.cpp
 action domain_kokkos.h
+action fftdata_kokkos.h fft3d.h
+action fft3d_kokkos.cpp fft3d.cpp
+action fft3d_kokkos.h fft3d.h
 action fix_deform_kokkos.cpp
 action fix_deform_kokkos.h
 action fix_enforce2d_kokkos.cpp
@ -153,6 +156,7 @@ action improper_class2_kokkos.cpp improper_class2.cpp
 action improper_class2_kokkos.h improper_class2.h
 action improper_harmonic_kokkos.cpp improper_harmonic.cpp
 action improper_harmonic_kokkos.h improper_harmonic.h
+action kissfft_kokkos.h kissfft.h
 action kokkos.cpp
 action kokkos.h
 action kokkos_base.h
@ -189,6 +193,7 @@ action min_kokkos.cpp
 action min_kokkos.h
 action min_linesearch_kokkos.cpp
 action min_linesearch_kokkos.h
+action pack_kokkos.h pack.h
 action pair_buck_coul_cut_kokkos.cpp
 action pair_buck_coul_cut_kokkos.h
 action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp
@ -285,6 +290,8 @@ action rand_pool_wrap_kokkos.cpp
 action rand_pool_wrap_kokkos.h
 action region_block_kokkos.cpp
 action region_block_kokkos.h
+action remap_kokkos.cpp remap.cpp
+action remap_kokkos.h remap.h
 action sna_kokkos.h sna.h
 action sna_kokkos_impl.h sna.cpp
 action verlet_kokkos.cpp
--- a/src/KOKKOS/fft3d_kokkos.cpp
+++ b/src/KOKKOS/fft3d_kokkos.cpp
@ -0,0 +1,905 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stan Moore (SNL), Sam Mish (U.C. Davis)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "fft3d_kokkos.h"
+#include "remap_kokkos.h"
+#include "kokkos_type.h"
+#include "error.h"
+#include "kokkos.h"
+
+
+using namespace LAMMPS_NS;
+
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
+             int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+             int in_klo, int in_khi,
+             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+             int out_klo, int out_khi,
+             int scaled, int permute, int *nbuf, int usecollective) :
+  Pointers(lmp)
+{
+  int nthreads = lmp->kokkos->nthreads;
+  int ngpus = lmp->kokkos->ngpus;
+  ExecutionSpace execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+#if defined(FFT_MKL)
+  if (ngpus > 0 && execution_space == Device)
+    lmp->error->all(FLERR,"Cannot use the MKL library with Kokkos CUDA on GPUs");
+#elif defined(FFT_FFTW3)
+  if (ngpus > 0 && execution_space == Device)
+    lmp->error->all(FLERR,"Cannot use the FFTW library with Kokkos CUDA on GPUs");
+#elif defined(FFT_CUFFT)
+  if (ngpus > 0 && execution_space == Host)
+    lmp->error->all(FLERR,"Cannot use the cuFFT library with Kokkos CUDA on the host CPUs");
+#elif defined(FFT_KISSFFT)
+  // The compiler can't statically determine the stack size needed for
+  //  recursive function calls in KISS FFT and the default per-thread
+  //  stack size on GPUs needs to be increased to prevent stack overflows
+  //  for reasonably sized FFTs
+  #if defined (KOKKOS_ENABLE_CUDA)
+    size_t stack_size;
+    cudaDeviceGetLimit(&stack_size,cudaLimitStackSize);
+    if (stack_size < 2048)
+      cudaDeviceSetLimit(cudaLimitStackSize,2048);
+  #endif
+#endif
+
+  plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
+                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
+                            scaled,permute,nbuf,usecollective,nthreads);
+  if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FFT3dKokkos<DeviceType>::~FFT3dKokkos()
+{
+  fft_3d_destroy_plan_kokkos(plan);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::compute(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, int flag)
+{
+  typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size()/2);
+  typename AT::t_FFT_DATA_1d d_out_data(d_out.data(),d_out.size()/2);
+
+  fft_3d_kokkos(d_in_data,d_out_data,flag,plan);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::timing1d(typename AT::t_FFT_SCALAR_1d d_in, int nsize, int flag)
+{
+  typename AT::t_FFT_DATA_1d d_in_data(d_in.data(),d_in.size());
+
+  fft_3d_1d_only_kokkos(d_in_data,nsize,flag,plan);
+}
+
+/* ----------------------------------------------------------------------
+   Data layout for 3d FFTs:
+
+   data set of Nfast x Nmid x Nslow elements is owned by P procs
+   on input, each proc owns a subsection of the elements
+   on output, each proc will own a (possibly different) subsection
+   my subsection must not overlap with any other proc's subsection,
+     i.e. the union of all proc's input (or output) subsections must
+     exactly tile the global Nfast x Nmid x Nslow data set
+   when called from C, all subsection indices are
+     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
+   when called from F77, all subsection indices are
+     F77-style from 1 to N where N = Nfast or Nmid or Nslow
+   a proc can own 0 elements on input or output
+     by specifying hi index < lo index
+   on both input and output, data is stored contiguously on a processor
+     with a fast-varying, mid-varying, and slow-varying index
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Perform 3d FFT
+
+   Arguments:
+   in           starting address of input data on this proc
+   out          starting address of where output data for this proc
+                  will be placed (can be same as in)
+   flag         1 for forward FFT, -1 for inverse FFT
+   plan         plan returned by previous call to fft_3d_create_plan
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct norm_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_DATA_1d_um d_out;
+  int norm;
+
+  norm_functor(typename AT::t_FFT_DATA_1d &d_out_, int norm_):
+    d_out(d_out_),norm(norm_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+#if defined(FFT_FFTW3) || defined(FFT_CUFFT)
+    FFT_SCALAR* out_ptr = (FFT_SCALAR *)(d_out.data()+i);
+    *(out_ptr++) *= norm;
+    *(out_ptr++) *= norm;
+#elif defined(FFT_MKL)
+    d_out(i) *= norm;
+#else
+    d_out(i,0) *= norm;
+    d_out(i,1) *= norm;
+#endif
+  }
+};
+
+#ifdef FFT_KISSFFT
+template<class DeviceType>
+struct kiss_fft_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_DATA_1d_um d_data,d_tmp;
+  kiss_fft_state_kokkos<DeviceType> st;
+  int length;
+
+  kiss_fft_functor() {}
+
+  kiss_fft_functor(typename AT::t_FFT_DATA_1d &d_data_,typename AT::t_FFT_DATA_1d &d_tmp_, kiss_fft_state_kokkos<DeviceType> &st_, int length_):
+    d_data(d_data_),
+    d_tmp(d_tmp_),
+    st(st_)
+    {
+      length = length_;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    const int offset = i*length;
+    KissFFTKokkos<DeviceType>::kiss_fft_kokkos(st,d_data,d_tmp,offset);
+  }
+};
+#endif
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename AT::t_FFT_DATA_1d d_in, typename AT::t_FFT_DATA_1d d_out, int flag, struct fft_plan_3d_kokkos<DeviceType> *plan)
+{
+  int total,length;
+  typename AT::t_FFT_DATA_1d d_data,d_copy;
+  typename AT::t_FFT_SCALAR_1d d_in_scalar,d_data_scalar,d_out_scalar,d_copy_scalar,d_scratch_scalar;
+
+  // pre-remap to prepare for 1st FFTs if needed
+  // copy = loc for remap result
+
+  if (plan->pre_plan) {
+    if (plan->pre_target == 0) d_copy = d_out;
+    else d_copy = plan->d_copy;
+
+     d_in_scalar = typename AT::t_FFT_SCALAR_1d(d_in.data(),d_in.size());
+     d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
+     d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
+
+    remapKK->remap_3d_kokkos(d_in_scalar, d_copy_scalar,
+             d_scratch_scalar, plan->pre_plan);
+
+    d_data = d_copy;
+  } else d_data = d_in;
+
+  // 1d FFTs along fast axis
+
+  total = plan->total1;
+  length = plan->length1;
+
+  #if defined(FFT_MKL)
+    if (flag == -1)
+      DftiComputeForward(plan->handle_fast,(FFT_DATA *)d_data.data());
+    else
+      DftiComputeBackward(plan->handle_fast,(FFT_DATA *)d_data.data());
+  #elif defined(FFT_FFTW3)
+    if (flag == -1)
+      FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+    else
+      FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+  #elif defined(FFT_CUFFT)
+    cufftExec(plan->plan_fast,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
+  #else
+    typename AT::t_FFT_DATA_1d d_tmp =
+     typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
+    kiss_fft_functor<DeviceType> f;
+    if (flag == -1)
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length);
+    else
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length);
+    Kokkos::parallel_for(total/length,f);
+    d_data = d_tmp;
+    d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
+  #endif
+
+
+  // 1st mid-remap to prepare for 2nd FFTs
+  // copy = loc for remap result
+
+  if (plan->mid1_target == 0) d_copy = d_out;
+  else d_copy = plan->d_copy;
+
+  d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size()*2);
+  d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size()*2);
+  d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size()*2);
+
+  remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
+           d_scratch_scalar, plan->mid1_plan);
+
+  d_data = d_copy;
+
+  // 1d FFTs along mid axis
+
+  total = plan->total2;
+  length = plan->length2;
+
+  #if defined(FFT_MKL)
+    if (flag == -1)
+      DftiComputeForward(plan->handle_mid,(FFT_DATA *)d_data.data());
+    else
+      DftiComputeBackward(plan->handle_mid,(FFT_DATA *)d_data.data());
+  #elif defined(FFT_FFTW3)
+    if (flag == -1)
+      FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+    else
+      FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+  #elif defined(FFT_CUFFT)
+    cufftExec(plan->plan_mid,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
+  #else
+    if (flag == -1)
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length);
+    else
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length);
+    Kokkos::parallel_for(total/length,f);
+    d_data = d_tmp;
+    d_tmp = typename AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.dimension_0());
+  #endif
+
+  // 2nd mid-remap to prepare for 3rd FFTs
+  // copy = loc for remap result
+
+  if (plan->mid2_target == 0) d_copy = d_out;
+  else d_copy = plan->d_copy;
+
+  d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
+  d_copy_scalar = typename AT::t_FFT_SCALAR_1d(d_copy.data(),d_copy.size());
+  d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
+
+  remapKK->remap_3d_kokkos(d_data_scalar, d_copy_scalar,
+           d_scratch_scalar, plan->mid2_plan);
+
+  d_data = d_copy;
+
+  // 1d FFTs along slow axis
+
+  total = plan->total3;
+  length = plan->length3;
+
+  #if defined(FFT_MKL)
+    if (flag == -1)
+      DftiComputeForward(plan->handle_slow,(FFT_DATA *)d_data.data());
+    else
+      DftiComputeBackward(plan->handle_slow,(FFT_DATA *)d_data.data());
+  #elif defined(FFT_FFTW3)
+    if (flag == -1)
+      FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+    else
+      FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data());
+  #elif defined(FFT_CUFFT)
+    cufftExec(plan->plan_slow,(FFT_DATA *)d_data.data(),(FFT_DATA *)d_data.data(),flag);
+  #else
+    if (flag == -1)
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length);
+    else
+      f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length);
+    Kokkos::parallel_for(total/length,f);
+    d_data = d_tmp;
+  #endif
+
+
+  // post-remap to put data in output format if needed
+  // destination is always out
+
+  if (plan->post_plan) {
+    d_data_scalar = typename AT::t_FFT_SCALAR_1d(d_data.data(),d_data.size());
+    d_out_scalar = typename AT::t_FFT_SCALAR_1d(d_out.data(),d_out.size());
+    d_scratch_scalar = typename AT::t_FFT_SCALAR_1d(plan->d_scratch.data(),plan->d_scratch.size());
+
+    remapKK->remap_3d_kokkos(d_data_scalar, d_out_scalar,
+             d_scratch_scalar, plan->post_plan);
+    }
+
+  // scaling if required
+
+  if (flag == 1 && plan->scaled) {
+    FFT_SCALAR norm = plan->norm;
+    int num = plan->normnum;
+
+    norm_functor<DeviceType> f(d_out,norm);
+    Kokkos::parallel_for(num,f);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Create plan for performing a 3d FFT
+
+   Arguments:
+   comm                 MPI communicator for the P procs which own the d_data
+   nfast,nmid,nslow     size of global 3d matrix
+   in_ilo,in_ihi        input bounds of data I own in fast index
+   in_jlo,in_jhi        input bounds of data I own in mid index
+   in_klo,in_khi        input bounds of data I own in slow index
+   out_ilo,out_ihi      output bounds of data I own in fast index
+   out_jlo,out_jhi      output bounds of data I own in mid index
+   out_klo,out_khi      output bounds of data I own in slow index
+   scaled               0 = no scaling of result, 1 = scaling
+   permute              permutation in storage order of indices on output
+                          0 = no permutation
+                          1 = permute once = mid->fast, slow->mid, fast->slow
+                          2 = permute twice = slow->fast, fast->mid, mid->slow
+   nbuf                 returns size of internal storage buffers used by FFT
+   usecollective        use collective MPI operations for remapping data
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_plan_kokkos (
+       MPI_Comm comm, int nfast, int nmid, int nslow,
+       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+       int in_klo, int in_khi,
+       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+       int out_klo, int out_khi,
+       int scaled, int permute, int *nbuf, int usecollective,
+       int nthreads)
+{
+  struct fft_plan_3d_kokkos<DeviceType> *plan;
+  int me,nprocs;
+  int flag,remapflag;
+  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
+  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
+  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
+  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
+  int np1,np2,ip1,ip2;
+
+  // query MPI info
+
+  MPI_Comm_rank(comm,&me);
+  MPI_Comm_size(comm,&nprocs);
+
+  // compute division of procs in 2 dimensions not on-processor
+
+  bifactor(nprocs,&np1,&np2);
+  ip1 = me % np1;
+  ip2 = me/np1;
+
+  // allocate memory for plan data struct
+
+  plan = new struct fft_plan_3d_kokkos<DeviceType>;
+  remapKK = new RemapKokkos<DeviceType>(lmp);
+  if (plan == NULL) return NULL;
+
+  // remap from initial distribution to layout needed for 1st set of 1d FFTs
+  // not needed if all procs own entire fast axis initially
+  // first indices = distribution after 1st set of FFTs
+
+
+  if (in_ilo == 0 && in_ihi == nfast-1) flag = 0;
+  else flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    first_ilo = in_ilo;
+    first_ihi = in_ihi;
+    first_jlo = in_jlo;
+    first_jhi = in_jhi;
+    first_klo = in_klo;
+    first_khi = in_khi;
+    plan->pre_plan = NULL;
+  }
+  else {
+    first_ilo = 0;
+    first_ihi = nfast - 1;
+    first_jlo = ip1*nmid/np1;
+    first_jhi = (ip1+1)*nmid/np1 - 1;
+    first_klo = ip2*nslow/np2;
+    first_khi = (ip2+1)*nslow/np2 - 1;
+    plan->pre_plan =
+      remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                           first_ilo,first_ihi,first_jlo,first_jhi,
+                           first_klo,first_khi,2,0,0,FFT_PRECISION,0);
+    if (plan->pre_plan == NULL) return NULL;
+  }
+
+  // 1d FFTs along fast axis
+
+  plan->length1 = nfast;
+  plan->total1 = nfast * (first_jhi-first_jlo+1) * (first_khi-first_klo+1);
+
+  // remap from 1st to 2nd FFT
+  // choose which axis is split over np1 vs np2 to minimize communication
+  // second indices = distribution after 2nd set of FFTs
+
+  second_ilo = ip1*nfast/np1;
+  second_ihi = (ip1+1)*nfast/np1 - 1;
+  second_jlo = 0;
+  second_jhi = nmid - 1;
+  second_klo = ip2*nslow/np2;
+  second_khi = (ip2+1)*nslow/np2 - 1;
+  plan->mid1_plan =
+      remapKK->remap_3d_create_plan_kokkos(comm,
+                           first_ilo,first_ihi,first_jlo,first_jhi,
+                           first_klo,first_khi,
+                           second_ilo,second_ihi,second_jlo,second_jhi,
+                           second_klo,second_khi,2,1,0,FFT_PRECISION,
+                           usecollective);
+  if (plan->mid1_plan == NULL) return NULL;
+
+  // 1d FFTs along mid axis
+
+  plan->length2 = nmid;
+  plan->total2 = (second_ihi-second_ilo+1) * nmid * (second_khi-second_klo+1);
+
+  // remap from 2nd to 3rd FFT
+  // if final distribution is permute=2 with all procs owning entire slow axis
+  //   then this remapping goes directly to final distribution
+  //  third indices = distribution after 3rd set of FFTs
+
+  if (permute == 2 && out_klo == 0 && out_khi == nslow-1) flag = 0;
+  else flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    third_ilo = out_ilo;
+    third_ihi = out_ihi;
+    third_jlo = out_jlo;
+    third_jhi = out_jhi;
+    third_klo = out_klo;
+    third_khi = out_khi;
+  }
+  else {
+    third_ilo = ip1*nfast/np1;
+    third_ihi = (ip1+1)*nfast/np1 - 1;
+    third_jlo = ip2*nmid/np2;
+    third_jhi = (ip2+1)*nmid/np2 - 1;
+    third_klo = 0;
+    third_khi = nslow - 1;
+  }
+
+  plan->mid2_plan =
+    remapKK->remap_3d_create_plan_kokkos(comm,
+                         second_jlo,second_jhi,second_klo,second_khi,
+                         second_ilo,second_ihi,
+                         third_jlo,third_jhi,third_klo,third_khi,
+                         third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
+  if (plan->mid2_plan == NULL) return NULL;
+
+  // 1d FFTs along slow axis
+
+  plan->length3 = nslow;
+  plan->total3 = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) * nslow;
+
+  // remap from 3rd FFT to final distribution
+  //  not needed if permute = 2 and third indices = out indices on all procs
+
+  if (permute == 2 &&
+      out_ilo == third_ilo && out_ihi == third_ihi &&
+      out_jlo == third_jlo && out_jhi == third_jhi &&
+      out_klo == third_klo && out_khi == third_khi) flag = 0;
+  else flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0)
+    plan->post_plan = NULL;
+  else {
+    plan->post_plan =
+      remapKK->remap_3d_create_plan_kokkos(comm,
+                           third_klo,third_khi,third_ilo,third_ihi,
+                           third_jlo,third_jhi,
+                           out_klo,out_khi,out_ilo,out_ihi,
+                           out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
+    if (plan->post_plan == NULL) return NULL;
+  }
+
+  // configure plan memory pointers and allocate work space
+  // out_size = amount of memory given to FFT by user
+  // first/second/third_size = amount of memory needed after
+  //                           pre,mid1,mid2 remaps
+  // copy_size = amount needed internally for extra copy of data
+  // scratch_size = amount needed internally for remap scratch space
+  // for each remap:
+  //   out space used for result if big enough, else require copy buffer
+  //   accumulate largest required remap scratch space
+
+  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
+  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
+    (first_khi-first_klo+1);
+  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
+    (second_khi-second_klo+1);
+  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
+    (third_khi-third_klo+1);
+
+  copy_size = 0;
+  scratch_size = 0;
+
+  if (plan->pre_plan) {
+    if (first_size <= out_size)
+      plan->pre_target = 0;
+    else {
+      plan->pre_target = 1;
+      copy_size = MAX(copy_size,first_size);
+    }
+    scratch_size = MAX(scratch_size,first_size);
+  }
+
+  if (plan->mid1_plan) {
+    if (second_size <= out_size)
+      plan->mid1_target = 0;
+    else {
+      plan->mid1_target = 1;
+      copy_size = MAX(copy_size,second_size);
+    }
+    scratch_size = MAX(scratch_size,second_size);
+  }
+
+  if (plan->mid2_plan) {
+    if (third_size <= out_size)
+      plan->mid2_target = 0;
+    else {
+      plan->mid2_target = 1;
+      copy_size = MAX(copy_size,third_size);
+    }
+    scratch_size = MAX(scratch_size,third_size);
+  }
+
+  if (plan->post_plan)
+    scratch_size = MAX(scratch_size,out_size);
+
+  *nbuf = copy_size + scratch_size;
+
+  if (copy_size) {
+    plan->d_copy = typename AT::t_FFT_DATA_1d("fft3d:copy",copy_size);
+  }
+
+  if (scratch_size) {
+    plan->d_scratch = typename AT::t_FFT_DATA_1d("fft3d:scratch",scratch_size);
+  }
+
+  // system specific pre-computation of 1d FFT coeffs
+  // and scaling normalization
+
+#if defined(FFT_MKL)
+  DftiCreateDescriptor( &(plan->handle_fast), FFT_MKL_PREC, DFTI_COMPLEX, 1,
+                        (MKL_LONG)nfast);
+  DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_TRANSFORMS,
+               (MKL_LONG)plan->total1/nfast);
+  DftiSetValue(plan->handle_fast, DFTI_PLACEMENT,DFTI_INPLACE);
+  DftiSetValue(plan->handle_fast, DFTI_INPUT_DISTANCE, (MKL_LONG)nfast);
+  DftiSetValue(plan->handle_fast, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nfast);
+  //DftiSetValue(plan->handle_fast, DFTI_NUMBER_OF_USER_THREADS, nthreads);
+  DftiCommitDescriptor(plan->handle_fast);
+
+  DftiCreateDescriptor( &(plan->handle_mid), FFT_MKL_PREC, DFTI_COMPLEX, 1,
+                        (MKL_LONG)nmid);
+  DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_TRANSFORMS,
+               (MKL_LONG)plan->total2/nmid);
+  DftiSetValue(plan->handle_mid, DFTI_PLACEMENT,DFTI_INPLACE);
+  DftiSetValue(plan->handle_mid, DFTI_INPUT_DISTANCE, (MKL_LONG)nmid);
+  DftiSetValue(plan->handle_mid, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nmid);
+  //DftiSetValue(plan->handle_mid, DFTI_NUMBER_OF_USER_THREADS, nthreads);
+  DftiCommitDescriptor(plan->handle_mid);
+
+  DftiCreateDescriptor( &(plan->handle_slow), FFT_MKL_PREC, DFTI_COMPLEX, 1,
+                        (MKL_LONG)nslow);
+  DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_TRANSFORMS,
+               (MKL_LONG)plan->total3/nslow);
+  DftiSetValue(plan->handle_slow, DFTI_PLACEMENT,DFTI_INPLACE);
+  DftiSetValue(plan->handle_slow, DFTI_INPUT_DISTANCE, (MKL_LONG)nslow);
+  DftiSetValue(plan->handle_slow, DFTI_OUTPUT_DISTANCE, (MKL_LONG)nslow);
+  //DftiSetValue(plan->handle_slow, DFTI_NUMBER_OF_USER_THREADS, nthreads);
+  DftiCommitDescriptor(plan->handle_slow);
+
+  if (scaled == 0)
+    plan->scaled = 0;
+  else {
+    plan->scaled = 1;
+    plan->norm = 1.0/(nfast*nmid*nslow);
+    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+      (out_khi-out_klo+1);
+  }
+
+#elif defined(FFT_FFTW3)
+
+#if defined (FFT_FFTW_THREADS)
+  if (nthreads > 1) {
+    FFTW_API(init_threads)();
+    FFTW_API(plan_with_nthreads)(nthreads);
+  }
+#endif
+
+  plan->plan_fast_forward =
+    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
+                       NULL,&nfast,1,plan->length1,
+                       NULL,&nfast,1,plan->length1,
+                       FFTW_FORWARD,FFTW_ESTIMATE);
+
+  plan->plan_fast_backward =
+    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
+                       NULL,&nfast,1,plan->length1,
+                       NULL,&nfast,1,plan->length1,
+                       FFTW_BACKWARD,FFTW_ESTIMATE);
+
+  plan->plan_mid_forward =
+    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
+                       NULL,&nmid,1,plan->length2,
+                       NULL,&nmid,1,plan->length2,
+                       FFTW_FORWARD,FFTW_ESTIMATE);
+
+  plan->plan_mid_backward =
+    FFTW_API(plan_many_dft)(1, &nmid,plan->total2/plan->length2,
+                       NULL,&nmid,1,plan->length2,
+                       NULL,&nmid,1,plan->length2,
+                       FFTW_BACKWARD,FFTW_ESTIMATE);
+
+
+  plan->plan_slow_forward =
+    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
+                       NULL,&nslow,1,plan->length3,
+                       NULL,&nslow,1,plan->length3,
+                       FFTW_FORWARD,FFTW_ESTIMATE);
+
+  plan->plan_slow_backward =
+    FFTW_API(plan_many_dft)(1, &nslow,plan->total3/plan->length3,
+                       NULL,&nslow,1,plan->length3,
+                       NULL,&nslow,1,plan->length3,
+                       FFTW_BACKWARD,FFTW_ESTIMATE);
+#elif defined(FFT_CUFFT)
+  cufftPlanMany(&(plan->plan_fast), 1, &nfast,
+    &nfast,1,plan->length1,
+    &nfast,1,plan->length1,
+    CUFFT_TYPE,plan->total1/plan->length1);
+
+  cufftPlanMany(&(plan->plan_mid), 1, &nmid,
+    &nmid,1,plan->length2,
+    &nmid,1,plan->length2,
+    CUFFT_TYPE,plan->total2/plan->length2);
+
+  cufftPlanMany(&(plan->plan_slow), 1, &nslow,
+    &nslow,1,plan->length3,
+    &nslow,1,plan->length3,
+    CUFFT_TYPE,plan->total3/plan->length3);
+#else
+  kissfftKK = new KissFFTKokkos<DeviceType>();
+
+  plan->cfg_fast_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,0,NULL,NULL);
+  plan->cfg_fast_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nfast,1,NULL,NULL);
+
+  if (nmid == nfast) {
+    plan->cfg_mid_forward = plan->cfg_fast_forward;
+    plan->cfg_mid_backward = plan->cfg_fast_backward;
+  }
+  else {
+    plan->cfg_mid_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,0,NULL,NULL);
+    plan->cfg_mid_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nmid,1,NULL,NULL);
+  }
+
+  if (nslow == nfast) {
+    plan->cfg_slow_forward = plan->cfg_fast_forward;
+    plan->cfg_slow_backward = plan->cfg_fast_backward;
+  }
+  else if (nslow == nmid) {
+    plan->cfg_slow_forward = plan->cfg_mid_forward;
+    plan->cfg_slow_backward = plan->cfg_mid_backward;
+  }
+  else {
+    plan->cfg_slow_forward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,0,NULL,NULL);
+    plan->cfg_slow_backward = KissFFTKokkos<DeviceType>::kiss_fft_alloc_kokkos(nslow,1,NULL,NULL);
+  }
+#endif
+
+  if (scaled == 0)
+    plan->scaled = 0;
+  else {
+    plan->scaled = 1;
+    plan->norm = 1.0/(nfast*nmid*nslow);
+    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+      (out_khi-out_klo+1);
+  }
+
+  return plan;
+}
+
+/* ----------------------------------------------------------------------
+   Destroy a 3d fft plan
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *plan)
+{
+  if (plan->pre_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->pre_plan);
+  if (plan->mid1_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid1_plan);
+  if (plan->mid2_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->mid2_plan);
+  if (plan->post_plan) remapKK->remap_3d_destroy_plan_kokkos(plan->post_plan);
+
+#if defined(FFT_MKL)
+  DftiFreeDescriptor(&(plan->handle_fast));
+  DftiFreeDescriptor(&(plan->handle_mid));
+  DftiFreeDescriptor(&(plan->handle_slow));
+#elif defined(FFT_FFTW3)
+  FFTW_API(destroy_plan)(plan->plan_slow_forward);
+  FFTW_API(destroy_plan)(plan->plan_slow_backward);
+  FFTW_API(destroy_plan)(plan->plan_mid_forward);
+  FFTW_API(destroy_plan)(plan->plan_mid_backward);
+  FFTW_API(destroy_plan)(plan->plan_fast_forward);
+  FFTW_API(destroy_plan)(plan->plan_fast_backward);
+
+#if defined (FFT_FFTW_THREADS)
+  FFTW_API(cleanup_threads)();
+#endif
+
+#elif defined (FFT_KISSFFT)
+  delete kissfftKK;
+#endif
+
+  delete plan;
+  delete remapKK;
+}
+
+/* ----------------------------------------------------------------------
+   divide n into 2 factors of as equal size as possible
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::bifactor(int n, int *factor1, int *factor2)
+{
+  int n1,n2,facmax;
+
+  facmax = static_cast<int> (sqrt((double) n));
+
+  for (n1 = facmax; n1 > 0; n1--) {
+    n2 = n/n1;
+    if (n1*n2 == n) {
+      *factor1 = n1;
+      *factor2 = n2;
+      return;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform just the 1d FFTs needed by a 3d FFT, no data movement
+   used for timing purposes
+
+   Arguments:
+   in           starting address of input data on this proc, all set to 0.0
+   nsize        size of in
+   flag         1 for forward FFT, -1 for inverse FFT
+   plan         plan returned by previous call to fft_3d_create_plan
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d d_data, int nsize, int flag,
+                    struct fft_plan_3d_kokkos<DeviceType> *plan)
+{
+  // total = size of data needed in each dim
+  // length = length of 1d FFT in each dim
+  // total/length = # of 1d FFTs in each dim
+  // if total > nsize, limit # of 1d FFTs to available size of data
+
+  int total1 = plan->total1;
+  int length1 = plan->length1;
+  int total2 = plan->total2;
+  int length2 = plan->length2;
+  int total3 = plan->total3;
+  int length3 = plan->length3;
+
+  // fftw3 and Dfti in MKL encode the number of transforms
+  // into the plan, so we cannot operate on a smaller data set
+#if defined(FFT_MKL) || defined(FFT_FFTW3)
+  if ((total1 > nsize) || (total2 > nsize) || (total3 > nsize))
+    return;
+#endif
+  if (total1 > nsize) total1 = (nsize/length1) * length1;
+  if (total2 > nsize) total2 = (nsize/length2) * length2;
+  if (total3 > nsize) total3 = (nsize/length3) * length3;
+
+  // perform 1d FFTs in each of 3 dimensions
+  // data is just an array of 0.0
+
+#if defined(FFT_MKL)
+  if (flag == -1) {
+    DftiComputeForward(plan->handle_fast,data);
+    DftiComputeForward(plan->handle_mid,data);
+    DftiComputeForward(plan->handle_slow,data);
+  } else {
+    DftiComputeBackward(plan->handle_fast,data);
+    DftiComputeBackward(plan->handle_mid,data);
+    DftiComputeBackward(plan->handle_slow,data);
+  }
+#elif defined(FFT_FFTW3)
+  if (flag == -1) {
+    FFTW_API(execute_dft)(plan->plan_fast_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+    FFTW_API(execute_dft)(plan->plan_mid_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+    FFTW_API(execute_dft)(plan->plan_slow_forward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+  } else {
+    FFTW_API(execute_dft)(plan->plan_fast_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+    FFTW_API(execute_dft)(plan->plan_mid_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+    FFTW_API(execute_dft)(plan->plan_slow_backward,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data());
+  }
+#elif defined(FFT_CUFFT)
+  cufftExec(plan->plan_fast,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
+  cufftExec(plan->plan_mid,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
+  cufftExec(plan->plan_slow,(FFT_DATA*)d_data.data(),(FFT_DATA*)d_data.data(),flag);
+#else
+  kiss_fft_functor<DeviceType> f;
+  typename AT::t_FFT_DATA_1d d_tmp = typename AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.dimension_0());
+  if (flag == -1) {
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length1);
+    Kokkos::parallel_for(total1/length1,f);
+
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length2);
+    Kokkos::parallel_for(total2/length2,f);
+
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length3);
+    Kokkos::parallel_for(total3/length3,f);
+  } else {
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length1);
+    Kokkos::parallel_for(total1/length1,f);
+
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length2);
+    Kokkos::parallel_for(total2/length2,f);
+
+    f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_backward,length3);
+    Kokkos::parallel_for(total3/length3,f);
+  }
+#endif
+
+  // scaling if required
+  // limit num to size of data
+
+  if (flag == 1 && plan->scaled) {
+    FFT_SCALAR norm = plan->norm;
+    int num = MIN(plan->normnum,nsize);
+
+    norm_functor<DeviceType> f(d_data,norm);
+    Kokkos::parallel_for(num,f);
+  }
+}
+
+namespace LAMMPS_NS {
+template class FFT3dKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FFT3dKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/fft3d_kokkos.h
+++ b/src/KOKKOS/fft3d_kokkos.h
@ -0,0 +1,198 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_FFT3D_KOKKOS_H
+#define LMP_FFT3D_KOKKOS_H
+
+#include "pointers.h"
+#include "kokkos_type.h"
+#include "remap_kokkos.h"
+#include "fftdata_kokkos.h"
+
+// with KOKKOS in CUDA mode we can only have
+// CUFFT or KISSFFT, thus undefine all other
+// FFTs here, since they may be valid in fft3d.cpp
+
+#if defined(KOKKOS_ENABLE_CUDA)
+# if defined(FFT_FFTW)
+#  undef FFT_FFTW
+# endif
+# if defined(FFT_FFTW3)
+#  undef FFT_FFTW3
+# endif
+# if defined(FFT_MKL)
+#  undef FFT_MKL
+# endif
+# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
+#  define FFT_KISSFFT
+# endif
+#else
+# if defined(FFT_CUFFT)
+#  error "Must enable CUDA with KOKKOS to use -DFFT_CUFFT"
+# endif
+// if user set FFTW, it means FFTW3
+# ifdef FFT_FFTW
+#  define FFT_FFTW3
+# endif
+# ifdef FFT_FFTW_THREADS
+#  if !defined(FFT_FFTW3)
+#   error "Must use -DFFT_FFTW3 with -DFFT_FFTW_THREADS"
+#  endif
+# endif
+#endif
+
+#if defined(FFT_MKL)
+  #include "mkl_dfti.h"
+  #if defined(FFT_SINGLE)
+    typedef float _Complex FFT_DATA;
+    #define FFT_MKL_PREC DFTI_SINGLE
+  #else
+    typedef double _Complex FFT_DATA;
+    #define FFT_MKL_PREC DFTI_DOUBLE
+  #endif
+#elif defined(FFT_FFTW3)
+  #include "fftw3.h"
+  #if defined(FFT_SINGLE)
+    typedef fftwf_complex FFT_DATA;
+    #define FFTW_API(function)  fftwf_ ## function
+  #else
+    typedef fftw_complex FFT_DATA;
+    #define FFTW_API(function) fftw_ ## function
+  #endif
+#elif defined(FFT_CUFFT)
+  #include "cufft.h"
+  #if defined(FFT_SINGLE)
+    #define cufftExec cufftExecC2C
+    #define CUFFT_TYPE CUFFT_C2C
+    typedef cufftComplex FFT_DATA;
+  #else
+    #define cufftExec cufftExecZ2Z
+    #define CUFFT_TYPE CUFFT_Z2Z
+    typedef cufftDoubleComplex FFT_DATA;
+  #endif
+#else
+  #include "kissfft_kokkos.h"
+  #ifndef FFT_KISSFFT
+  #define FFT_KISSFFT
+  #endif
+#endif
+
+namespace LAMMPS_NS {
+
+// -------------------------------------------------------------------------
+
+// plan for how to perform a 3d FFT
+
+template<class DeviceType>
+struct fft_plan_3d_kokkos {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  struct remap_plan_3d_kokkos<DeviceType> *pre_plan;       // remap from input -> 1st FFTs
+  struct remap_plan_3d_kokkos<DeviceType> *mid1_plan;      // remap from 1st -> 2nd FFTs
+  struct remap_plan_3d_kokkos<DeviceType> *mid2_plan;      // remap from 2nd -> 3rd FFTs
+  struct remap_plan_3d_kokkos<DeviceType> *post_plan;      // remap from 3rd FFTs -> output
+  typename AT::t_FFT_DATA_1d d_copy;                   // memory for remap results (if needed)
+  typename AT::t_FFT_DATA_1d d_scratch;                // scratch space for remaps
+  int total1,total2,total3;         // # of 1st,2nd,3rd FFTs (times length)
+  int length1,length2,length3;      // length of 1st,2nd,3rd FFTs
+  int pre_target;                   // where to put remap results
+  int mid1_target,mid2_target;
+  int scaled;                       // whether to scale FFT results
+  int normnum;                      // # of values to rescale
+  double norm;                      // normalization factor for rescaling
+
+                                    // system specific 1d FFT info
+#if defined(FFT_MKL)
+  DFTI_DESCRIPTOR *handle_fast;
+  DFTI_DESCRIPTOR *handle_mid;
+  DFTI_DESCRIPTOR *handle_slow;
+#elif defined(FFT_FFTW3)
+  FFTW_API(plan) plan_fast_forward;
+  FFTW_API(plan) plan_fast_backward;
+  FFTW_API(plan) plan_mid_forward;
+  FFTW_API(plan) plan_mid_backward;
+  FFTW_API(plan) plan_slow_forward;
+  FFTW_API(plan) plan_slow_backward;
+#elif defined(FFT_CUFFT)
+  cufftHandle plan_fast;
+  cufftHandle plan_mid;
+  cufftHandle plan_slow;
+#else
+  kiss_fft_state_kokkos<DeviceType> cfg_fast_forward;
+  kiss_fft_state_kokkos<DeviceType> cfg_fast_backward;
+  kiss_fft_state_kokkos<DeviceType> cfg_mid_forward;
+  kiss_fft_state_kokkos<DeviceType> cfg_mid_backward;
+  kiss_fft_state_kokkos<DeviceType> cfg_slow_forward;
+  kiss_fft_state_kokkos<DeviceType> cfg_slow_backward;
+#endif
+};
+
+template<class DeviceType>
+class FFT3dKokkos : protected Pointers {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  FFT3dKokkos(class LAMMPS *, MPI_Comm,
+        int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
+        int,int,int *,int);
+  ~FFT3dKokkos();
+  void compute(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, int);
+  void timing1d(typename AT::t_FFT_SCALAR_1d, int, int);
+
+ private:
+  struct fft_plan_3d_kokkos<DeviceType> *plan;
+  RemapKokkos<DeviceType> *remapKK;
+
+#ifdef FFT_KISSFFT
+  KissFFTKokkos<DeviceType> *kissfftKK;
+#endif
+
+  void fft_3d_kokkos(typename AT::t_FFT_DATA_1d, typename AT::t_FFT_DATA_1d, int, struct fft_plan_3d_kokkos<DeviceType> *);
+
+  struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
+                                         int, int, int, int, int,
+                                         int, int, int, int, int, int, int,
+                                         int, int, int *, int, int);
+
+  void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
+
+  void fft_3d_1d_only_kokkos(typename AT::t_FFT_DATA_1d, int, int, struct fft_plan_3d_kokkos<DeviceType> *);
+
+  void bifactor(int, int *, int *);
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Could not create 3d FFT plan
+
+The FFT setup for the PPPM solver failed, typically due
+to lack of memory.  This is an unusual error.  Check the
+size of the FFT grid you are requesting.
+
+E: Cannot use the FFTW library with Kokkos CUDA on GPUs
+
+Kokkos CUDA doesn't support using the FFTW library to calculate FFTs for
+PPPM on GPUs.
+
+E: Cannot use the cuFFT library with Kokkos CUDA on the host CPUs
+
+Kokkos CUDA doesn't support using the cuFFT library to calculate FFTs
+for PPPM on the host CPUs, use KISS FFT instead.
+
+*/
--- a/src/KOKKOS/fftdata_kokkos.h
+++ b/src/KOKKOS/fftdata_kokkos.h
@ -0,0 +1,46 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+// data types for 2d/3d FFTs
+
+#ifndef FFT_DATA_KOKKOS_H
+#define FFT_DATA_KOKKOS_H
+
+// User-settable FFT precision
+
+// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
+// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
+
+#ifdef FFT_SINGLE
+#define FFT_PRECISION 1
+#define MPI_FFT_SCALAR MPI_FLOAT
+typedef float FFT_SCALAR;
+#else
+#define FFT_PRECISION 2
+#define MPI_FFT_SCALAR MPI_DOUBLE
+typedef double FFT_SCALAR;
+#endif
+
+// -------------------------------------------------------------------------
+
+// Data types for single-precision complex
+
+#if FFT_PRECISION == 1
+#elif FFT_PRECISION == 2
+#else
+#error "FFT_PRECISION needs to be either 1 (=single) or 2 (=double)"
+#endif
+
+#endif
--- a/src/KOKKOS/kissfft_kokkos.cpp
+++ b/src/KOKKOS/kissfft_kokkos.cpp
@ -0,0 +1,20 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+
+namespace LAMMPS_NS {
+template class KissFFTKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class KissFFTKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/kissfft_kokkos.h
+++ b/src/KOKKOS/kissfft_kokkos.h
@ -0,0 +1,550 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/*
+   we use a stripped down KISS FFT as default FFT for LAMMPS
+   this code is adapted from kiss_fft_v1_2_9
+   homepage: http://kissfft.sf.net/
+
+   changes 2008-2011 by Axel Kohlmeyer <akohlmey@gmail.com>
+
+   KISS FFT ported to Kokkos by Stan Moore (SNL)
+*/
+
+/*
+  Copyright (c) 2003-2010, Mark Borgerding
+
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+
+    * Neither the author nor the names of any contributors may be used
+      to endorse or promote products derived from this software without
+      specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#ifndef LMP_KISSFFT_KOKKOS_H
+#define LMP_KISSFFT_KOKKOS_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kissfft_kokkos.h"
+#include "fftdata_kokkos.h"
+#include "kokkos_type.h"
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643383279502884197169399375105820974944
+#endif
+
+/*
+  Explanation of macros dealing with complex math:
+
+   C_MUL(m,a,b)         : m = a*b
+   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
+   C_SUB( res, a,b)     : res = a - b
+   C_SUBFROM( res , a)  : res -= a
+   C_ADDTO( res , a)    : res += a
+   C_EQ( res , a)       : res = a
+*/
+
+#define S_MUL(a,b) ( (a)*(b) )
+
+#define C_MUL(m,a,a_index,b,b_index) \
+    do{ (m)[0] = (a)(a_index,0)*(b)(b_index,0) - (a)(a_index,1)*(b)(b_index,1);\
+        (m)[1] = (a)(a_index,0)*(b)(b_index,1) + (a)(a_index,1)*(b)(b_index,0); }while(0)
+
+/*
+#define C_FIXDIV(c,div) // NOOP
+
+#define C_MULBYSCALAR( c, s ) \
+    do{ (c)[0] *= (s);\
+        (c)[1] *= (s); }while(0)
+
+#define  C_ADD( res, a,b)\
+    do { \
+            (res)[0]=(a)[0]+(b)[0];  (res)[1]=(a)[1]+(b)[1]; \
+    }while(0)
+
+#define  C_SUB( res, a,b)\
+    do { \
+            (res)[0]=(a)[0]-(b)[0];  (res)[1]=(a)[1]-(b)[1]; \
+    }while(0)
+
+#define C_ADDTO( res , a)\
+    do { \
+            (res)[0] += (a)[0];  (res)[1] += (a)[1];\
+    }while(0)
+
+#define C_SUBFROM( res , a)\
+    do {\
+            (res)[0] -= (a)[0];  (res)[1] -= (a)[1]; \
+    }while(0)
+
+#define C_EQ(res, a)\
+    do {\
+            (res)[0] = (a)[0];  (res)[1] = (a)[1]; \
+    }while(0)
+*/
+
+#define KISS_FFT_COS(phase) (FFT_SCALAR) cos(phase)
+#define KISS_FFT_SIN(phase) (FFT_SCALAR) sin(phase)
+#define HALF_OF(x) ((x)*.5)
+
+#define  kf_cexp(x,x_index,phase) \
+        do{ \
+                (x)(x_index,0) = KISS_FFT_COS(phase);\
+                (x)(x_index,1) = KISS_FFT_SIN(phase);\
+        }while(0)
+
+
+namespace LAMMPS_NS {
+
+#define MAXFACTORS 32
+/* e.g. an fft of length 128 has 4 factors
+ as far as kissfft is concerned: 4*4*4*2  */
+template<class DeviceType>
+struct kiss_fft_state_kokkos {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  int nfft;
+  int inverse;
+  typename AT::t_int_64 d_factors;
+  typename AT::t_FFT_DATA_1d d_twiddles;
+  typename AT::t_FFT_DATA_1d d_scratch;
+};
+
+template<class DeviceType>
+class KissFFTKokkos {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_bfly2(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
+                       const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
+  {
+      typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
+      FFT_SCALAR t[2];
+      int Fout2_count;
+      int tw1_count = 0;
+
+      Fout2_count = Fout_count + m;
+      do {
+          //C_FIXDIV(d_Fout[Fout_count],2); C_FIXDIV(d_Fout[Fout2_count],2);
+
+          C_MUL(t,d_Fout,Fout2_count,d_twiddles,tw1_count);
+          tw1_count += fstride;
+          //C_SUB(*Fout2,*Fout,t);
+          d_Fout(Fout2_count,0) = d_Fout(Fout_count,0) - t[0];
+          d_Fout(Fout2_count,1) = d_Fout(Fout_count,1) - t[1];
+          //C_ADDTO(d_Fout[Fout_count],t);
+          d_Fout(Fout_count,0) += t[0];
+          d_Fout(Fout_count,1) += t[1];
+          ++Fout2_count;
+          ++Fout_count;
+      } while(--m);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_bfly4(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
+                       const kiss_fft_state_kokkos<DeviceType> &st, const size_t m, int Fout_count)
+  {
+      typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
+      FFT_SCALAR scratch[6][2];
+      size_t k=m;
+      const size_t m2=2*m;
+      const size_t m3=3*m;
+
+      int tw3_count,tw2_count,tw1_count;
+      tw3_count = tw2_count = tw1_count = 0;
+
+      do {
+          //C_FIXDIV(d_Fout[Fout_count],4); C_FIXDIV(d_Fout[m],4); C_FIXDIV(d_Fout[m2],4); C_FIXDIV(d_Fout[m3],4);
+
+          C_MUL(scratch[0],d_Fout,Fout_count + m,d_twiddles,tw1_count);
+          C_MUL(scratch[1],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
+          C_MUL(scratch[2],d_Fout,Fout_count + m3,d_twiddles,tw3_count);
+
+          //C_SUB(scratch[5],d_Fout[Fout_count],scratch[1] );
+          scratch[5][0] = d_Fout(Fout_count,0) - scratch[1][0];
+          scratch[5][1] = d_Fout(Fout_count,1) - scratch[1][1];
+          //C_ADDTO(d_Fout[Fout_count], scratch[1]);
+          d_Fout(Fout_count,0) += scratch[1][0];
+          d_Fout(Fout_count,1) += scratch[1][1];
+          //C_ADD(scratch[3],scratch[0],scratch[2]);
+          scratch[3][0] = scratch[0][0] + scratch[2][0];
+          scratch[3][1] = scratch[0][1] + scratch[2][1];
+          //C_SUB( scratch[4] , scratch[0] , scratch[2] );
+          scratch[4][0] = scratch[0][0] - scratch[2][0];
+          scratch[4][1] = scratch[0][1] - scratch[2][1];
+          //C_SUB(d_Fout[m2],d_Fout[Fout_count],scratch[3]);
+          d_Fout(Fout_count + m2,0) = d_Fout(Fout_count,0) - scratch[3][0];
+          d_Fout(Fout_count + m2,1) = d_Fout(Fout_count,1) - scratch[3][1];
+
+          tw1_count += fstride;
+          tw2_count += fstride*2;
+          tw3_count += fstride*3;
+          //C_ADDTO(d_Fout[Fout_count],scratch[3]);
+          d_Fout(Fout_count,0) += scratch[3][0];
+          d_Fout(Fout_count,1) += scratch[3][1];
+
+          if (st.inverse) {
+              d_Fout(Fout_count + m,0) = scratch[5][0] - scratch[4][1];
+              d_Fout(Fout_count + m,1) = scratch[5][1] + scratch[4][0];
+              d_Fout(Fout_count + m3,0) = scratch[5][0] + scratch[4][1];
+              d_Fout(Fout_count + m3,1) = scratch[5][1] - scratch[4][0];
+          } else{
+              d_Fout(Fout_count + m,0) = scratch[5][0] + scratch[4][1];
+              d_Fout(Fout_count + m,1) = scratch[5][1] - scratch[4][0];
+              d_Fout(Fout_count + m3,0) = scratch[5][0] - scratch[4][1];
+              d_Fout(Fout_count + m3,1) = scratch[5][1] + scratch[4][0];
+          }
+          ++Fout_count;
+      } while(--k);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_bfly3(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
+                       const kiss_fft_state_kokkos<DeviceType> &st, size_t m, int Fout_count)
+  {
+      size_t k=m;
+      const size_t m2 = 2*m;
+      typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
+      FFT_SCALAR scratch[5][2];
+      FFT_SCALAR epi3[2];
+      //C_EQ(epi3,d_twiddles[fstride*m]);
+      epi3[0] = d_twiddles(fstride*m,0);
+      epi3[1] = d_twiddles(fstride*m,1);
+
+      int tw1_count,tw2_count;
+      tw1_count = tw2_count = 0;
+
+      do {
+          //C_FIXDIV(d_Fout[Fout_count],3); C_FIXDIV(d_Fout[m],3); C_FIXDIV(d_Fout[m2],3);
+
+          C_MUL(scratch[1],d_Fout,Fout_count + m,d_twiddles,tw1_count);
+          C_MUL(scratch[2],d_Fout,Fout_count + m2,d_twiddles,tw2_count);
+
+          //C_ADD(scratch[3],scratch[1],scratch[2]);
+          scratch[3][0] = scratch[1][0] + scratch[2][0];
+          scratch[3][1] = scratch[1][1] + scratch[2][1];
+          //C_SUB(scratch[0],scratch[1],scratch[2]);
+          scratch[0][0] = scratch[1][0] - scratch[2][0];
+          scratch[0][1] = scratch[1][1] - scratch[2][1];
+          tw1_count += fstride;
+          tw2_count += fstride*2;
+
+          d_Fout(Fout_count + m,0) = d_Fout(Fout_count,0) - HALF_OF(scratch[3][0]);
+          d_Fout(Fout_count + m,1) = d_Fout(Fout_count,1) - HALF_OF(scratch[3][1]);
+
+          //C_MULBYSCALAR(scratch[0],epi3[1]);
+          scratch[0][0] *= epi3[1];
+          scratch[0][1] *= epi3[1];
+
+          //C_ADDTO(d_Fout[Fout_count],scratch[3]);
+          d_Fout(Fout_count,0) += scratch[3][0];
+          d_Fout(Fout_count,1) += scratch[3][1];
+
+          d_Fout(Fout_count + m2,0) = d_Fout(Fout_count + m,0) + scratch[0][1];
+          d_Fout(Fout_count + m2,1) = d_Fout(Fout_count + m,1) - scratch[0][0];
+
+          d_Fout(Fout_count + m,0) -= scratch[0][1];
+          d_Fout(Fout_count + m,1) += scratch[0][0];
+
+          ++Fout_count;
+      } while(--k);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_bfly5(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
+                       const kiss_fft_state_kokkos<DeviceType> &st, int m, int Fout_count)
+  {
+      int u;
+      FFT_SCALAR scratch[13][2];
+      typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
+      FFT_SCALAR ya[2],yb[2];
+      //C_EQ(ya,d_twiddles[fstride*m]);
+      ya[1] = d_twiddles(fstride*m,1);
+      ya[0] = d_twiddles(fstride*m,0);
+      //C_EQ(yb,d_twiddles[fstride*2*m]);
+      yb[1] = d_twiddles(fstride*2*m,1);
+      yb[0] = d_twiddles(fstride*2*m,0);
+
+      int Fout0_count=Fout_count;
+      int Fout1_count=Fout0_count+m;
+      int Fout2_count=Fout0_count+2*m;
+      int Fout3_count=Fout0_count+3*m;
+      int Fout4_count=Fout0_count+4*m;
+
+      for ( u=0; u<m; ++u ) {
+          //C_FIXDIV( d_Fout[Fout0_count],5); C_FIXDIV( d_Fout[Fout1_count],5); C_FIXDIV( d_Fout[Fout2_count],5);
+          //C_FIXDIV( d_Fout[Fout3_count],5); C_FIXDIV( d_Fout[Fout4_count],5);
+          //C_EQ(scratch[0],d_Fout[Fout0_count]);
+          scratch[0][0] = d_Fout(Fout0_count,0);
+          scratch[0][1] = d_Fout(Fout0_count,1);
+
+          C_MUL(scratch[1],d_Fout,Fout1_count,d_twiddles,u*fstride  );
+          C_MUL(scratch[2],d_Fout,Fout2_count,d_twiddles,2*u*fstride);
+          C_MUL(scratch[3],d_Fout,Fout3_count,d_twiddles,3*u*fstride);
+          C_MUL(scratch[4],d_Fout,Fout4_count,d_twiddles,4*u*fstride);
+
+          //C_ADD(scratch[7],scratch[1],scratch[4]);
+          scratch[7][0] = scratch[1][0] + scratch[4][0];
+          scratch[7][1] = scratch[1][1] + scratch[4][1];
+          //C_SUB(scratch[10],scratch[1],scratch[4]);
+          scratch[10][0] = scratch[1][0] - scratch[4][0];
+          scratch[10][1] = scratch[1][1] - scratch[4][1];
+          //C_ADD(scratch[8],scratch[2],scratch[3]);
+          scratch[8][0] = scratch[2][0] + scratch[3][0];
+          scratch[8][1] = scratch[2][1] + scratch[3][1];
+          //C_SUB(scratch[9],scratch[2],scratch[3]);
+          scratch[9][0] = scratch[2][0] - scratch[3][0];
+          scratch[9][1] = scratch[2][1] - scratch[3][1];
+
+          d_Fout(Fout0_count,0) += scratch[7][0] + scratch[8][0];
+          d_Fout(Fout0_count,1) += scratch[7][1] + scratch[8][1];
+
+          scratch[5][0] = scratch[0][0] + S_MUL(scratch[7][0],ya[0]) + S_MUL(scratch[8][0],yb[0]);
+          scratch[5][1] = scratch[0][1] + S_MUL(scratch[7][1],ya[0]) + S_MUL(scratch[8][1],yb[0]);
+
+          scratch[6][0] =  S_MUL(scratch[10][1],ya[1]) + S_MUL(scratch[9][1],yb[1]);
+          scratch[6][1] = -S_MUL(scratch[10][0],ya[1]) - S_MUL(scratch[9][0],yb[1]);
+
+          //C_SUB(d_Fout[Fout1_count],scratch[5],scratch[6]);
+          d_Fout(Fout1_count,0) = scratch[5][0] - scratch[6][0];
+          d_Fout(Fout1_count,1) = scratch[5][1] - scratch[6][1];
+          //C_ADD(d_Fout[Fout4_count],scratch[5],scratch[6]);
+          d_Fout(Fout4_count,0) = scratch[5][0] + scratch[6][0];
+          d_Fout(Fout4_count,1) = scratch[5][1] + scratch[6][1];
+
+          scratch[11][0] = scratch[0][0] + S_MUL(scratch[7][0],yb[0]) + S_MUL(scratch[8][0],ya[0]);
+          scratch[11][1] = scratch[0][1] + S_MUL(scratch[7][1],yb[0]) + S_MUL(scratch[8][1],ya[0]);
+          scratch[12][0] = - S_MUL(scratch[10][1],yb[1]) + S_MUL(scratch[9][1],ya[1]);
+          scratch[12][1] = S_MUL(scratch[10][0],yb[1]) - S_MUL(scratch[9][0],ya[1]);
+
+          //C_ADD(d_Fout[Fout2_count],scratch[11],scratch[12]);
+          d_Fout(Fout2_count,0) = scratch[11][0] + scratch[12][0];
+          d_Fout(Fout2_count,1) = scratch[11][1] + scratch[12][1];
+          //C_SUB(d_Fout3[Fout3_count],scratch[11],scratch[12]);
+          d_Fout(Fout3_count,0) = scratch[11][0] - scratch[12][0];
+          d_Fout(Fout3_count,1) = scratch[11][1] - scratch[12][1];
+
+          ++Fout0_count;++Fout1_count;++Fout2_count;++Fout3_count;++Fout4_count;
+      }
+  }
+
+  /* perform the butterfly for one stage of a mixed radix FFT */
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_bfly_generic(typename AT::t_FFT_DATA_1d_um &d_Fout, const size_t fstride,
+                              const kiss_fft_state_kokkos<DeviceType> &st, int m, int p, int Fout_count)
+  {
+      int u,k,q1,q;
+      typename AT::t_FFT_DATA_1d_um d_twiddles = st.d_twiddles;
+      FFT_SCALAR t[2];
+      int Norig = st.nfft;
+
+      typename AT::t_FFT_DATA_1d_um d_scratch = st.d_scratch;
+      for ( u=0; u<m; ++u ) {
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+              //C_EQ(d_scratch[q1],d_Fout[k]);
+              d_scratch(q1,0) = d_Fout(Fout_count + k,0);
+              d_scratch(q1,1) = d_Fout(Fout_count + k,1);
+              //C_FIXDIV(d_scratch[q1],p);
+              k += m;
+          }
+
+          k=u;
+          for ( q1=0 ; q1<p ; ++q1 ) {
+              int twidx=0;
+              //C_EQ(d_Fout[k],d_scratch[0]);
+              d_Fout(Fout_count + k,0) = d_scratch(0,0);
+              d_Fout(Fout_count + k,1) = d_scratch(0,1);
+              for (q=1;q<p;++q ) {
+                  twidx += fstride * k;
+                  if (twidx>=Norig) twidx-=Norig;
+                  C_MUL(t,d_scratch,q,d_twiddles,twidx);
+                  //C_ADDTO(d_Fout[k],t);
+                  d_Fout(Fout_count + k,0) += t[0];
+                  d_Fout(Fout_count + k,1) += t[1];
+              }
+              k += m;
+          }
+      }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kf_work(typename AT::t_FFT_DATA_1d_um &d_Fout, const typename AT::t_FFT_DATA_1d_um &d_f,
+                      const size_t fstride, int in_stride,
+                      const typename AT::t_int_64_um &d_factors, const kiss_fft_state_kokkos<DeviceType> &st, int Fout_count, int f_count, int factors_count)
+  {
+      const int beg = Fout_count;
+      const int p = d_factors[factors_count++]; /* the radix  */
+      const int m = d_factors[factors_count++]; /* stage's fft length/p */
+      const int end = Fout_count + p*m;
+
+      if (m == 1) {
+          do {
+              //C_EQ(d_Fout[Fout_count],d_f[f_count]);
+              d_Fout(Fout_count,0) = d_f(f_count,0);
+              d_Fout(Fout_count,1) = d_f(f_count,1);
+              f_count += fstride*in_stride;
+          } while (++Fout_count != end);
+      } else {
+          do {
+              /* recursive call:
+                 DFT of size m*p performed by doing
+                 p instances of smaller DFTs of size m,
+                 each one takes a decimated version of the input */
+              kf_work(d_Fout, d_f, fstride*p, in_stride, d_factors, st, Fout_count, f_count, factors_count);
+              f_count += fstride*in_stride;
+          } while( (Fout_count += m) != end);
+      }
+
+      Fout_count=beg;
+
+      /* recombine the p smaller DFTs */
+      switch (p) {
+        case 2: kf_bfly2(d_Fout,fstride,st,m,Fout_count); break;
+        case 3: kf_bfly3(d_Fout,fstride,st,m,Fout_count); break;
+        case 4: kf_bfly4(d_Fout,fstride,st,m,Fout_count); break;
+        case 5: kf_bfly5(d_Fout,fstride,st,m,Fout_count); break;
+        default: kf_bfly_generic(d_Fout,fstride,st,m,p,Fout_count); break;
+      }
+  }
+
+  /*  facbuf is populated by p1,m1,p2,m2, ...
+      where
+      p[i] * m[i] = m[i-1]
+      m0 = n                  */
+
+  static int kf_factor(int n, HAT::t_int_64 h_facbuf)
+  {
+      int p=4, nf=0;
+      double floor_sqrt;
+      floor_sqrt = floor( sqrt((double)n) );
+      int facbuf_count = 0;
+      int p_max = 0;
+
+      /* factor out the remaining powers of 4, powers of 2,
+         and then any other remaining primes */
+      do {
+          if (nf == MAXFACTORS) p = n; /* make certain that we don't run out of space */
+          while (n % p) {
+              switch (p) {
+                case 4: p = 2; break;
+                case 2: p = 3; break;
+                default: p += 2; break;
+              }
+              if (p > floor_sqrt)
+                  p = n;          /* no more factors, skip to end */
+          }
+          n /= p;
+          h_facbuf[facbuf_count++] = p;
+          h_facbuf[facbuf_count++] = n;
+          p_max = MAX(p,p_max);
+          ++nf;
+      } while (n > 1);
+      return p_max;
+  }
+
+  /*
+   * User-callable function to allocate all necessary storage space for the fft.
+   *
+   * The return value is a contiguous block of memory, allocated with malloc.  As such,
+   * It can be freed with free(), rather than a kiss_fft-specific function.
+   */
+
+  static kiss_fft_state_kokkos<DeviceType> kiss_fft_alloc_kokkos(int nfft, int inverse_fft, void *mem, size_t *lenmem)
+  {
+      kiss_fft_state_kokkos<DeviceType> st;
+      int i;
+      st.nfft = nfft;
+      st.inverse = inverse_fft;
+
+      typename AT::tdual_int_64 k_factors = typename AT::tdual_int_64();
+      typename AT::tdual_FFT_DATA_1d k_twiddles = typename AT::tdual_FFT_DATA_1d();
+
+      if (nfft > 0) {
+          k_factors = typename AT::tdual_int_64("kissfft:factors",MAXFACTORS*2);
+          k_twiddles = typename AT::tdual_FFT_DATA_1d("kissfft:twiddles",nfft);
+
+          for (i=0;i<nfft;++i) {
+              const double phase = (st.inverse ? 2.0*M_PI:-2.0*M_PI)*i / nfft;
+              kf_cexp(k_twiddles.h_view,i,phase );
+          }
+
+          int p_max = kf_factor(nfft,k_factors.h_view);
+          st.d_scratch = typename AT::t_FFT_DATA_1d("kissfft:scratch",p_max);
+      }
+
+      k_factors.template modify<LMPHostType>();
+      k_factors.template sync<LMPDeviceType>();
+      st.d_factors = k_factors.template view<DeviceType>();
+
+      k_twiddles.template modify<LMPHostType>();
+      k_twiddles.template sync<LMPDeviceType>();
+      st.d_twiddles = k_twiddles.template view<DeviceType>();
+
+      return st;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kiss_fft_stride(const kiss_fft_state_kokkos<DeviceType> &st, const typename AT::t_FFT_DATA_1d_um &d_fin, typename AT::t_FFT_DATA_1d_um &d_fout, int in_stride, int offset)
+  {
+      //if (d_fin.data() == d_fout.data()) {
+      //    // NOTE: this is not really an in-place FFT algorithm.
+      //    // It just performs an out-of-place FFT into a temp buffer
+      //    typename AT::t_FFT_DATA_1d_um d_tmpbuf = typename AT::t_FFT_DATA_1d("kissfft:tmpbuf",d_fin.extent(1));
+      //    kf_work(d_tmpbuf,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
+      //    Kokkos::deep_copy(d_fout,d_tmpbuf);
+      //} else {
+        kf_work(d_fout,d_fin,1,in_stride,st.d_factors,st,offset,offset,0);
+      //}
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void kiss_fft_kokkos(const kiss_fft_state_kokkos<DeviceType> &cfg, const typename AT::t_FFT_DATA_1d_um d_fin, typename AT::t_FFT_DATA_1d_um d_fout, int offset)
+  {
+      kiss_fft_stride(cfg,d_fin,d_fout,1,offset);
+  }
+
+};
+
+}
+#endif
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@ -174,20 +174,6 @@ t_scalar3<Scalar> operator *
  return t_scalar3<Scalar>(a.x*b,a.y*b,a.z*b);
 }

-#if !defined(__CUDACC__) && !defined(__VECTOR_TYPES_H__)
-  struct double2 {
-    double x, y;
-  };
-  struct float2 {
-    float x, y;
-  };
-  struct float4 {
-    float x, y, z, w;
-  };
-  struct double4 {
-    double x, y, z, w;
-  };
-#endif
 // set LMPHostype and LMPDeviceType from Kokkos Default Types
 typedef Kokkos::DefaultExecutionSpace LMPDeviceType;
 typedef Kokkos::HostSpace::execution_space LMPHostType;
@ -310,14 +296,8 @@ public:
 #endif
 #if PRECISION==1
 typedef float LMP_FLOAT;
-typedef float2 LMP_FLOAT2;
-typedef lmp_float3 LMP_FLOAT3;
-typedef float4 LMP_FLOAT4;
 #else
 typedef double LMP_FLOAT;
-typedef double2 LMP_FLOAT2;
-typedef lmp_double3 LMP_FLOAT3;
-typedef double4 LMP_FLOAT4;
 #endif

 #ifndef PREC_FORCE
@ -326,14 +306,8 @@ typedef double4 LMP_FLOAT4;

 #if PREC_FORCE==1
 typedef float F_FLOAT;
-typedef float2 F_FLOAT2;
-typedef lmp_float3 F_FLOAT3;
-typedef float4 F_FLOAT4;
 #else
 typedef double F_FLOAT;
-typedef double2 F_FLOAT2;
-typedef lmp_double3 F_FLOAT3;
-typedef double4 F_FLOAT4;
 #endif

 #ifndef PREC_ENERGY
@ -342,12 +316,8 @@ typedef double4 F_FLOAT4;

 #if PREC_ENERGY==1
 typedef float E_FLOAT;
-typedef float2 E_FLOAT2;
-typedef float4 E_FLOAT4;
 #else
 typedef double E_FLOAT;
-typedef double2 E_FLOAT2;
-typedef double4 E_FLOAT4;
 #endif

 struct s_EV_FLOAT {
@ -500,12 +470,8 @@ typedef struct s_FEV_FLOAT FEV_FLOAT;

 #if PREC_POS==1
 typedef float X_FLOAT;
-typedef float2 X_FLOAT2;
-typedef float4 X_FLOAT4;
 #else
 typedef double X_FLOAT;
-typedef double2 X_FLOAT2;
-typedef double4 X_FLOAT4;
 #endif

 #ifndef PREC_VELOCITIES
@ -514,22 +480,14 @@ typedef double4 X_FLOAT4;

 #if PREC_VELOCITIES==1
 typedef float V_FLOAT;
-typedef float2 V_FLOAT2;
-typedef float4 V_FLOAT4;
 #else
 typedef double V_FLOAT;
-typedef double2 V_FLOAT2;
-typedef double4 V_FLOAT4;
 #endif

 #if PREC_KSPACE==1
 typedef float K_FLOAT;
-typedef float2 K_FLOAT2;
-typedef float4 K_FLOAT4;
 #else
 typedef double K_FLOAT;
-typedef double2 K_FLOAT2;
-typedef double4 K_FLOAT4;
 #endif

 typedef int T_INT;
--- a/src/KOKKOS/pack_kokkos.h
+++ b/src/KOKKOS/pack_kokkos.h
@ -0,0 +1,474 @@
+/* ----------------------------------------------------------------------
+   SPARTA - Stochastic PArallel Rarefied-gas Time-accurate Analyzer
+   http://sparta.sandia.gov
+   Steve Plimpton, sjplimp@sandia.gov, Michael Gallis, magalli@sandia.gov
+   Sandia National Laboratories
+
+   Copyright (2014) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level SPARTA directory.
+------------------------------------------------------------------------- */
+
+// loop counters for doing a pack/unpack
+
+#include "pack.h"
+
+/* ----------------------------------------------------------------------
+   Pack and unpack functions:
+
+   pack routines copy strided values from data into contiguous locs in buf
+   unpack routines copy contiguous values from buf into strided locs in data
+   different versions of unpack depending on permutation
+     and # of values/element
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   pack from data -> buf
+------------------------------------------------------------------------- */
+
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PackKokkos {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+struct pack_3d_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  pack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int in_orig = slow*nmid*nfast;
+    const int plane = slow*nstride_plane;
+    const int out_orig = plane + mid*nstride_line;
+    const int in = in_orig + mid*nfast + fast;
+    const int out = out_orig + fast;
+    d_buf[buf_offset + in] = d_data[data_offset + out];
+  }
+};
+
+static void pack_3d(typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  pack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data
+------------------------------------------------------------------------- */
+
+struct unpack_3d_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  unpack_3d_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast;
+    const int plane = slow*nstride_plane;
+    const int in_orig = plane + mid*nstride_line;
+    const int in = in_orig + fast;
+    const int out = out_orig + mid*nfast + fast;
+    d_data[data_offset + in] = d_buf[buf_offset + out];
+  }
+};
+
+static void unpack_3d(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 1 value/element
+------------------------------------------------------------------------- */
+
+
+struct unpack_3d_permute1_1_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  unpack_3d_permute1_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast;
+    const int plane = slow*nstride_line;
+    const int in_orig = plane + mid;
+    const int in = in_orig + fast*nstride_plane;
+    const int out = out_orig + mid*nfast + fast;
+    d_data[data_offset + in] = d_buf[buf_offset + out];
+  }
+};
+
+static void unpack_3d_permute1_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute1_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 2 values/element
+------------------------------------------------------------------------- */
+
+struct unpack_3d_permute1_2_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  unpack_3d_permute1_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast*2;
+    const int plane = slow*nstride_line;
+    const int in_orig = plane + 2*mid;
+    const int in = in_orig + fast*nstride_plane;
+    const int out = out_orig + 2*mid*nfast + 2*fast;
+    d_data[data_offset + in] = d_buf[buf_offset + out];
+    d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
+  }
+};
+
+static void unpack_3d_permute1_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute1_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, nqty values/element
+------------------------------------------------------------------------- */
+
+struct unpack_3d_permute1_n_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+  int nqty;                  // # of values/element
+
+  unpack_3d_permute1_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+      nqty          = plan->nqty         ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast*nqty;
+    const int plane = slow*nstride_line;
+    const int instart = plane + nqty*mid;
+    int in = instart + nqty*fast*nstride_plane;
+    int out = out_orig + nqty*mid*nfast + nqty*fast;
+    for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
+  }
+};
+
+static void unpack_3d_permute1_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute1_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 1 value/element
+------------------------------------------------------------------------- */
+
+struct unpack_3d_permute2_1_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  unpack_3d_permute2_1_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast;
+    const int in_orig = slow + mid*nstride_plane;
+    const int in = in_orig + fast*nstride_line;
+    const int out = out_orig + mid*nfast + fast;
+    d_data[data_offset + in] = d_buf[buf_offset + out];
+  }
+};
+
+static void unpack_3d_permute2_1(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute2_1_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 2 values/element
+------------------------------------------------------------------------- */
+
+struct unpack_3d_permute2_2_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+
+  unpack_3d_permute2_2_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast*2;
+    const int in_orig = 2*slow + mid*nstride_plane;
+    const int in = in_orig + fast*nstride_line;
+    const int out = out_orig + 2*mid*nfast + 2*fast;
+    d_data[data_offset + in] = d_buf[buf_offset + out];
+    d_data[data_offset + in+1] = d_buf[buf_offset + out+1];
+  }
+};
+
+static void unpack_3d_permute2_2(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute2_2_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, nqty values/element
+------------------------------------------------------------------------- */
+
+struct unpack_3d_permute2_n_functor {
+public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d_um d_buf,d_data;
+  int buf_offset,data_offset;
+  int nfast;                 // # of elements in fast index
+  int nmid;                  // # of elements in mid index
+  int nslow;                 // # of elements in slow index
+  int nstride_line;          // stride between successive mid indices
+  int nstride_plane;         // stride between successive slow indices
+  int nqty;                  // # of values/element
+
+  unpack_3d_permute2_n_functor(typename AT::t_FFT_SCALAR_1d_um d_buf_, int buf_offset_, typename AT::t_FFT_SCALAR_1d_um d_data_, int data_offset_, struct pack_plan_3d *plan):
+    d_buf(d_buf_),
+    d_data(d_data_)
+    {
+      buf_offset    = buf_offset_        ;
+      data_offset   = data_offset_       ;
+      nfast         = plan->nfast        ;
+      nmid          = plan->nmid         ;
+      nslow         = plan->nslow        ;
+      nstride_line  = plan->nstride_line ;
+      nstride_plane = plan->nstride_plane;
+      nqty          = plan->nqty         ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &index) const {
+    const int fast = index / (nslow * nmid);
+    const int index_new = index - (fast * nslow * nmid);
+    const int mid = index_new / nslow;
+    const int slow = index_new % nslow;
+    const int out_orig = slow*nmid*nfast*nqty;
+    const int instart = nqty*slow + mid*nstride_plane;
+    int in = instart + nqty*fast*nstride_line;
+    int out = out_orig + nqty*mid*nfast + nqty*fast;
+    for (int iqty = 0; iqty < nqty; iqty++) d_data[data_offset + in++] = d_buf[buf_offset + out++];
+  }
+};
+
+static void unpack_3d_permute2_n(typename AT::t_FFT_SCALAR_1d_um d_buf, int buf_offset, typename AT::t_FFT_SCALAR_1d_um d_data, int data_offset, struct pack_plan_3d *plan)
+{
+  const int nslow = plan->nslow;
+  const int nmid = plan->nmid;
+  const int nfast = plan->nfast;
+  unpack_3d_permute2_n_functor f(d_buf,buf_offset,d_data,data_offset,plan);
+  Kokkos::parallel_for(nslow*nmid*nfast,f);
+  DeviceType::fence();
+}
+
+};
+
+}
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@ -25,8 +25,8 @@
 #include "force.h"
 #include "pair.h"
 #include "domain.h"
-#include "fft3d_wrap.h"
-#include "remap_wrap.h"
+#include "fft3d_kokkos.h"
+#include "remap_kokkos.h"
 #include "memory_kokkos.h"
 #include "error.h"
 #include "atom_masks.h"
@ -842,19 +842,19 @@ void PPPMKokkos<DeviceType>::allocate()

  int tmp;

-  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,collective_flag);
+  fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                         0,0,&tmp,collective_flag);

-  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,collective_flag);
-  remap = new Remap(lmp,world,
-                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag);
+  fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                         nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         0,0,&tmp,collective_flag);
+  remap = new RemapKokkos<DeviceType>(lmp,world,
+                          nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                          nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                          1,0,0,FFT_PRECISION,collective_flag);

  // create ghost grid object for rho and electric field communication

@ -1783,12 +1783,7 @@ void PPPMKokkos<DeviceType>::brick2fft()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_brick2fft>(0,inum_inout),*this);
  copymode = 0;

-  k_density_fft.template modify<DeviceType>();
-  k_density_fft.template sync<LMPHostType>();
-  remap->perform(density_fft,density_fft,work1);
-  k_density_fft.template modify<LMPHostType>();
-  k_density_fft.template sync<DeviceType>();
-
+  remap->perform(d_density_fft,d_density_fft,d_work1);
 }

 template<class DeviceType>
@ -1830,11 +1825,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik1>(0,nfft),*this);
  copymode = 0;

-  k_work1.template modify<DeviceType>();
-  k_work1.template sync<LMPHostType>();
-  fft1->compute(work1,work1,1);
-  k_work1.template modify<LMPHostType>();
-  k_work1.template sync<DeviceType>();
+  fft1->compute(d_work1,d_work1,1);


  // global energy and virial contribution
@ -1898,11 +1889,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik5>(0,inum_fft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -1916,11 +1903,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik7>(0,inum_fft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -1933,11 +1916,7 @@ void PPPMKokkos<DeviceType>::poisson_ik()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik9>(0,inum_fft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2193,11 +2172,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom1>(0,nfft),*this);
    copymode = 0;

-    k_work2.template modify<DeviceType>();
-    k_work2.template sync<LMPHostType>();
-    fft2->compute(work2,work2,-1);
-    k_work2.template modify<LMPHostType>();
-    k_work2.template sync<DeviceType>();
+    fft2->compute(d_work2,d_work2,-1);


    copymode = 1;
@ -2214,11 +2189,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom3>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2230,11 +2201,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom5>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2246,11 +2213,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom7>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2261,11 +2224,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom9>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2277,11 +2236,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom11>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -2293,11 +2248,7 @@ void PPPMKokkos<DeviceType>::poisson_peratom()
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom13>(0,nfft),*this);
  copymode = 0;

-  k_work2.template modify<DeviceType>();
-  k_work2.template sync<LMPHostType>();
-  fft2->compute(work2,work2,-1);
-  k_work2.template modify<LMPHostType>();
-  k_work2.template sync<DeviceType>();
+  fft2->compute(d_work2,d_work2,-1);


  copymode = 1;
@ -3037,10 +2988,10 @@ int PPPMKokkos<DeviceType>::timing_1d(int n, double &time1d)
  time1 = MPI_Wtime();

  for (int i = 0; i < n; i++) {
-    fft1->timing1d(work1,nfft_both,1);
-    fft2->timing1d(work1,nfft_both,-1);
-    fft2->timing1d(work1,nfft_both,-1);
-    fft2->timing1d(work1,nfft_both,-1);
+    fft1->timing1d(d_work1,nfft_both,1);
+    fft2->timing1d(d_work1,nfft_both,-1);
+    fft2->timing1d(d_work1,nfft_both,-1);
+    fft2->timing1d(d_work1,nfft_both,-1);
  }

  MPI_Barrier(world);
@ -3074,10 +3025,10 @@ int PPPMKokkos<DeviceType>::timing_3d(int n, double &time3d)
  time1 = MPI_Wtime();

  for (int i = 0; i < n; i++) {
-    fft1->compute(work1,work1,1);
-    fft2->compute(work1,work1,-1);
-    fft2->compute(work1,work1,-1);
-    fft2->compute(work1,work1,-1);
+    fft1->compute(d_work1,d_work1,1);
+    fft2->compute(d_work1,d_work1,-1);
+    fft2->compute(d_work1,d_work1,-1);
+    fft2->compute(d_work1,d_work1,-1);
  }

  MPI_Barrier(world);
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@ -22,11 +22,31 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
 #ifndef LMP_PPPM_KOKKOS_H
 #define LMP_PPPM_KOKKOS_H

-#include "pppm.h"
 #include "gridcomm_kokkos.h"
+#include "remap_kokkos.h"
+#include "fft3d_kokkos.h"
 #include "kokkos_base.h"
 #include "kokkos_type.h"

+// fix up FFT defines for KOKKOS with CUDA
+
+#if defined(KOKKOS_ENABLE_CUDA)
+# if defined(FFT_FFTW)
+#  undef FFT_FFTW
+# endif
+# if defined(FFT_FFTW3)
+#  undef FFT_FFTW3
+# endif
+# if defined(FFT_MKL)
+#  undef FFT_MKL
+# endif
+# if !defined(FFT_CUFFT) && !defined(FFT_KISSFFT)
+#  define FFT_KISSFFT
+# endif
+#endif
+
+#include "pppm.h"
+
 namespace LAMMPS_NS {

 struct TagPPPM_setup1{};
@ -331,8 +351,8 @@ class PPPMKokkos : public PPPM, public KokkosBase {
  //double **acons;
  typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;

-  class FFT3d *fft1,*fft2;
-  class Remap *remap;
+  FFT3dKokkos<DeviceType> *fft1,*fft2;
+  RemapKokkos<DeviceType> *remap;
  GridCommKokkos<DeviceType> *cg;
  GridCommKokkos<DeviceType> *cg_peratom;

--- a/src/KOKKOS/remap_kokkos.cpp
+++ b/src/KOKKOS/remap_kokkos.cpp
@ -0,0 +1,511 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "remap_kokkos.h"
+#include "pack_kokkos.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(A,B) ((A) < (B) ? (A) : (B))
+#define MAX(A,B) ((A) > (B) ? (A) : (B))
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp) : Pointers(lmp)
+{
+  plan = NULL;
+}
+
+template<class DeviceType>
+RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
+             int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+             int in_klo, int in_khi,
+             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+             int out_klo, int out_khi,
+             int nqty, int permute, int memory,
+             int precision, int usecollective) : Pointers(lmp)
+{
+  plan = remap_3d_create_plan_kokkos(comm,
+                              in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                              out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
+                              nqty,permute,memory,precision,usecollective);
+  if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+RemapKokkos<DeviceType>::~RemapKokkos()
+{
+  remap_3d_destroy_plan_kokkos(plan);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void RemapKokkos<DeviceType>::perform(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf)
+{
+  remap_3d_kokkos(d_in,d_out,d_buf,plan);
+}
+
+
+/* ----------------------------------------------------------------------
+   Data layout for 3d remaps:
+
+   data set of Nfast x Nmid x Nslow elements is owned by P procs
+   each element = nqty contiguous datums
+   on input, each proc owns a subsection of the elements
+   on output, each proc will own a (presumably different) subsection
+   my subsection must not overlap with any other proc's subsection,
+     i.e. the union of all proc's input (or output) subsections must
+     exactly tile the global Nfast x Nmid x Nslow data set
+   when called from C, all subsection indices are
+     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
+   when called from F77, all subsection indices are
+     F77-style from 1 to N where N = Nfast or Nmid or Nslow
+   a proc can own 0 elements on input or output
+     by specifying hi index < lo index
+   on both input and output, data is stored contiguously on a processor
+     with a fast-varying, mid-varying, and slow-varying index
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Perform 3d remap
+
+   Arguments:
+   in           starting address of input data on this proc
+   out          starting address of where output data for this proc
+                  will be placed (can be same as in)
+   buf          extra memory required for remap
+                if memory=0 was used in call to remap_3d_create_plan
+                  then buf must be big enough to hold output result
+                  i.e. nqty * (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+                              (out_khi-out_klo+1)
+                if memory=1 was used in call to remap_3d_create_plan
+                  then buf is not used, can just be a dummy pointer
+   plan         plan returned by previous call to remap_3d_create_plan
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void RemapKokkos<DeviceType>::remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d d_in, typename AT::t_FFT_SCALAR_1d d_out, typename AT::t_FFT_SCALAR_1d d_buf,
+              struct remap_plan_3d_kokkos<DeviceType> *plan)
+{
+  // collective flag not yet supported
+
+  // use point-to-point communication
+
+  int i,isend,irecv;
+  typename AT::t_FFT_SCALAR_1d d_scratch;
+
+  if (plan->memory == 0)
+    d_scratch = d_buf;
+  else
+    d_scratch = plan->d_scratch;
+
+  // post all recvs into scratch space
+
+  for (irecv = 0; irecv < plan->nrecv; irecv++) {
+    FFT_SCALAR* scratch = d_scratch.ptr_on_device() + plan->recv_bufloc[irecv];
+    MPI_Irecv(scratch,plan->recv_size[irecv],
+              MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
+              plan->comm,&plan->request[irecv]);
+  }
+
+  // send all messages to other procs
+
+  for (isend = 0; isend < plan->nsend; isend++) {
+    int in_offset = plan->send_offset[isend];
+    plan->pack(d_in,in_offset,
+               plan->d_sendbuf,0,&plan->packplan[isend]);
+    MPI_Send(plan->d_sendbuf.ptr_on_device(),plan->send_size[isend],MPI_FFT_SCALAR,
+             plan->send_proc[isend],0,plan->comm);
+  }
+
+  // copy in -> scratch -> out for self data
+
+  if (plan->self) {
+    isend = plan->nsend;
+    irecv = plan->nrecv;
+
+    int in_offset = plan->send_offset[isend];
+    int scratch_offset = plan->recv_bufloc[irecv];
+    int out_offset = plan->recv_offset[irecv];
+
+    plan->pack(d_in,in_offset,
+               d_scratch,scratch_offset,
+               &plan->packplan[isend]);
+    plan->unpack(d_scratch,scratch_offset,
+                 d_out,out_offset,&plan->unpackplan[irecv]);
+  }
+
+  // unpack all messages from scratch -> out
+
+  for (i = 0; i < plan->nrecv; i++) {
+    MPI_Waitany(plan->nrecv,plan->request,&irecv,MPI_STATUS_IGNORE);
+
+    int scratch_offset = plan->recv_bufloc[irecv];
+    int out_offset = plan->recv_offset[irecv];
+
+    plan->unpack(d_scratch,scratch_offset,
+                 d_out,out_offset,&plan->unpackplan[irecv]);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Create plan for performing a 3d remap
+
+   Arguments:
+   comm                 MPI communicator for the P procs which own the data
+   in_ilo,in_ihi        input bounds of data I own in fast index
+   in_jlo,in_jhi        input bounds of data I own in mid index
+   in_klo,in_khi        input bounds of data I own in slow index
+   out_ilo,out_ihi      output bounds of data I own in fast index
+   out_jlo,out_jhi      output bounds of data I own in mid index
+   out_klo,out_khi      output bounds of data I own in slow index
+   nqty                 # of datums per element
+   permute              permutation in storage order of indices on output
+                          0 = no permutation
+                          1 = permute once = mid->fast, slow->mid, fast->slow
+                          2 = permute twice = slow->fast, fast->mid, mid->slow
+   memory               user provides buffer memory for remap or system does
+                          0 = user provides memory
+                          1 = system provides memory
+   precision            precision of data
+                          1 = single precision (4 bytes per datum)
+                          2 = double precision (8 bytes per datum)
+   usecollective        whether to use collective MPI or point-to-point
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_create_plan_kokkos(
+  MPI_Comm comm,
+  int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+  int in_klo, int in_khi,
+  int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+  int out_klo, int out_khi,
+  int nqty, int permute, int memory, int precision, int usecollective)
+{
+
+  struct remap_plan_3d_kokkos<DeviceType> *plan;
+  struct extent_3d *inarray, *outarray;
+  struct extent_3d in,out,overlap;
+  int i,iproc,nsend,nrecv,ibuf,size,me,nprocs;
+
+  // query MPI info
+
+  MPI_Comm_rank(comm,&me);
+  MPI_Comm_size(comm,&nprocs);
+
+  // allocate memory for plan data struct
+
+  plan = new struct remap_plan_3d_kokkos<DeviceType>;
+  if (plan == NULL) return NULL;
+  plan->usecollective = usecollective;
+
+  // store parameters in local data structs
+
+  in.ilo = in_ilo;
+  in.ihi = in_ihi;
+  in.isize = in.ihi - in.ilo + 1;
+
+  in.jlo = in_jlo;
+  in.jhi = in_jhi;
+  in.jsize = in.jhi - in.jlo + 1;
+
+  in.klo = in_klo;
+  in.khi = in_khi;
+  in.ksize = in.khi - in.klo + 1;
+
+  out.ilo = out_ilo;
+  out.ihi = out_ihi;
+  out.isize = out.ihi - out.ilo + 1;
+
+  out.jlo = out_jlo;
+  out.jhi = out_jhi;
+  out.jsize = out.jhi - out.jlo + 1;
+
+  out.klo = out_klo;
+  out.khi = out_khi;
+  out.ksize = out.khi - out.klo + 1;
+
+  // combine output extents across all procs
+
+  inarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
+  if (inarray == NULL) return NULL;
+
+  outarray = (struct extent_3d *) malloc(nprocs*sizeof(struct extent_3d));
+  if (outarray == NULL) return NULL;
+
+  MPI_Allgather(&out,sizeof(struct extent_3d),MPI_BYTE,
+                outarray,sizeof(struct extent_3d),MPI_BYTE,comm);
+
+  // count send collides, including self
+
+  nsend = 0;
+  iproc = me;
+  for (i = 0; i < nprocs; i++) {
+    iproc++;
+    if (iproc == nprocs) iproc = 0;
+    nsend += remap_3d_collide(&in,&outarray[iproc],&overlap);
+  }
+
+  // malloc space for send info
+
+  if (nsend) {
+    plan->pack = PackKokkos<DeviceType>::pack_3d;
+
+    plan->send_offset = (int *) malloc(nsend*sizeof(int));
+    plan->send_size = (int *) malloc(nsend*sizeof(int));
+    plan->send_proc = (int *) malloc(nsend*sizeof(int));
+    plan->packplan = (struct pack_plan_3d *)
+      malloc(nsend*sizeof(struct pack_plan_3d));
+
+    if (plan->send_offset == NULL || plan->send_size == NULL ||
+        plan->send_proc == NULL || plan->packplan == NULL) return NULL;
+  }
+
+  // store send info, with self as last entry
+
+  nsend = 0;
+  iproc = me;
+  for (i = 0; i < nprocs; i++) {
+    iproc++;
+    if (iproc == nprocs) iproc = 0;
+    if (remap_3d_collide(&in,&outarray[iproc],&overlap)) {
+      plan->send_proc[nsend] = iproc;
+      plan->send_offset[nsend] = nqty *
+        ((overlap.klo-in.klo)*in.jsize*in.isize +
+         ((overlap.jlo-in.jlo)*in.isize + overlap.ilo-in.ilo));
+      plan->packplan[nsend].nfast = nqty*overlap.isize;
+      plan->packplan[nsend].nmid = overlap.jsize;
+      plan->packplan[nsend].nslow = overlap.ksize;
+      plan->packplan[nsend].nstride_line = nqty*in.isize;
+      plan->packplan[nsend].nstride_plane = nqty*in.jsize*in.isize;
+      plan->packplan[nsend].nqty = nqty;
+      plan->send_size[nsend] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+      nsend++;
+    }
+  }
+
+  // plan->nsend = # of sends not including self
+
+  if (nsend && plan->send_proc[nsend-1] == me) {
+    if (plan->usecollective) // for collectives include self in nsend list
+      plan->nsend = nsend;
+    else
+      plan->nsend = nsend - 1;
+  } else
+    plan->nsend = nsend;
+
+  // combine input extents across all procs
+
+  MPI_Allgather(&in,sizeof(struct extent_3d),MPI_BYTE,
+                inarray,sizeof(struct extent_3d),MPI_BYTE,comm);
+
+  // count recv collides, including self
+
+  nrecv = 0;
+  iproc = me;
+  for (i = 0; i < nprocs; i++) {
+    iproc++;
+    if (iproc == nprocs) iproc = 0;
+    nrecv += remap_3d_collide(&out,&inarray[iproc],&overlap);
+  }
+
+  // malloc space for recv info
+
+  if (nrecv) {
+    if (permute == 0)
+      plan->unpack = PackKokkos<DeviceType>::unpack_3d;
+    else if (permute == 1) {
+      if (nqty == 1)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_1;
+      else if (nqty == 2)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_2;
+      else
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute1_n;
+    }
+    else if (permute == 2) {
+      if (nqty == 1)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_1;
+      else if (nqty == 2)
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_2;
+      else
+        plan->unpack = PackKokkos<DeviceType>::unpack_3d_permute2_n;
+    }
+
+    plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
+    plan->recv_size = (int *) malloc(nrecv*sizeof(int));
+    plan->recv_proc = (int *) malloc(nrecv*sizeof(int));
+    plan->recv_bufloc = (int *) malloc(nrecv*sizeof(int));
+    plan->request = (MPI_Request *) malloc(nrecv*sizeof(MPI_Request));
+    plan->unpackplan = (struct pack_plan_3d *)
+      malloc(nrecv*sizeof(struct pack_plan_3d));
+
+    if (plan->recv_offset == NULL || plan->recv_size == NULL ||
+        plan->recv_proc == NULL || plan->recv_bufloc == NULL ||
+        plan->request == NULL || plan->unpackplan == NULL) return NULL;
+  }
+
+  // store recv info, with self as last entry
+
+  ibuf = 0;
+  nrecv = 0;
+  iproc = me;
+
+  for (i = 0; i < nprocs; i++) {
+    iproc++;
+    if (iproc == nprocs) iproc = 0;
+    if (remap_3d_collide(&out,&inarray[iproc],&overlap)) {
+      plan->recv_proc[nrecv] = iproc;
+      plan->recv_bufloc[nrecv] = ibuf;
+
+      if (permute == 0) {
+        plan->recv_offset[nrecv] = nqty *
+          ((overlap.klo-out.klo)*out.jsize*out.isize +
+           (overlap.jlo-out.jlo)*out.isize + (overlap.ilo-out.ilo));
+        plan->unpackplan[nrecv].nfast = nqty*overlap.isize;
+        plan->unpackplan[nrecv].nmid = overlap.jsize;
+        plan->unpackplan[nrecv].nslow = overlap.ksize;
+        plan->unpackplan[nrecv].nstride_line = nqty*out.isize;
+        plan->unpackplan[nrecv].nstride_plane = nqty*out.jsize*out.isize;
+        plan->unpackplan[nrecv].nqty = nqty;
+      }
+      else if (permute == 1) {
+        plan->recv_offset[nrecv] = nqty *
+          ((overlap.ilo-out.ilo)*out.ksize*out.jsize +
+           (overlap.klo-out.klo)*out.jsize + (overlap.jlo-out.jlo));
+        plan->unpackplan[nrecv].nfast = overlap.isize;
+        plan->unpackplan[nrecv].nmid = overlap.jsize;
+        plan->unpackplan[nrecv].nslow = overlap.ksize;
+        plan->unpackplan[nrecv].nstride_line = nqty*out.jsize;
+        plan->unpackplan[nrecv].nstride_plane = nqty*out.ksize*out.jsize;
+        plan->unpackplan[nrecv].nqty = nqty;
+      }
+      else {
+        plan->recv_offset[nrecv] = nqty *
+          ((overlap.jlo-out.jlo)*out.isize*out.ksize +
+           (overlap.ilo-out.ilo)*out.ksize + (overlap.klo-out.klo));
+        plan->unpackplan[nrecv].nfast = overlap.isize;
+        plan->unpackplan[nrecv].nmid = overlap.jsize;
+        plan->unpackplan[nrecv].nslow = overlap.ksize;
+        plan->unpackplan[nrecv].nstride_line = nqty*out.ksize;
+        plan->unpackplan[nrecv].nstride_plane = nqty*out.isize*out.ksize;
+        plan->unpackplan[nrecv].nqty = nqty;
+      }
+
+      plan->recv_size[nrecv] = nqty*overlap.isize*overlap.jsize*overlap.ksize;
+      ibuf += plan->recv_size[nrecv];
+      nrecv++;
+    }
+  }
+
+  // plan->nrecv = # of recvs not including self
+  // for collectives include self in the nsend list
+
+  if (nrecv && plan->recv_proc[nrecv-1] == me) {
+    if (plan->usecollective) plan->nrecv = nrecv;
+    else plan->nrecv = nrecv - 1;
+  } else plan->nrecv = nrecv;
+
+  // init remaining fields in remap plan
+
+  plan->memory = memory;
+
+  if (nrecv == plan->nrecv) plan->self = 0;
+  else plan->self = 1;
+
+  // free locally malloced space
+
+  free(inarray);
+  free(outarray);
+
+  // find biggest send message (not including self) and malloc space for it
+
+  size = 0;
+  for (nsend = 0; nsend < plan->nsend; nsend++)
+    size = MAX(size,plan->send_size[nsend]);
+
+  if (size) {
+    plan->d_sendbuf = typename AT::t_FFT_SCALAR_1d("remap3d:sendbuf",size);
+    if (!plan->d_sendbuf.data()) return NULL;
+  }
+
+  // if requested, allocate internal scratch space for recvs,
+  // only need it if I will receive any data (including self)
+
+  if (memory == 1) {
+    if (nrecv > 0) {
+      plan->d_scratch =
+        typename AT::t_FFT_SCALAR_1d("remap3d:scratch",nqty*out.isize*out.jsize*out.ksize);
+      if (!plan->d_scratch.data()) return NULL;
+    }
+  }
+
+  // not using collective - dup comm
+
+  MPI_Comm_dup(comm,&plan->comm);
+
+  // return pointer to plan
+
+  return plan;
+}
+
+/* ----------------------------------------------------------------------
+   Destroy a 3d remap plan
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void RemapKokkos<DeviceType>::remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *plan)
+{
+  if (plan == NULL) return;
+
+  // free MPI communicator
+
+  if (!((plan->usecollective) && (plan->commringlen == 0)))
+    MPI_Comm_free(&plan->comm);
+
+  // free internal arrays
+
+  if (plan->nsend || plan->self) {
+    free(plan->send_offset);
+    free(plan->send_size);
+    free(plan->send_proc);
+    free(plan->packplan);
+  }
+
+  if (plan->nrecv || plan->self) {
+    free(plan->recv_offset);
+    free(plan->recv_size);
+    free(plan->recv_proc);
+    free(plan->recv_bufloc);
+    free(plan->request);
+    free(plan->unpackplan);
+  }
+
+  // free plan itself
+
+  delete plan;
+}
+
+namespace LAMMPS_NS {
+template class RemapKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class RemapKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/remap_kokkos.h
+++ b/src/KOKKOS/remap_kokkos.h
@ -0,0 +1,88 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_REMAP_KOKKOS_H
+#define LMP_REMAP_KOKKOS_H
+
+#include "pointers.h"
+#include <mpi.h>
+#include "fftdata_kokkos.h"
+#include "remap.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// details of how to do a 3d remap
+
+template<class DeviceType>
+struct remap_plan_3d_kokkos {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_FFT_SCALAR_1d d_sendbuf;                  // buffer for MPI sends
+  typename AT::t_FFT_SCALAR_1d d_scratch;                  // scratch buffer for MPI recvs
+  void (*pack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
+                                    // which pack function to use
+  void (*unpack)(typename AT::t_FFT_SCALAR_1d_um, int, typename AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
+                                    // which unpack function to use
+  int *send_offset;                 // extraction loc for each send
+  int *send_size;                   // size of each send message
+  int *send_proc;                   // proc to send each message to
+  struct pack_plan_3d *packplan;    // pack plan for each send message
+  int *recv_offset;                 // insertion loc for each recv
+  int *recv_size;                   // size of each recv message
+  int *recv_proc;                   // proc to recv each message from
+  int *recv_bufloc;                 // offset in scratch buf for each recv
+  MPI_Request *request;             // MPI request for each posted recv
+  struct pack_plan_3d *unpackplan;  // unpack plan for each recv message
+  int nrecv;                        // # of recvs from other procs
+  int nsend;                        // # of sends to other procs
+  int self;                         // whether I send/recv with myself
+  int memory;                       // user provides scratch space or not
+  MPI_Comm comm;                    // group of procs performing remap
+  int usecollective;                // use collective or point-to-point MPI
+  int commringlen;                  // length of commringlist
+  int *commringlist;                // ranks on communication ring of this plan
+};
+
+template<class DeviceType>
+class RemapKokkos : protected Pointers {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  RemapKokkos(class LAMMPS *);
+  RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
+        int,int,int,int,int,int,int,int,int,int,int);
+  ~RemapKokkos();
+  void perform(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d);
+
+  struct remap_plan_3d_kokkos<DeviceType> *plan;
+
+  void remap_3d_kokkos(typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, typename AT::t_FFT_SCALAR_1d, struct remap_plan_3d_kokkos<DeviceType> *);
+  struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
+                                             int, int, int, int, int, int,
+                                             int, int, int, int, int, int,
+                                             int, int, int, int, int);
+  void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Could not create 3d remap plan
+
+The FFT setup in pppm failed.
+
+*/
--- a/src/KSPACE/fft3d.cpp
+++ b/src/KSPACE/fft3d.cpp
@ -25,6 +25,9 @@
 #include <cstdlib>
 #include <cmath>
 #include "remap.h"
+#if defined(_OPENMP)
+#include <omp.h>
+#endif

 #ifdef FFT_KISS
 /* include kissfft implementation */
@ -266,7 +269,7 @@ struct fft_plan_3d *fft_3d_create_plan(
       int scaled, int permute, int *nbuf, int usecollective)
 {
  struct fft_plan_3d *plan;
-  int me,nprocs;
+  int me,nprocs,nthreads;
  int flag,remapflag;
  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
@ -279,6 +282,14 @@ struct fft_plan_3d *fft_3d_create_plan(
  MPI_Comm_rank(comm,&me);
  MPI_Comm_size(comm,&nprocs);

+#if defined(_OPENMP)
+  // query OpenMP info.
+  // should have been initialized systemwide in Comm class constructor
+  nthreads = omp_get_max_threads();
+#else
+  nthreads = 1;
+#endif
+
  // compute division of procs in 2 dimensions not on-processor

  bifactor(nprocs,&np1,&np2);
@ -571,6 +582,13 @@ struct fft_plan_3d *fft_3d_create_plan(
  */

 #elif defined(FFT_FFTW3)
+#if defined(FFT_FFTW_THREADS)
+  if (nthreads > 1) {
+    FFTW_API(init_threads)();
+    FFTW_API(plan_with_nthreads)(nthreads);
+  }
+#endif
+
  plan->plan_fast_forward =
    FFTW_API(plan_many_dft)(1, &nfast,plan->total1/plan->length1,
                            NULL,&nfast,1,plan->length1,
@ -689,6 +707,9 @@ void fft_3d_destroy_plan(struct fft_plan_3d *plan)
  FFTW_API(destroy_plan)(plan->plan_mid_backward);
  FFTW_API(destroy_plan)(plan->plan_fast_forward);
  FFTW_API(destroy_plan)(plan->plan_fast_backward);
+#if defined(FFT_FFTW_THREADS)
+  FFTW_API(cleanup_threads)();
+#endif
 #else
  if (plan->cfg_slow_forward != plan->cfg_fast_forward &&
      plan->cfg_slow_forward != plan->cfg_mid_forward) {
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@ -26,6 +26,8 @@ KSpaceStyle(pppm,PPPM)
 #define LMP_FFT_LIB "FFTW3"
 #elif defined(FFT_MKL)
 #define LMP_FFT_LIB "MKL FFT"
+#elif defined(FFT_CUFFT)
+#define LMP_FFT_LIB "cuFFT"
 #else
 #define LMP_FFT_LIB "KISS FFT"
 #endif
--- a/src/MAKE/MACHINES/Makefile.lassen_kokkos
+++ b/src/MAKE/MACHINES/Makefile.lassen_kokkos
@ -1,4 +1,4 @@
-# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc 7.3.1
+# lassen_kokkos = KOKKOS/CUDA, V100 GPU and Power9, IBM Spectrum MPI, nvcc compiler with gcc

 SHELL = /bin/sh

@ -44,9 +44,12 @@ LMP_INC =	-DLAMMPS_GZIP
 # PATH = path for MPI library
 # LIB = name of MPI library

-MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/include
+MY_MPI_EXE = $(shell which mpicxx)
+MY_MPI_PATH = $(dir ${MY_MPI_EXE})
+
+MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I${MY_MPI_PATH}../include
 MPI_PATH = 
-MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.1/lib -lmpi_ibm
+MPI_LIB = -L${MY_MPI_PATH}../lib -lmpi_ibm

 # FFT library
 # see discussion in Section 2.2 (step 6) of manaul
@ -55,9 +58,9 @@ MPI_LIB = -L/usr/tce/packages/spectrum-mpi/spectrum-mpi-rolling-release-gcc-7.3.
 # PATH = path for FFT library
 # LIB = name of FFT library

-FFT_INC =    	
+FFT_INC = -DFFT_CUFFT
 FFT_PATH = 
-FFT_LIB =	
+FFT_LIB = -lcufft

 # JPEG and/or PNG library
 # see discussion in Section 2.2 (step 7) of manual
--- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
+++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi
@ -55,9 +55,9 @@ MPI_LIB =
 # PATH = path for FFT library
 # LIB = name of FFT library

-FFT_INC =    	
+FFT_INC = -DFFT_CUFFT	
 FFT_PATH = 
-FFT_LIB =	
+FFT_LIB = -lcufft

 # JPEG and/or PNG library
 # see discussion in Section 2.2 (step 7) of manual
--- a/src/version.h
+++ b/src/version.h
@ -1 +1 @@
-#define LAMMPS_VERSION "09 Jan 2020"
+#define LAMMPS_VERSION "24 Jan 2020"